pax_global_header00006660000000000000000000000064143660060720014516gustar00rootroot0000000000000052 comment=7d59964c535f998d3852d0a33116ebf42383a0f1 rocSOLVER-rocm-5.5.1/000077500000000000000000000000001436600607200142025ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/.clang-format000077500000000000000000000065421436600607200165670ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlines: Left AlignOperands: false AlignTrailingComments: false AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: false AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true BinPackArguments: true BinPackParameters: false # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: { AfterCaseLabel: 'true' AfterClass: 'true' AfterControlStatement: 'true' AfterEnum : 'true' AfterFunction : 'true' AfterNamespace : 'true' AfterStruct : 'true' AfterUnion : 'true' BeforeCatch : 'true' BeforeElse : 'true' IndentBraces : 'false' # AfterExternBlock : 
'true' } #BreakAfterJavaFieldAnnotations: true #BreakBeforeInheritanceComma: false #BreakBeforeBinaryOperators: None #BreakBeforeTernaryOperators: true #BreakConstructorInitializersBeforeComma: true #BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' #CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true SpaceBeforeCpp11BracedList: false DerivePointerAlignment: false ExperimentalAutoDetectBinPacking: false ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IndentCaseLabels: false IndentPPDirectives: None #FixNamespaceComments: true IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' #JavaScriptQuotes: Double MaxEmptyLinesToKeep: 1 NamespaceIndentation: Inner #ObjCBlockIndentWidth: 4 #ObjCSpaceAfterProperty: true #ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 10 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left SpaceAfterCStyleCast: false SpaceBeforeAssignmentOperators: true SpaceBeforeParens: Never SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false #SpaceAfterTemplateKeyword: true #SpaceBeforeInheritanceColon: true #SortUsingDeclarations: true SortIncludes: true # Comments are for developers, they should arrange them ReflowComments: false #IncludeBlocks: Preserve --- rocSOLVER-rocm-5.5.1/.gitattributes000066400000000000000000000004471436600607200171020ustar00rootroot00000000000000# By default, convert all text files to Unix line endings on check-in # and native line endings on check-out * text=auto # Override the default behavior for specific files *.sh text 
eol=lf *.bat text eol=crlf # Reduce merge conflicts in changelog /CHANGELOG.md merge=union rocSOLVER-rocm-5.5.1/.github/000077500000000000000000000000001436600607200155425ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/.github/CODEOWNERS000066400000000000000000000000461436600607200171350ustar00rootroot00000000000000* @jzuniga-amd @tfalders @cgmb @qjojo rocSOLVER-rocm-5.5.1/.github/workflows/000077500000000000000000000000001436600607200175775ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/.github/workflows/docs.yaml000066400000000000000000000044631436600607200214220ustar00rootroot00000000000000name: Upload to the upload server # Controls when the workflow will run on: push: branches: [develop, master] tags: - rocm-5.* release: types: [published] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: # This workflow contains a single job called "build" build: # The type of runner that the job will run on runs-on: ubuntu-latest # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 - name: getting branch name shell: bash run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" id: branch_name - name: getting tag name shell: bash run: echo "##[set-output name=tag;]$(echo ${GITHUB_REF_NAME})" id: tag_name - name: zipping files run: zip -r ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip . 
-x '*.git*' '*.idea*' - name: echo-step run: echo "${{ github.event.release.target_commitish }}" - name: uploading archive to prod if: ${{ steps.branch_name.outputs.branch == 'master' || github.event.release.target_commitish == 'master'}} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.PROD_UPLOAD_URL }}' args: '-o ConnectTimeout=5' - name: uploading archive to staging if: ${{ steps.branch_name.outputs.branch == 'develop' || github.event.release.target_commitish == 'develop' }} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.STG_UPLOAD_URL }}' args: '-o ConnectTimeout=5' rocSOLVER-rocm-5.5.1/.gitignore000066400000000000000000000005751436600607200162010ustar00rootroot00000000000000# Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # vim tags tags .tags .*.swp # Editors .vscode # build-in-source directory build/ docBin/ # emacs temporary/backup files .\#* \#*\# *~ rocSOLVER-rocm-5.5.1/.jenkins/000077500000000000000000000000001436600607200157215ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/.jenkins/common.groovy000066400000000000000000000050411436600607200204600ustar00rootroot00000000000000// This file is for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. 
def runCompileCommand(platform, project, jobName, boolean sameOrg=false) { project.paths.construct_build_prefix() String compiler = 'hipcc' String hipClang = '' String debug = project.buildName.contains('Debug') ? '-g' : '' String centos = platform.jenkinsLabel.contains('centos') ? 'source scl_source enable devtoolset-7' : '' String noOptimizations = '' if (env.BRANCH_NAME ==~ /PR-\d+/) { pullRequest.labels.each { if (it == "noOptimizations") { noOptimizations = "-n" } } } def getRocBLAS = auxiliary.getLibrary('rocBLAS-internal',platform.jenkinsLabel, null, sameOrg) def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} ${getRocBLAS} ${auxiliary.exitIfNotSuccess()} ${centos} ${project.paths.build_command} ${hipClang} ${debug} ${noOptimizations} ${auxiliary.exitIfNotSuccess()} """ platform.runCommand(this, command) } def runTestCommand (platform, project, gfilter) { String buildType = project.buildName.contains('Debug') ? 'debug' : 'release' String hmmTestCommand = platform.jenkinsLabel.contains('gfx90a') ? 'HSA_XNACK=1 ./rocsolver-test --gtest_filter=*MANAGED_MALLOC* || true' : '' def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix}/build/${buildType}/clients/staging ./rocsolver-test --gtest_output=xml --gtest_color=yes --gtest_filter=${gfilter} if [ -f ./test-rocsolver-dlopen ]; then ./test-rocsolver-dlopen --gtest_color=yes fi ${hmmTestCommand} cd ../.. CTEST_OUTPUT_ON_FAILURE=1 ctest -R '^test-rocsolver-bench' """ platform.runCommand(this, command) junit "${project.paths.project_build_prefix}/build/${buildType}/clients/staging/*.xml" } def runPackageCommand(platform, project) { String buildType = project.buildName.contains('Debug') ? 
'debug' : 'release' def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/${buildType}") platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) } return this rocSOLVER-rocm-5.5.1/.jenkins/debug.groovy000066400000000000000000000050621436600607200202610ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocSOLVER', 'Debug') prj.timeout.compile = 600 prj.timeout.test = 45 prj.defaults.ccache = true // customize for project prj.paths.build_command = './install.sh -c' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> def gfilter = 'checkin*' commonGroovy.runTestCommand(platform, project, gfilter) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 6')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])] 
jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 5 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['gfx906']], urlJobName) } } } rocSOLVER-rocm-5.5.1/.jenkins/extended.groovy000066400000000000000000000051341436600607200207730ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocSOLVER', 'Extended') prj.timeout.compile = 600 prj.timeout.test = 420 prj.defaults.ccache = true // customize for project prj.paths.build_command = './install.sh -c' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> def gfilter = 'daily*' commonGroovy.runTestCommand(platform, project, gfilter) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = 
auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 6')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx908'],sles15sp1:['gfx906']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 4 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['gfx906']], urlJobName) } } } rocSOLVER-rocm-5.5.1/.jenkins/precheckin.groovy000066400000000000000000000055671436600607200213200ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. 
import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocSOLVER', 'PreCheckin') prj.timeout.compile = 600 prj.timeout.test = 45 prj.defaults.ccache = true // customize for project prj.paths.build_command = './install.sh -c --cmake-arg -DWERROR=ON' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> def gfilter = 'checkin*' commonGroovy.runTestCommand(platform, project, gfilter) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 6')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu18:['gfx900'],centos7:['gfx908'],sles15sp1:['gfx906']]), "compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx908'],centos8:['gfx906'],sles15sp1:['gfx906']]), "rocm-docker":([ubuntu18:['gfx900'],centos7:['gfx908'],sles15sp1:['gfx906']])] jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocSOLVER') propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. 
compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 5 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['gfx900', 'gfx906']], urlJobName) } } } rocSOLVER-rocm-5.5.1/.jenkins/staticanalysis.groovy000066400000000000000000000034521436600607200222270ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCompileCommand(platform, project, jobName, boolean debug=false) { project.paths.construct_build_prefix() def command = """#!/usr/bin/env bash set -x ${project.paths.project_build_prefix}/docs/run_doc.sh """ try { platform.runCommand(this, command) } catch(e) { throw e } publishHTML([allowMissing: false, alwaysLinkToLastBuild: false, keepAll: false, reportDir: "${project.paths.project_build_prefix}/docs/build/html", reportFiles: "index.html", reportName: "Documentation", reportTitles: "Documentation"]) } def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocSOLVER', 'StaticAnalysis') // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = true boolean staticAnalysis = true def compileCommand = { platform, project-> runCompileCommand(platform, project, jobName, false) } buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])])) stage(urlJobName) { runCI([ubuntu20:['cpu']], urlJobName) } } 
rocSOLVER-rocm-5.5.1/.jenkins/staticlibrary.groovy000066400000000000000000000047501436600607200220520ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocSOLVER', 'StaticLibrary') prj.timeout.compile = 600 prj.timeout.test = 45 prj.defaults.ccache = true // customize for project prj.paths.build_command = './install.sh -a "gfx900;gfx906:xnack-" -c --static' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = true def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName, true) } def testCommand = { platform, project-> def gfilter = 'checkin*' commonGroovy.runTestCommand(platform, project, gfilter) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])] jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocSOLVER') propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { 
runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['gfx900']], urlJobName) } } } rocSOLVER-rocm-5.5.1/.readthedocs.yaml000066400000000000000000000004171436600607200174330ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/source/conf.py formats: all python: version: "3.7" install: - requirements: docs/source/requirements.txt rocSOLVER-rocm-5.5.1/CHANGELOG.md000066400000000000000000000323251436600607200160200ustar00rootroot00000000000000# Change Log for rocSOLVER Full documentation for rocSOLVER is available at [rocsolver.readthedocs.io](https://rocsolver.readthedocs.io/en/latest/). ## rocSOLVER 3.21.0 for ROCm 5.5.0 ### Added - SVD for general matrices using Jacobi algorithm: - GESVDJ (with batched and strided\_batched versions) - LU factorization without pivoting for block tridiagonal matrices: - GEBLTTRF_NPVT (with batched and strided\_batched versions) - Linear system solver without pivoting for block tridiagonal matrices: - GEBLTTRS_NPVT (with batched and strided\_batched, versions) - Product of triangular matrices - LAUUM - Added experimental hipGraph support for rocSOLVER functions ### Optimized - Improved the performance of SYEVJ/HEEVJ. ### Changed - STEDC, SYEVD/HEEVD and SYGVD/HEGVD now use fully implemented Divide and Conquer approach. ### Fixed - SYEVJ/HEEVJ should now be invariant under matrix scaling. - SYEVJ/HEEVJ should now properly output the eigenvalues when no sweeps are executed. - Fixed GETF2\_NPVT and GETRF\_NPVT input data initialization in tests and benchmarks. - Fixed rocblas missing from the dependency list of the rocsolver deb and rpm packages. 
## rocSOLVER 3.20.0 for ROCm 5.4.0 ### Added - Partial SVD for bidiagonal matrices: - BDSVDX - Partial SVD for general matrices: - GESVDX (with batched and strided\_batched versions) ### Changed - Changed `ROCSOLVER_EMBED_FMT` default to `ON` for users building directly with CMake. This matches the existing default when building with install.sh or rmake.py. ## rocSOLVER 3.19.0 for ROCm 5.3.0 ### Added - Partial eigensolver routines for symmetric/hermitian matrices: - SYEVX (with batched and strided\_batched versions) - HEEVX (with batched and strided\_batched versions) - Generalized symmetric- and hermitian-definite partial eigensolvers: - SYGVX (with batched and strided\_batched versions) - HEGVX (with batched and strided\_batched versions) - Eigensolver routines for symmetric/hermitian matrices using Jacobi algorithm: - SYEVJ (with batched and strided\_batched versions) - HEEVJ (with batched and strided\_batched versions) - Generalized symmetric- and hermitian-definite eigensolvers using Jacobi algorithm: - SYGVJ (with batched and strided\_batched versions) - HEGVJ (with batched and strided\_batched versions) - Added --profile_kernels option to rocsolver-bench, which will include kernel calls in the profile log (if profile logging is enabled with --profile). ### Changed - Changed rocsolver-bench result labels `cpu_time` and `gpu_time` to `cpu_time_us` and `gpu_time_us`, respectively. ### Removed - Removed dependency on cblas from the rocsolver test and benchmark clients. ### Fixed - Fixed incorrect SYGS2/HEGS2, SYGST/HEGST, SYGV/HEGV, and SYGVD/HEGVD results for batch counts larger than 32. - Fixed STEIN memory access fault when nev is 0. - Fixed incorrect STEBZ results for close eigenvalues when range = index. - Fixed git unsafe repository error when building with `./install.sh -cd` as a non-root user. 
## rocSOLVER 3.18.0 for ROCm 5.2.0 ### Added - Partial eigenvalue decomposition routines: - STEBZ - STEIN - Package generation for test and benchmark executables on all supported OSes using CPack. - Added tests for multi-level logging - Added tests for rocsolver-bench client - File/Folder Reorg - Added File/Folder Reorg Changes with backward compatibility support using ROCM-CMAKE wrapper functions. ### Fixed - Fixed compatibility with libfmt 8.1 ## rocSOLVER 3.17.0 for ROCm 5.1.0 ### Optimized - Optimized non-pivoting and batch cases of the LU factorization ### Fixed - Fixed missing synchronization in SYTRF with `rocblas_fill_lower` that could potentially result in incorrect pivot values. - Fixed multi-level logging output to file with the `ROCSOLVER_LOG_PATH`, `ROCSOLVER_LOG_TRACE_PATH`, `ROCSOLVER_LOG_BENCH_PATH` and `ROCSOLVER_LOG_PROFILE_PATH` environment variables. - Fixed performance regression in the batched LU factorization of tiny matrices ## rocSOLVER 3.16.0 for ROCm 5.0.0 ### Added - Symmetric matrix factorizations: - LASYF - SYTF2, SYTRF (with batched and strided\_batched versions) - Added `rocsolver_get_version_string_size` to help with version string queries - Added `rocblas_layer_mode_ex` and the ability to print kernel calls in the trace and profile logs - Expanded batched and strided\_batched sample programs. ### Optimized - Improved general performance of LU factorization - Increased parallelism of specialized kernels when compiling from source, reducing build times on multi-core systems. 
### Changed - The rocsolver-test client now prints the rocSOLVER version used to run the tests, rather than the version used to build them - The rocsolver-bench client now prints the rocSOLVER version used in the benchmark ### Fixed - Added missing stdint.h include to rocsolver.h ## rocSOLVER 3.15.0 for ROCm 4.5.0 ### Added - Eigensolver routines for symmetric/hermitian matrices using Divide and Conquer algorithm: - STEDC - SYEVD (with batched and strided\_batched versions) - HEEVD (with batched and strided\_batched versions) - Generalized symmetric- and hermitian-definite eigensolvers using Divide and Conquer algorithm: - SYGVD (with batched and strided\_batched versions) - HEGVD (with batched and strided\_batched versions) - Added --mem\_query option to rocsolver-bench, which will print the amount of device memory required by a function. - Added --profile option to rocsolver-bench, which will print profile logging results for a function. - RQ factorization routines: - GERQ2, GERQF (with batched and strided\_batched versions) - Linear solvers for general square systems: - GESV (with batched and strided\_batched versions) - Linear solvers for symmetric/hermitian positive definite systems: - POTRS (with batched and strided\_batched versions) - POSV (with batched and strided\_batched versions) - Inverse of symmetric/hermitian positive definite matrices: - POTRI (with batched and strided\_batched versions) - General matrix inversion without pivoting: - GETRI\_NPVT (with batched and strided\_batched versions) - GETRI\_NPVT\_OUTOFPLACE (with batched and strided\_batched versions) ### Optimized - Improved performance of LU factorization (especially for large matrix sizes) ### Changed - The -h option of install.sh now prints a help message, instead of doing nothing. 
- libfmt 7.1 is now a dependency - Raised minimum requirement for building rocSOLVER from source to CMake 3.13 - Raised reference LAPACK version used for rocSOLVER test and benchmark clients to v3.9.1 - Minor CMake improvements for users building from source without install.sh: - Removed fmt::fmt from rocsolver's public usage requirements - Enabled small-size optimizations by default - Split packaging into a runtime package ('rocsolver') and a development package ('rocsolver-devel'). The development package depends on the runtime package. To aid in the transition, the runtime package suggests the development package (except on CentOS 7). This use of the suggests feature is deprecated and will be removed in a future ROCm release. ### Fixed - Use of the GCC / Clang `__attribute__((deprecated(...)))` extension is now guarded by compiler detection macros. ## rocSOLVER 3.13.0 for ROCm 4.3.0 ### Added - Linear solvers for general non-square systems: - GELS now supports underdetermined and transposed cases - Inverse of triangular matrices - TRTRI (with batched and strided\_batched versions) - Out-of-place general matrix inversion - GETRI\_OUTOFPLACE (with batched and strided\_batched versions) ### Optimized - Improved general performance of matrix inversion (GETRI) ### Changed - Argument names for the benchmark client now match argument names from the public API ### Fixed - Fixed known issues with Thin-SVD. The problem was identified in the test specification, not in the thin-SVD implementation or the rocBLAS gemm\_batched routines. - Benchmark client will no longer crash as a result of leading dimension or stride arguments not being provided on the command line. 
## rocSOLVER 3.12.0 for ROCm 4.2.0 ### Added - Multi-level logging functionality - Implementation of the Thin-SVD algorithm - Reductions of generalized symmetric- and hermitian-definite eigenproblems: - SYGS2, SYGST (with batched and strided\_batched versions) - HEGS2, HEGST (with batched and strided\_batched versions) - Symmetric and hermitian matrix eigensolvers: - SYEV (with batched and strided\_batched versions) - HEEV (with batched and strided\_batched versions) - Generalized symmetric- and hermitian-definite eigensolvers: - SYGV (with batched and strided\_batched versions) - HEGV (with batched and strided\_batched versions) ### Changed - Sorting method in STERF as original quick-sort was failing for large sizes. ### Removed - Removed hcc compiler support ### Fixed - Fixed GELS overwriting B even when info != 0 - Error when calling STEQR with n=1 from batched routines - Added `roc::rocblas` to the `roc::rocsolver` CMake usage requirements - Added rocblas to the dependency list of the rocsolver deb and rpm packages - Fixed rocblas symbol loading with dlopen and the `RTLD_NOW | RTLD_LOCAL` options ### Known Issues - Thin-SVD implementation is failing in some cases (in particular m=300, n=120) due to a possible bug in the gemm\_batched routines of rocBLAS. ## rocSOLVER 3.11.0 for ROCm 4.1.0 ### Added - Eigensolver routines for symmetric/hermitian matrices: - STERF, STEQR - Linear solvers for general non-square systems: - GELS (API added with batched and strided\_batched versions. Only the overdetermined non-transpose case is implemented in this release. Other cases will return `rocblas_status_not_implemented` status for now.) 
- Extended test coverage for functions returning info - Changelog file - Tridiagonalization routines for symmetric and hermitian matrices: - LATRD - SYTD2, SYTRD (with batched and strided\_batched versions) - HETD2, HETRD (with batched and strided\_batched versions) - Sample code and unit test for unified memory model/Heterogeneous Memory Management (HMM) ### Optimized - Improved performance of LU factorization of small and mid-size matrices (n <= 2048) ### Changed - Raised minimum requirement for building rocSOLVER from source to CMake 3.8 - Switched to use semantic versioning for the library - Enabled automatic reallocation of memory workspace in rocsolver clients ### Removed - Removed `-DOPTIMAL` from the `roc::rocsolver` CMake usage requirements. This is an internal rocSOLVER definition, and does not need to be defined by library users ### Fixed - Fixed runtime errors in debug mode caused by incorrect kernel launch bounds - Fixed complex unit test bug caused by incorrect zaxpy function signature - Eliminated a small memory transfer that was being done on the default stream - Fixed GESVD right singular vectors for 1x1 matrices ## rocSOLVER 3.10.0 for ROCm 3.10.0 ### Added - Orthonormal/Unitary matrix generator routines (reverse order): - ORG2L, UNG2L, ORGQL, UNGQL - ORGTR, UNGTR - Orthonormal/Unitary matrix multiplications routines (reverse order): - ORM2L, UNM2L, ORMQL, UNMQL - ORMTR, UNMTR ### Changed - Major library refactoring to adopt rocBLAS memory model ### Fixed - Returned values in parameter info of functions dealing with singularities ## rocSOLVER 3.9.0 for ROCm 3.9.0 ### Added - Improved debug build mode for developers - QL factorization routines: - GEQL2, GEQLF (with batched and strided\_batched versions) - SVD of general matrices routines: - GESVD (with batched and strided\_batched versions) ### Optimized - Improved performance of mid-size matrix inversion (64 < n <= 2048) ## rocSOLVER 3.8.0 for ROCm 3.8.0 ### Added - Sample codes for C, C++ and 
FORTRAN - LU factorization without pivoting routines: - GETF2\_NPVT, GETRF\_NPVT (with batched and strided\_batched versions) ### Optimized - Improved performance of LU factorization of mid-size matrices (64 < n <= 2048) - Improved performance of small-size matrix inversion (n <= 64) ### Fixed - Ensure the public API is C compatible ## rocSOLVER 3.7.0 for ROCm 3.7.0 ### Added - LU-factorization-based matrix inverse routines: - GETRI (with batched and strided\_batched versions) - SVD of bidiagonal matrices routine: - BDSQR ### Fixed - Ensure congruency on the input data when executing performance tests (benchmarks) ## rocSOLVER 3.6.0 for ROCm 3.6.0 ### Added - Complex precision support for all existing rocSOLVER functions - Bidiagonalization routines: - LABRD - GEBD2, GEBRD (with batched and strided\_batched versions) - Integration of rocSOLVER to hipBLAS ### Optimized - Improved performance of LU factorization of tiny matrices (n <= 64) ### Changed - Major clients refactoring to achieve better test coverage and benchmarking ## rocSOLVER 3.5.0 for ROCm 3.5.0 ### Added - Installation script and new build procedure - Documentation and integration with ReadTheDocs - Orthonormal matrix multiplication routines: - ORM2R, ORMQR - ORML2, ORMLQ - ORMBR ### Changed - Switched to use all rocBLAS types and enumerations - Major library refactoring to achieve better integration and rocBLAS support - hip-clang is now default compiler ### Deprecated - rocSOLVER types and enumerations - hcc compiler support rocSOLVER-rocm-5.5.1/CMakeLists.txt000066400000000000000000000217671436600607200167570ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2019-2022 Advanced Micro Devices, Inc. 
# ######################################################################## cmake_minimum_required(VERSION 3.13) # This has to be initialized before the project() command appears # Set the default build type to Release if(NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.") endif() if(NOT DEFINED CMAKE_Fortran_COMPILER AND NOT DEFINED ENV{FC}) set(CMAKE_Fortran_COMPILER "gfortran") endif() # ROCM_BUILD_ID is added to the package name by rocm-cmake. Unsetting it prevents that. unset(ENV{ROCM_BUILD_ID}) # Disable ROCMClang detection to make CMake v3.21 work the same as CMake v3.20 and earlier. # https://gitlab.kitware.com/cmake/cmake/-/merge_requests/6533 if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.21.0 AND CMAKE_VERSION VERSION_LESS 3.21.3) set(__skip_rocmclang ON) endif() message(STATUS "Using CMake ${CMAKE_VERSION}") project(rocsolver LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED ON) option(ROCSOLVER_EMBED_FMT "Hide libfmt symbols" ON) option(OPTIMAL "Build specialized kernels for small matrix sizes" ON) option(ROCSOLVER_FIND_PACKAGE_LAPACK_CONFIG "Skip module mode search for LAPACK" ON) # Add our CMake helper files to the lookup path list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) find_package(fmt REQUIRED) # ######################################################################## # Main # ######################################################################## # Get rocm-cmake include(get-rocm-cmake) # Include the rocm-cmake components we use include(ROCMSetupVersion) include(ROCMCreatePackage) include(ROCMInstallTargets) include(ROCMPackageConfigHelpers) include(ROCMInstallSymlinks) include(ROCMCheckTargetIds) include(ROCMClients) include(ROCMHeaderWrapper) 
include(os-detection) get_os_id(OS_ID) message(STATUS "OS detected is ${OS_ID}") # Versioning via rocm-cmake set(VERSION_STRING "3.21.0") rocm_setup_version(VERSION ${VERSION_STRING}) # Workaround until llvm and hip CMake modules fix symlink logic in their config files list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm ${ROCM_PATH}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip ) if(CMAKE_BUILD_TYPE STREQUAL "Debug") set(DEFAULT_ARMOR_LEVEL 1) else() set(DEFAULT_ARMOR_LEVEL 0) endif() set(ARMOR_LEVEL "${DEFAULT_ARMOR_LEVEL}" CACHE STRING "Enables increasingly expensive runtime correctness checks") include(armor-config) # This option only works for make, nmake and ninja, but no reason it shouldn't be on all the time # It creates a compile_commands.json file for use with clang tooling or vim set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # BUILD_SHARED_LIBS is a cmake built-in # Make it an explicit option such that it shows in cmake-gui option(BUILD_SHARED_LIBS "Build rocSOLVER as a shared library" ON) # Include helper functions and wrapper functions include(util) include(CMakeDependentOption) option(BUILD_TESTING "Build rocSOLVER tests" OFF) if(BUILD_TESTING) enable_testing() endif() option(BUILD_LIBRARY "Build rocSOLVER library" ON) option_opposite(BUILD_LIBRARY SKIP_LIBRARY) option(BUILD_CLIENTS_TESTS "Build rocSOLVER test client" "${BUILD_TESTING}") option(BUILD_CLIENTS_BENCHMARKS "Build rocSOLVER benchmark client" OFF) option(BUILD_CLIENTS_SAMPLES "Build rocSOLVER samples" OFF) cmake_dependent_option(BUILD_CLIENTS_EXTRA_TESTS "Build extra tests" OFF BUILD_TESTING OFF) option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) option(BUILD_CODE_COVERAGE "Build rocSOLVER with code coverage enabled" OFF) option(WERROR "Treat warnings as errors" OFF) cmake_dependent_option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg backward compatibility enabled" ON "NOT WIN32" OFF) if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY) 
rocm_wrap_header_dir( ${CMAKE_SOURCE_DIR}/library/include/rocsolver PATTERNS "*.h" GUARDS SYMLINK WRAPPER WRAPPER_LOCATIONS ${CMAKE_INSTALL_INCLUDEDIR} ) endif() message(STATUS "Tests: ${BUILD_CLIENTS_TESTS}") message(STATUS "Benchmarks: ${BUILD_CLIENTS_BENCHMARKS}") message(STATUS "Samples: ${BUILD_CLIENTS_SAMPLES}") if(NOT DEFINED AMDGPU_TARGETS) # Query for compiler support of GPU archs rocm_check_target_ids(OPTIONAL_AMDGPU_TARGETS TARGETS gfx90a:xnack- gfx90a:xnack+ gfx1100 gfx1101 gfx1102 ) endif() # Set this before finding hip so that hip::device has the required arch flags # added as usage requirements on its interface set(AMDGPU_TARGETS "gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx1010;gfx1030;${OPTIONAL_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") # Find HIP dependencies find_package(hip REQUIRED CONFIG PATHS ${ROCM_PATH} /opt/rocm) find_package(rocblas REQUIRED CONFIG PATHS ${ROCM_PATH}) get_imported_target_location(location roc::rocblas) message(STATUS "Found rocBLAS: ${location}") add_subdirectory(common) if(BUILD_LIBRARY) add_subdirectory(library) endif() if(BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_SAMPLES) if(NOT CLIENTS_OS) rocm_set_os_id(CLIENTS_OS) string(TOLOWER "${CLIENTS_OS}" CLIENTS_OS) rocm_read_os_release(CLIENTS_OS_VERSION VERSION_ID) endif() set(GFORTRAN_RPM "libgfortran4") set(GFORTRAN_DEB "libgfortran4") if(CLIENTS_OS STREQUAL "centos" OR CLIENTS_OS STREQUAL "rhel") if(CLIENTS_OS_VERSION VERSION_GREATER_EQUAL "8") set(GFORTRAN_RPM "libgfortran") endif() elseif(CLIENTS_OS STREQUAL "ubuntu" AND CLIENTS_OS_VERSION VERSION_GREATER_EQUAL "20.04") set(GFORTRAN_DEB "libgfortran5") endif() rocm_package_setup_component(clients) if(BUILD_CLIENTS_TESTS) rocm_package_setup_client_component(tests DEPENDS DEB "${GFORTRAN_DEB}" RPM "${GFORTRAN_RPM}") endif() if(BUILD_CLIENTS_BENCHMARKS) rocm_package_setup_client_component(benchmarks DEPENDS DEB "${GFORTRAN_DEB}" RPM "${GFORTRAN_RPM}") 
endif() add_subdirectory(clients) endif() # Package-specific CPACK vars rocm_package_add_dependencies(DEPENDS "rocblas >= 2.47" "rocblas < 2.48") if(OS_ID_sles) rocm_package_add_rpm_dependencies("libLLVM >= 7.0.1") endif() set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE.md") set(CPACK_RPM_PACKAGE_LICENSE "BSD") if(WIN32) set(CPACK_SOURCE_GENERATOR "ZIP") set(CPACK_GENERATOR "ZIP") set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE) set(INSTALL_PREFIX "C:/hipSDK") set(CPACK_SET_DESTDIR OFF) set(CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK") set(CPACK_PACKAGING_INSTALL_PREFIX "") set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) else() if(NOT CPACK_PACKAGING_INSTALL_PREFIX) set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}") endif() endif() set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" ) set(ROCSOLVER_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path placed into ldconfig file") rocm_create_package( NAME rocsolver DESCRIPTION "AMD ROCm SOLVER library" MAINTAINER "RocSOLVER maintainer " LDCONFIG LDCONFIG_DIR ${ROCSOLVER_CONFIG_DIR} ) # Code Coverage Build Commands: # make coverage_cleanup (clean coverage related files) # make coverage GTEST_FILTER=<> # make coverage_analysis GTEST_FILTER=<> (analyze tests) # make coverage_output (generate html documentation) if(BUILD_CODE_COVERAGE) # Run coverage analysis add_custom_target(coverage_analysis COMMAND echo Coverage GTEST_FILTER=\${GTEST_FILTER} COMMAND ./clients/staging/rocsolver-test --gtest_filter=\"\${GTEST_FILTER}\" WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) add_dependencies(coverage_analysis rocsolver) # Generate gcov-tool script # This little script is generated because the option '--gcov-tool ' of lcov cannot take arguments. 
add_custom_target(coverage_output DEPENDS coverage_analysis COMMAND mkdir -p lcoverage COMMAND echo "\\#!/bin/bash" > llvm-gcov.sh COMMAND echo "\\# THIS FILE HAS BEEN GENERATED" >> llvm-gcov.sh COMMAND printf "exec /opt/rocm/llvm/bin/llvm-cov gcov $$\\@" >> llvm-gcov.sh COMMAND chmod +x llvm-gcov.sh ) # Generate code coverage report add_custom_command(TARGET coverage_output COMMAND lcov --directory . --base-directory . --gcov-tool ${CMAKE_BINARY_DIR}/llvm-gcov.sh --capture -o lcoverage/raw_main_coverage.info COMMAND lcov --remove lcoverage/raw_main_coverage.info "'/opt/*'" "'/usr/*'" -o lcoverage/main_coverage.info COMMAND genhtml --ignore-errors source lcoverage/main_coverage.info --output-directory lcoverage ) add_custom_target(coverage DEPENDS coverage_output) # Delete gcov data files add_custom_target(coverage_cleanup COMMAND find ${CMAKE_BINARY_DIR} -name *.gcda -delete WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) endif() rocSOLVER-rocm-5.5.1/CONTRIBUTING.md000066400000000000000000000045031436600607200164350ustar00rootroot00000000000000# Contributing ## Philosophy AMD welcomes contributions from the community. Whether those contributions are bug reports, bug fixes, documentation additions, performance notes, or other improvements, we value collaboration with our users. We can build better solutions together. # Submitting a Pull Request To contribute changes to rocSOLVER, open a pull request targeting the `develop` branch. Pull requests will be tested and reviewed by the AMD development team. AMD may request changes or modify the submission before acceptance. ## Interface requirements The public interface must be: - C99 compatible - Source and binary compatible with previous releases - Fully documented with Doxygen and Sphinx All identifiers in the public headers must be prefixed with `rocblas`, `ROCBLAS`, `rocsolver`, or `ROCSOLVER`. The prefixes `_ROCLAPACK` and `_ROCSOLVER` are deprecated and should not be used in new code. 
All user-visible symbols must be prefixed with `rocblas` or `rocsolver`. ## Style guide In general, follow the style of the surrounding code. All code is auto-formatted using clang-format. To apply the rocsolver formatting, run `clang-format -i -style=file <files>` on any files you've changed. You can install git hooks to do this automatically upon commit by running `scripts/install-hooks --get-clang-format`. If you find you'd rather not use the hooks, they can be removed using `scripts/uninstall-hooks`. ## Tests To run the rocSOLVER test suite, first build the rocSOLVER test client following the instructions in [Building and Installation][1]. Then, run the `rocsolver-test` binary. For a typical build, the test binary will be found at `./build/release/clients/staging/rocsolver-test`. The full test suite is quite large and may take a long time to complete, so passing the [`--gtest_filter=<pattern>`][2] option to rocsolver-test may be useful during development. A fast subset of tests can be run with `--gtest_filter='checkin*'`, while the extended tests can be run with `--gtest_filter='daily*'`. ## Rejected contributions Unfortunately, sometimes a contribution cannot be accepted. The rationale for a decision may or may not be disclosed. [1]: https://rocsolver.readthedocs.io/en/latest/userguide_install.html [2]: https://github.com/google/googletest/blob/release-1.10.0/googletest/docs/advanced.md#running-a-subset-of-the-tests rocSOLVER-rocm-5.5.1/LICENSE.md000066400000000000000000000120101436600607200156000ustar00rootroot00000000000000Copyright (c) 2018-2022 Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. This product includes code derived from the LAPACK and MAGMA projects. Copyright holders for these projects are indicated below, and distributed under their license terms as specified. -- LAPACK -- - Copyright (c) 1992-2013 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. - Copyright (c) 2000-2013 The University of California Berkeley. All rights reserved. - Copyright (c) 2006-2013 The University of Colorado Denver. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution. 
- Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. The copyright holders provide no reassurances that the source code provided does not infringe any patent, copyright, or any other intellectual property rights of third parties. The copyright holders disclaim any liability to any recipient for claims brought against recipient by any third party for infringement of that parties intellectual property rights. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -- MAGMA -- Copyright (c) 2009-2021 The University of Tennessee. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution. 
- Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. This software is provided by the copyright holders and contributors "as is" and any express or implied warranties, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose are disclaimed. in no event shall the copyright owner or contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this software, even if advised of the possibility of such damage. rocSOLVER-rocm-5.5.1/README.md000066400000000000000000000076761436600607200155010ustar00rootroot00000000000000# rocSOLVER rocSOLVER is a work-in-progress implementation of a subset of [LAPACK][1] functionality on the [ROCm platform][2]. ## Documentation For a detailed description of the rocSOLVER library, its implemented routines, the installation process and user guide, see the [rocSOLVER documentation][3]. ## Building rocSOLVER To download the rocSOLVER source code, clone this repository with the command: git clone https://github.com/ROCmSoftwarePlatform/rocSOLVER.git rocSOLVER requires rocBLAS as a companion GPU BLAS implementation. For more information about rocBLAS and how to install it, see the [rocBLAS documentation][4]. After a standard installation of rocBLAS, the following commands will build rocSOLVER and install to `/opt/rocm`: cd rocSOLVER ./install.sh -i Once installed, rocSOLVER can be used just like any other library with a C API. 
The header file will need to be included in the user code, and both the rocBLAS and rocSOLVER shared libraries will become link-time and run-time dependencies for the user application. If you are a developer contributing to rocSOLVER, you may wish to run `./scripts/install-hooks` to install the git hooks for autoformatting. You may also want to take a look at the [contributing guidelines][7]. ## Using rocSOLVER The following code snippet shows how to compute the QR factorization of a general m-by-n real matrix in double precision using rocSOLVER. A longer version of this example is provided by `example_basic.cpp` in the [samples directory][5]. For a description of the `rocsolver_dgeqrf` function, see the [rocSOLVER API documentation][6]. ```cpp ///////////////////////////// // example.cpp source code // ///////////////////////////// #include <algorithm> // for std::min #include <stddef.h> // for size_t #include <vector> #include <hip/hip_runtime_api.h> // for hip functions #include <rocsolver/rocsolver.h> // for all the rocsolver C interfaces and type declarations int main() { rocblas_int M; rocblas_int N; rocblas_int lda; // here is where you would initialize M, N and lda with desired values rocblas_handle handle; rocblas_create_handle(&handle); size_t size_A = size_t(lda) * N; // the size of the array for the matrix size_t size_piv = size_t(std::min(M, N)); // the size of array for the Householder scalars std::vector<double> hA(size_A); // creates array for matrix in CPU std::vector<double> hIpiv(size_piv); // creates array for householder scalars in CPU double *dA, *dIpiv; hipMalloc(&dA, sizeof(double)*size_A); // allocates memory for matrix in GPU hipMalloc(&dIpiv, sizeof(double)*size_piv); // allocates memory for scalars in GPU // here is where you would initialize matrix A (array hA) with input data // note: matrices must be stored in column major format, // i.e.
entry (i,j) should be accessed by hA[i + j*lda] // copy data to GPU hipMemcpy(dA, hA.data(), sizeof(double)*size_A, hipMemcpyHostToDevice); // compute the QR factorization on the GPU rocsolver_dgeqrf(handle, M, N, dA, lda, dIpiv); // copy the results back to CPU hipMemcpy(hA.data(), dA, sizeof(double)*size_A, hipMemcpyDeviceToHost); hipMemcpy(hIpiv.data(), dIpiv, sizeof(double)*size_piv, hipMemcpyDeviceToHost); // the results are now in hA and hIpiv, so you can use them here hipFree(dA); // de-allocate GPU memory hipFree(dIpiv); rocblas_destroy_handle(handle); // destroy handle } ``` The exact command used to compile the example above may vary depending on the system environment, but here is a typical example: /opt/rocm/bin/hipcc -I/opt/rocm/include -c example.cpp /opt/rocm/bin/hipcc -o example -L/opt/rocm/lib -lrocsolver -lrocblas example.o [1]: https://www.netlib.org/lapack/ [2]: https://docs.amd.com [3]: https://rocsolver.readthedocs.io [4]: https://rocblas.readthedocs.io [5]: clients/samples/ [6]: https://rocsolver.readthedocs.io/en/latest/api_lapackfunc.html#rocsolver-type-geqrf [7]: CONTRIBUTING.md rocSOLVER-rocm-5.5.1/bump_rocsolver_version.sh000077500000000000000000000014711436600607200213520ustar00rootroot00000000000000#!/bin/sh # run this script in develop after merging develop/staging into master at the feature-complete date # Edit script to bump versions for new development cycle/release. 
OLD_ROCSOLVER_VERSION="3.21.0" NEW_ROCSOLVER_VERSION="3.22.0" sed -i "s/${OLD_ROCSOLVER_VERSION}/${NEW_ROCSOLVER_VERSION}/g" CMakeLists.txt # for documentation OLD_ROCSOLVER_DOCS_VERSION="3.21" NEW_ROCSOLVER_DOCS_VERSION="3.22" sed -i "s/${OLD_ROCSOLVER_DOCS_VERSION}/${NEW_ROCSOLVER_DOCS_VERSION}/g" docs/source/conf.py # for rocBLAS package requirements OLD_ROCBLAS_VERSION_DOWN="2.47" NEW_ROCBLAS_VERSION_DOWN="2.48" OLD_ROCBLAS_VERSION_UP="2.48" NEW_ROCBLAS_VERSION_UP="2.49" sed -i "s/${OLD_ROCBLAS_VERSION_UP}/${NEW_ROCBLAS_VERSION_UP}/g" CMakeLists.txt sed -i "s/${OLD_ROCBLAS_VERSION_DOWN}/${NEW_ROCBLAS_VERSION_DOWN}/g" CMakeLists.txt rocSOLVER-rocm-5.5.1/clients/000077500000000000000000000000001436600607200156435ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/clients/CMakeLists.txt000077500000000000000000000100201436600607200203770ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2019-2022 Advanced Micro Devices, Inc. # ######################################################################## project(rocsolver-clients LANGUAGES C CXX) if(UNIX) enable_language(Fortran) endif() # Specify where to put the client binaries set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging") # The rocsolver target will exist if the library is being built along with the clients, # but if this is a clients-only build, we'll have to search for it. 
if(NOT TARGET rocsolver) find_package(rocsolver REQUIRED CONFIG PATHS ${ROCM_PATH}/rocsolver /opt/rocm/rocsolver) get_imported_target_location(location roc::rocsolver) message(STATUS "Found rocSOLVER: ${location}") endif() if(BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) if(ROCSOLVER_FIND_PACKAGE_LAPACK_CONFIG) find_package(LAPACK 3.7 REQUIRED CONFIG) else() find_package(LAPACK 3.7 REQUIRED) endif() if(NOT LAPACK_LIBRARIES) set(LAPACK_LIBRARIES ${LAPACK_blas_LIBRARIES} ${LAPACK_lapack_LIBRARIES} ) endif() add_library(clients-common INTERFACE) target_include_directories(clients-common INTERFACE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include ) target_link_libraries(clients-common INTERFACE ${LAPACK_LIBRARIES} fmt::fmt ) target_link_options(clients-common INTERFACE ${LAPACK_LINKER_FLAGS} ) set(explicit_inst_files common/testing_laswp.cpp common/testing_larfg.cpp common/testing_larf.cpp common/testing_larft.cpp common/testing_larfb.cpp common/testing_latrd.cpp common/testing_labrd.cpp common/testing_lauum.cpp common/testing_bdsqr.cpp common/testing_bdsvdx.cpp common/testing_steqr.cpp common/testing_stedc.cpp common/testing_stein.cpp common/testing_lasyf.cpp common/testing_potf2_potrf.cpp common/testing_potrs.cpp common/testing_posv.cpp common/testing_potri.cpp common/testing_getf2_getrf_npvt.cpp common/testing_getf2_getrf.cpp common/testing_geqr2_geqrf.cpp common/testing_gerq2_gerqf.cpp common/testing_geql2_geqlf.cpp common/testing_gelq2_gelqf.cpp common/testing_getrs.cpp common/testing_gesv.cpp common/testing_gesvd.cpp common/testing_gesvdj.cpp common/testing_gesvdx.cpp common/testing_trtri.cpp common/testing_getri.cpp common/testing_getri_npvt.cpp common/testing_getri_outofplace.cpp common/testing_getri_npvt_outofplace.cpp common/testing_gels.cpp common/testing_gebd2_gebrd.cpp common/testing_sytf2_sytrf.cpp common/testing_sterf.cpp common/testing_stebz.cpp common/testing_orgxr_ungxr.cpp common/testing_orgxl_ungxl.cpp common/testing_orglx_unglx.cpp 
common/testing_orgbr_ungbr.cpp common/testing_orgtr_ungtr.cpp common/testing_ormxr_unmxr.cpp common/testing_ormxl_unmxl.cpp common/testing_ormlx_unmlx.cpp common/testing_ormbr_unmbr.cpp common/testing_ormtr_unmtr.cpp common/testing_sytxx_hetxx.cpp common/testing_sygsx_hegsx.cpp common/testing_syev_heev.cpp common/testing_syevd_heevd.cpp common/testing_syevj_heevj.cpp common/testing_syevx_heevx.cpp common/testing_sygv_hegv.cpp common/testing_sygvd_hegvd.cpp common/testing_sygvj_hegvj.cpp common/testing_sygvx_hegvx.cpp common/testing_lacgv.cpp common/testing_geblttrf_npvt.cpp common/testing_geblttrs_npvt.cpp ) set(common_source_files common/lapack_host_reference.cpp rocblascommon/clients_utility.cpp rocblascommon/program_options.cpp ${explicit_inst_files} ) prepend_path("${CMAKE_CURRENT_SOURCE_DIR}/" common_source_files common_source_paths) target_sources(clients-common INTERFACE ${common_source_paths}) if(BUILD_CLIENTS_BENCHMARKS) add_subdirectory(benchmarks) endif() if(BUILD_CLIENTS_TESTS) add_subdirectory(gtest) endif() endif() if(BUILD_CLIENTS_SAMPLES) add_subdirectory(samples) endif() if(BUILD_CLIENTS_EXTRA_TESTS) add_subdirectory(extras) endif() rocSOLVER-rocm-5.5.1/clients/benchmarks/000077500000000000000000000000001436600607200177605ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/clients/benchmarks/CMakeLists.txt000077500000000000000000000012431436600607200225230ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2016-2022 Advanced Micro Devices, Inc. 
# ######################################################################## add_executable(rocsolver-bench client.cpp) add_armor_flags(rocsolver-bench "${ARMOR_LEVEL}") target_link_libraries(rocsolver-bench PRIVATE Threads::Threads hip::device rocsolver-common clients-common roc::rocsolver ) # Turn on f16c intrinsics target_compile_options(rocsolver-bench PRIVATE -mf16c) target_compile_definitions(rocsolver-bench PRIVATE ROCM_USE_FLOAT16 ROCSOLVER_CLIENTS_BENCH ) rocm_install(TARGETS rocsolver-bench COMPONENT benchmarks) rocSOLVER-rocm-5.5.1/clients/benchmarks/client.cpp000066400000000000000000000551611436600607200217520ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2016-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #include #include "rocblascommon/program_options.hpp" #include "rocsolver_dispatcher.hpp" using namespace roc; // clang-format off const char* help_str = R"HELP_STR( rocSOLVER benchmark client help. Usage: ./rocsolver-bench In addition to some common general options, the following list of options corresponds to all the parameters that might be needed to test a given rocSOLVER function. The parameters are named as in the API user guide. The arrays are initialized internally by the program with random values. Note: When a required parameter/option is not provided, it will take the default value as listed below. If no default value is defined, the program will try to calculate a suitable value depending on the context of the problem and the tested function; if this is not possible, the program will abort with an error. Functions that accept multiple size parameters can generally be provided a single size parameter (typically, m) and a square-size matrix will be assumed. 
Example: ./rocsolver-bench -f getf2_batched -m 30 --lda 75 --batch_count 350 This will test getf2_batched with a set of 350 random 30x30 matrices. strideP will be set to be equal to 30. Options: )HELP_STR"; // clang-format on static std::string rocblas_version() { size_t size; rocblas_get_version_string_size(&size); std::string str(size - 1, '\0'); rocblas_get_version_string(str.data(), size); return str; } static std::string rocsolver_version() { size_t size; rocsolver_get_version_string_size(&size); std::string str(size - 1, '\0'); rocsolver_get_version_string(str.data(), size); return str; } static void print_version_info() { fmt::print("rocSOLVER version {} (with rocBLAS {})\n", rocsolver_version(), rocblas_version()); std::fflush(stdout); } int main(int argc, char* argv[]) try { Arguments argus; // disable unit_check in client benchmark, it is only // used in gtest unit test argus.unit_check = 0; // enable timing check,otherwise no performance data collected argus.timing = 1; std::string function; char precision = 's'; rocblas_int device_id = 0; // take arguments and set default values // clang-format off options_description desc("rocsolver client command line options"); desc.add_options()("help,h", "Produces this help message.") // test options ("batch_count", value(&argus.batch_count)->default_value(1), "Number of matrices or problem instances in the batch.\n" " Only applicable to batch routines.\n" " ") ("device", value(&device_id)->default_value(0), "Set the default device to be used for subsequent program runs.\n" " ") ("function,f", value(&function)->default_value("potf2"), "The LAPACK function to test.\n" " Options are: getf2, getrf, gesvd_batched, etc.\n" " ") ("iters,i", value(&argus.iters)->default_value(10), "Iterations to run inside the GPU timing loop.\n" " Reported time will be the average.\n" " ") ("mem_query", value(&argus.mem_query)->default_value(0), "Calculate the required amount of device workspace memory? 
0 = No, 1 = Yes.\n" " This forces the client to print only the amount of device memory required by\n" " the function, in bytes.\n" " ") ("perf", value(&argus.perf)->default_value(0), "Ignore CPU timing results? 0 = No, 1 = Yes.\n" " This forces the client to print only the GPU time and the error if requested.\n" " ") ("precision,r", value(&precision)->default_value('s'), "Precision to be used in the tests.\n" " Options are: s, d, c, z.\n" " ") ("profile", value(&argus.profile)->default_value(0), "Print profile logging results for the tested function.\n" " The argument specifies the max depth of the nested output.\n" " If the argument is unset or <= 0, profile logging is disabled.\n" " ") ("profile_kernels", value(&argus.profile_kernels)->default_value(0), "Include kernels in profile logging results? 0 = No, 1 = Yes.\n" " Used in conjunction with --profile to include kernels in the profile log.\n" " ") ("singular", value(&argus.singular)->default_value(0), "Test with degenerate matrices? 0 = No, 1 = Yes\n" " This will produce matrices that are singular, non positive-definite, etc.\n" " ") ("verify,v", value(&argus.norm_check)->default_value(0), "Validate GPU results with CPU? 
0 = No, 1 = Yes.\n" " This will additionally print the relative error of the computations.\n" " ") // size options ("k", value(), "Matrix/vector size parameter.\n" " Represents a sub-dimension of a problem.\n" " For example, the number of Householder reflections in a transformation.\n" " ") ("m", value(), "Matrix/vector size parameter.\n" " Typically, the number of rows of a matrix.\n" " ") ("n", value(), "Matrix/vector size parameter.\n" " Typically, the number of columns of a matrix,\n" " or the order of a system or transformation.\n" " ") ("nrhs", value(), "Matrix/vector size parameter.\n" " Typically, the number of columns of a matrix on the right-hand side of a problem.\n" " ") // leading dimension options ("lda", value(), "Matrix size parameter.\n" " Leading dimension of matrices A.\n" " ") ("ldb", value(), "Matrix size parameter.\n" " Leading dimension of matrices B.\n" " ") ("ldc", value(), "Matrix size parameter.\n" " Leading dimension of matrices C.\n" " ") ("ldt", value(), "Matrix size parameter.\n" " Leading dimension of matrices T.\n" " ") ("ldu", value(), "Matrix size parameter.\n" " Leading dimension of matrices U.\n" " ") ("ldv", value(), "Matrix size parameter.\n" " Leading dimension of matrices V.\n" " ") ("ldw", value(), "Matrix size parameter.\n" " Leading dimension of matrices W.\n" " ") ("ldx", value(), "Matrix size parameter.\n" " Leading dimension of matrices X.\n" " ") ("ldy", value(), "Matrix size parameter.\n" " Leading dimension of matrices Y.\n" " ") ("ldz", value(), "Matrix size parameter.\n" " Leading dimension of matrices Z.\n" " ") // stride options ("strideA", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors A.\n" " ") ("strideB", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors B.\n" " ") ("strideD", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors D.\n" " ") ("strideE", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors E.\n" " 
") ("strideF", value(), "Matrix/vector stride parameter.\n" " Stride for vectors ifail.\n" " ") ("strideQ", value(), "Matrix/vector stride parameter.\n" " Stride for vectors tauq.\n" " ") ("strideP", value(), "Matrix/vector stride parameter.\n" " Stride for vectors tau, taup, and ipiv.\n" " ") ("strideS", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors S.\n" " ") ("strideU", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors U.\n" " ") ("strideV", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors V.\n" " ") ("strideW", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors W.\n" " ") ("strideX", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors X.\n" " ") ("strideZ", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors Z.\n" " ") // bdsqr options ("nc", value()->default_value(0), "The number of columns of matrix C.\n" " Only applicable to bdsqr.\n" " ") ("nu", value(), "The number of columns of matrix U.\n" " Only applicable to bdsqr.\n" " ") ("nv", value()->default_value(0), "The number of columns of matrix V.\n" " Only applicable to bdsqr.\n" " ") // bdsvdx options ("svect", value()->default_value('N'), "N = none, S or V = the singular vectors are computed.\n" " Indicates how the left singular vectors are to be calculated and stored.\n" " Only applicable to bdsvdx.\n" " ") // laswp options ("k1", value(), "First index for row interchange.\n" " Only applicable to laswp.\n" " ") ("k2", value(), "Last index for row interchange.\n" " Only applicable to laswp.\n" " ") // gesvd options ("left_svect", value()->default_value('N'), "N = none, A = the entire orthogonal matrix is computed,\n" " S or V = the singular vectors are computed,\n" " O = the singular vectors overwrite the original matrix.\n" " Indicates how the left singular vectors are to be calculated and stored.\n" " ") ("right_svect", value()->default_value('N'), "N = 
none, A = the entire orthogonal matrix is computed,\n" " S or V = the singular vectors are computed,\n" " O = the singular vectors overwrite the original matrix.\n" " Indicates how the right singular vectors are to be calculated and stored.\n" " ") // stein options ("nev", value(), "Number of eigenvectors to compute in a partial decomposition.\n" " Only applicable to stein.\n" " ") // trtri options ("diag", value()->default_value('N'), "N = non-unit triangular, U = unit triangular.\n" " Indicates whether the diagonal elements of a triangular matrix are assumed to be one.\n" " Only applicable to trtri.\n" " ") // stebz options ("eorder", value()->default_value('E'), "E = entire matrix, B = by blocks.\n" " Indicates whether the computed eigenvalues are ordered by blocks or for the entire matrix.\n" " Only applicable to stebz.\n" " ") // geblttrf/geblttrs options ("nb", value(), "Number of rows and columns in each block.\n" " Only applicable to block tridiagonal matrix APIs.\n" " ") ("nblocks", value(), "Number of blocks along the diagonal.\n" " Only applicable to block tridiagonal matrix APIs.\n" " ") // partial eigenvalue/singular value decomposition options ("il", value(), "Lower index in ordered subset of eigenvalues.\n" " Used in partial eigenvalue decomposition functions.\n" " ") ("iu", value(), "Upper index in ordered subset of eigenvalues.\n" " Used in partial eigenvalue decomposition functions.\n" " ") ("erange", value()->default_value('A'), "A = all eigenvalues, V = in (vl, vu], I = from the il-th to the iu-th.\n" " For partial eigenvalue decompositions, it indicates the type of interval in which\n" " the eigenvalues will be found.\n" " ") ("srange", value()->default_value('A'), "A = all singular values, V = in (vl, vu], I = from the il-th to the iu-th.\n" " For partial singular value decompositions, it indicates the type of interval in which\n" " the singular values will be found.\n" " ") ("vl", value(), "Lower bound of half-open interval (vl, vu].\n" " 
Used in partial eigenvalue decomposition functions.\n" " Note: the used random input matrices have all eigenvalues in [-20, 20].\n" " ") ("vu", value(), "Upper bound of half-open interval (vl, vu].\n" " Used in partial eigenvalue decomposition functions.\n" " Note: the used random input matrices have all eigenvalues in [-20, 20].\n" " ") // iterative Jacobi options ("max_sweeps", value()->default_value(100), "Maximum number of sweeps/iterations.\n" " Used in iterative Jacobi functions.\n" " ") ("esort", value()->default_value('A'), "N = no sorting, A = ascending order.\n" " Indicates whether the computed eigenvalues are sorted in ascending order.\n" " Used in iterative Jacobi functions.\n" " ") // other options ("abstol", value()->default_value(0), "Absolute tolerance at which convergence is accepted.\n" " Used in iterative Jacobi and partial eigenvalue decomposition functions.\n" " ") ("direct", value()->default_value('F'), "F = forward, B = backward.\n" " The order in which a series of transformations are applied.\n" " ") ("evect", value()->default_value('N'), "N = none, V = compute eigenvectors of the matrix,\n" " I = compute eigenvectors of the tridiagonal matrix.\n" " Indicates how the eigenvectors are to be calculated and stored.\n" " ") ("fast_alg", value()->default_value('O'), "O = out-of-place, I = in-place.\n" " Enables out-of-place computations.\n" " ") ("incx", value()->default_value(1), "Increment between values in vector x.\n" " ") ("itype", value()->default_value('1'), "1 = Ax, 2 = ABx, 3 = BAx.\n" " Problem type for generalized eigenproblems.\n" " ") ("side", value(), "L = left, R = right.\n" " The side from which a matrix should be multiplied.\n" " ") ("storev", value(), "C = column-wise, R = row-wise.\n" " Indicates whether data is stored column-wise or row-wise.\n" " ") ("trans", value()->default_value('N'), "N = no transpose, T = transpose, C = conjugate transpose.\n" " Indicates if a matrix should be transposed.\n" " ") ("uplo", 
value()->default_value('U'), "U = upper, L = lower.\n" " Indicates where the data for a triangular or symmetric/hermitian matrix is stored.\n" " "); // clang-format on variables_map vm; store(parse_command_line(argc, argv, desc), vm); notify(vm); // print help message if(vm.count("help")) { fmt::print("{}{}\n", help_str, desc); return 0; } argus.populate(vm); if(!argus.perf) { print_version_info(); rocblas_int device_count = query_device_property(); if(device_count <= 0) throw std::runtime_error("No devices found"); if(device_count <= device_id) throw std::invalid_argument("Invalid Device ID"); } set_device(device_id); // catch invalid arguments argus.validate_precision("precision"); argus.validate_operation("trans"); argus.validate_side("side"); argus.validate_fill("uplo"); argus.validate_diag("diag"); argus.validate_direct("direct"); argus.validate_storev("storev"); argus.validate_svect("svect"); argus.validate_svect("left_svect"); argus.validate_svect("right_svect"); argus.validate_erange("srange"); argus.validate_workmode("fast_alg"); argus.validate_evect("evect"); argus.validate_erange("erange"); argus.validate_eorder("eorder"); argus.validate_esort("esort"); argus.validate_itype("itype"); // prepare logging infrastructure and ignore environment variables rocsolver_log_begin(); rocsolver_log_set_layer_mode(rocblas_layer_mode_none); // select and dispatch function test/benchmark rocsolver_dispatcher::invoke(function, precision, argus); // terminate logging rocsolver_log_end(); return 0; } catch(const std::exception& exp) { fmt::print(stderr, "{}\n", exp.what()); return -1; } rocSOLVER-rocm-5.5.1/clients/common/000077500000000000000000000000001436600607200171335ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/clients/common/lapack_host_reference.cpp000066400000000000000000007702111436600607200241550ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2016-2022 Advanced Micro Devices, Inc. 
* ************************************************************************/ #include #include "lapack_host_reference.hpp" /*!\file * \brief provide template functions interfaces to BLAS and LAPACK interfaces, it is * only used for testing, not part of the GPU library */ /*************************************************************************/ // Function declarations for LAPACK-provided functions with gfortran-style // name mangling (lowercase name with trailing underscore). #ifdef __cplusplus extern "C" { #endif void sgemv_(char* transA, int* m, int* n, float* alpha, float* A, int* lda, float* x, int* incx, float* beta, float* y, int* incy); void dgemv_(char* transA, int* m, int* n, double* alpha, double* A, int* lda, double* x, int* incx, double* beta, double* y, int* incy); void cgemv_(char* transA, int* m, int* n, rocblas_float_complex* alpha, rocblas_float_complex* A, int* lda, rocblas_float_complex* x, int* incx, rocblas_float_complex* beta, rocblas_float_complex* y, int* incy); void zgemv_(char* transA, int* m, int* n, rocblas_double_complex* alpha, rocblas_double_complex* A, int* lda, rocblas_double_complex* x, int* incx, rocblas_double_complex* beta, rocblas_double_complex* y, int* incy); void sgemm_(char* transA, char* transB, int* m, int* n, int* k, float* alpha, float* A, int* lda, float* B, int* ldb, float* beta, float* C, int* ldc); void dgemm_(char* transA, char* transB, int* m, int* n, int* k, double* alpha, double* A, int* lda, double* B, int* ldb, double* beta, double* C, int* ldc); void cgemm_(char* transA, char* transB, int* m, int* n, int* k, rocblas_float_complex* alpha, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb, rocblas_float_complex* beta, rocblas_float_complex* C, int* ldc); void zgemm_(char* transA, char* transB, int* m, int* n, int* k, rocblas_double_complex* alpha, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb, rocblas_double_complex* beta, rocblas_double_complex* C, int* ldc); 
void ssymv_(char* uplo, int* n, float* alpha, float* A, int* lda, float* x, int* incx, float* beta, float* y, int* incy); void dsymv_(char* uplo, int* n, double* alpha, double* A, int* lda, double* x, int* incx, double* beta, double* y, int* incy); void chemv_(char* uplo, int* n, rocblas_float_complex* alpha, rocblas_float_complex* A, int* lda, rocblas_float_complex* x, int* incx, rocblas_float_complex* beta, rocblas_float_complex* y, int* incy); void zhemv_(char* uplo, int* n, rocblas_double_complex* alpha, rocblas_double_complex* A, int* lda, rocblas_double_complex* x, int* incx, rocblas_double_complex* beta, rocblas_double_complex* y, int* incy); void ssymm_(char* side, char* uplo, int* m, int* n, float* alpha, float* A, int* lda, float* B, int* ldb, float* beta, float* C, int* ldc); void dsymm_(char* side, char* uplo, int* m, int* n, double* alpha, double* A, int* lda, double* B, int* ldb, double* beta, double* C, int* ldc); void chemm_(char* side, char* uplo, int* m, int* n, rocblas_float_complex* alpha, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb, rocblas_float_complex* beta, rocblas_float_complex* C, int* ldc); void zhemm_(char* side, char* uplo, int* m, int* n, rocblas_double_complex* alpha, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb, rocblas_double_complex* beta, rocblas_double_complex* C, int* ldc); void strmm_(char* side, char* uplo, char* transA, char* diag, int* m, int* n, float* alpha, float* A, int* lda, float* B, int* ldb); void dtrmm_(char* side, char* uplo, char* transA, char* diag, int* m, int* n, double* alpha, double* A, int* lda, double* B, int* ldb); void ctrmm_(char* side, char* uplo, char* transA, char* diag, int* m, int* n, rocblas_float_complex* alpha, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb); void ztrmm_(char* side, char* uplo, char* transA, char* diag, int* m, int* n, rocblas_double_complex* alpha, rocblas_double_complex* A, int* lda, 
rocblas_double_complex* B, int* ldb); void strsm_(char* side, char* uplo, char* transA, char* diag, int* m, int* n, float* alpha, float* A, int* lda, float* B, int* ldb); void dtrsm_(char* side, char* uplo, char* transA, char* diag, int* m, int* n, double* alpha, double* A, int* lda, double* B, int* ldb); void ctrsm_(char* side, char* uplo, char* transA, char* diag, int* m, int* n, rocblas_float_complex* alpha, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb); void ztrsm_(char* side, char* uplo, char* transA, char* diag, int* m, int* n, rocblas_double_complex* alpha, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb); void strsv_(char* uplo, char* transA, char* diag, int* n, float* A, int* lda, float* x, int* incx); void dtrsv_(char* uplo, char* transA, char* diag, int* n, double* A, int* lda, double* x, int* incx); void ctrsv_(char* uplo, char* transA, char* diag, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* x, int* incx); void ztrsv_(char* uplo, char* transA, char* diag, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* x, int* incx); void strtri_(char* uplo, char* diag, int* n, float* A, int* lda, int* info); void dtrtri_(char* uplo, char* diag, int* n, double* A, int* lda, int* info); void ctrtri_(char* uplo, char* diag, int* n, rocblas_float_complex* A, int* lda, int* info); void ztrtri_(char* uplo, char* diag, int* n, rocblas_double_complex* A, int* lda, int* info); void sgetrf_(int* m, int* n, float* A, int* lda, int* ipiv, int* info); void dgetrf_(int* m, int* n, double* A, int* lda, int* ipiv, int* info); void cgetrf_(int* m, int* n, rocblas_float_complex* A, int* lda, int* ipiv, int* info); void zgetrf_(int* m, int* n, rocblas_double_complex* A, int* lda, int* ipiv, int* info); void spotf2_(char* uplo, int* n, float* A, int* lda, int* info); void dpotf2_(char* uplo, int* n, double* A, int* lda, int* info); void cpotf2_(char* uplo, int* n, rocblas_float_complex* A, int* 
lda, int* info); void zpotf2_(char* uplo, int* n, rocblas_double_complex* A, int* lda, int* info); void spotrf_(char* uplo, int* n, float* A, int* lda, int* info); void dpotrf_(char* uplo, int* n, double* A, int* lda, int* info); void cpotrf_(char* uplo, int* n, rocblas_float_complex* A, int* lda, int* info); void zpotrf_(char* uplo, int* n, rocblas_double_complex* A, int* lda, int* info); void spotrs_(char* uplo, int* n, int* nrhs, float* A, int* lda, float* B, int* ldb, int* info); void dpotrs_(char* uplo, int* n, int* nrhs, double* A, int* lda, double* B, int* ldb, int* info); void cpotrs_(char* uplo, int* n, int* nrhs, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb, int* info); void zpotrs_(char* uplo, int* n, int* nrhs, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb, int* info); void sposv_(char* uplo, int* n, int* nrhs, float* A, int* lda, float* B, int* ldb, int* info); void dposv_(char* uplo, int* n, int* nrhs, double* A, int* lda, double* B, int* ldb, int* info); void cposv_(char* uplo, int* n, int* nrhs, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb, int* info); void zposv_(char* uplo, int* n, int* nrhs, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb, int* info); void spotri_(char* uplo, int* n, float* A, int* lda, int* info); void dpotri_(char* uplo, int* n, double* A, int* lda, int* info); void cpotri_(char* uplo, int* n, rocblas_float_complex* A, int* lda, int* info); void zpotri_(char* uplo, int* n, rocblas_double_complex* A, int* lda, int* info); void sgetf2_(int* m, int* n, float* A, int* lda, int* ipiv, int* info); void dgetf2_(int* m, int* n, double* A, int* lda, int* ipiv, int* info); void cgetf2_(int* m, int* n, rocblas_float_complex* A, int* lda, int* ipiv, int* info); void zgetf2_(int* m, int* n, rocblas_double_complex* A, int* lda, int* ipiv, int* info); void sgetrs_(char* trans, int* n, int* nrhs, float* A, int* lda, int* ipiv, float* B, int* 
ldb, int* info); void dgetrs_(char* trans, int* n, int* nrhs, double* A, int* lda, int* ipiv, double* B, int* ldb, int* info); void cgetrs_(char* trans, int* n, int* nrhs, rocblas_float_complex* A, int* lda, int* ipiv, rocblas_float_complex* B, int* ldb, int* info); void zgetrs_(char* trans, int* n, int* nrhs, rocblas_double_complex* A, int* lda, int* ipiv, rocblas_double_complex* B, int* ldb, int* info); void sgesv_(int* n, int* nrhs, float* A, int* lda, int* ipiv, float* B, int* ldb, int* info); void dgesv_(int* n, int* nrhs, double* A, int* lda, int* ipiv, double* B, int* ldb, int* info); void cgesv_(int* n, int* nrhs, rocblas_float_complex* A, int* lda, int* ipiv, rocblas_float_complex* B, int* ldb, int* info); void zgesv_(int* n, int* nrhs, rocblas_double_complex* A, int* lda, int* ipiv, rocblas_double_complex* B, int* ldb, int* info); void sgels_(char* trans, int* m, int* n, int* nrhs, float* A, int* lda, float* B, int* ldb, float* work, int* lwork, int* info); void dgels_(char* trans, int* m, int* n, int* nrhs, double* A, int* lda, double* B, int* ldb, double* work, int* lwork, int* info); void cgels_(char* trans, int* m, int* n, int* nrhs, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb, rocblas_float_complex* work, int* lwork, int* info); void zgels_(char* trans, int* m, int* n, int* nrhs, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb, rocblas_double_complex* work, int* lwork, int* info); void sgetri_(int* n, float* A, int* lda, int* ipiv, float* work, int* lwork, int* info); void dgetri_(int* n, double* A, int* lda, int* ipiv, double* work, int* lwork, int* info); void cgetri_(int* n, rocblas_float_complex* A, int* lda, int* ipiv, rocblas_float_complex* work, int* lwork, int* info); void zgetri_(int* n, rocblas_double_complex* A, int* lda, int* ipiv, rocblas_double_complex* work, int* lwork, int* info); void strtri_(char* uplo, char* diag, int* n, float* A, int* lda, int* info); void dtrtri_(char* uplo, 
char* diag, int* n, double* A, int* lda, int* info); void ctrtri_(char* uplo, char* diag, int* n, rocblas_float_complex* A, int* lda, int* info); void ztrtri_(char* uplo, char* diag, int* n, rocblas_double_complex* A, int* lda, int* info); void slarfg_(int* n, float* alpha, float* x, int* incx, float* tau); void dlarfg_(int* n, double* alpha, double* x, int* incx, double* tau); void clarfg_(int* n, rocblas_float_complex* alpha, rocblas_float_complex* x, int* incx, rocblas_float_complex* tau); void zlarfg_(int* n, rocblas_double_complex* alpha, rocblas_double_complex* x, int* incx, rocblas_double_complex* tau); void slarf_(char* side, int* m, int* n, float* x, int* incx, float* alpha, float* A, int* lda, float* work); void dlarf_(char* side, int* m, int* n, double* x, int* incx, double* alpha, double* A, int* lda, double* work); void clarf_(char* side, int* m, int* n, rocblas_float_complex* x, int* incx, rocblas_float_complex* alpha, rocblas_float_complex* A, int* lda, rocblas_float_complex* work); void zlarf_(char* side, int* m, int* n, rocblas_double_complex* x, int* incx, rocblas_double_complex* alpha, rocblas_double_complex* A, int* lda, rocblas_double_complex* work); void slarft_(char* direct, char* storev, int* n, int* k, float* V, int* ldv, float* tau, float* T, int* ldt); void dlarft_(char* direct, char* storev, int* n, int* k, double* V, int* ldv, double* tau, double* T, int* ldt); void clarft_(char* direct, char* storev, int* n, int* k, rocblas_float_complex* V, int* ldv, rocblas_float_complex* tau, rocblas_float_complex* T, int* ldt); void zlarft_(char* direct, char* storev, int* n, int* k, rocblas_double_complex* V, int* ldv, rocblas_double_complex* tau, rocblas_double_complex* T, int* ldt); void sbdsqr_(char* uplo, int* n, int* nv, int* nu, int* nc, float* D, float* E, float* V, int* ldv, float* U, int* ldu, float* C, int* ldc, float* W, int* info); void dbdsqr_(char* uplo, int* n, int* nv, int* nu, int* nc, double* D, double* E, double* V, int* ldv, 
double* U, int* ldu, double* C, int* ldc, double* W, int* info); void cbdsqr_(char* uplo, int* n, int* nv, int* nu, int* nc, float* D, float* E, rocblas_float_complex* V, int* ldv, rocblas_float_complex* U, int* ldu, rocblas_float_complex* C, int* ldc, float* W, int* info); void zbdsqr_(char* uplo, int* n, int* nv, int* nu, int* nc, double* D, double* E, rocblas_double_complex* V, int* ldv, rocblas_double_complex* U, int* ldu, rocblas_double_complex* C, int* ldc, double* W, int* info); void slarfb_(char* side, char* trans, char* direct, char* storev, int* m, int* n, int* k, float* V, int* ldv, float* T, int* ldt, float* A, int* lda, float* W, int* ldw); void dlarfb_(char* side, char* trans, char* direct, char* storev, int* m, int* n, int* k, double* V, int* ldv, double* T, int* ldt, double* A, int* lda, double* W, int* ldw); void clarfb_(char* side, char* trans, char* direct, char* storev, int* m, int* n, int* k, rocblas_float_complex* V, int* ldv, rocblas_float_complex* T, int* ldt, rocblas_float_complex* A, int* lda, rocblas_float_complex* W, int* ldw); void zlarfb_(char* side, char* trans, char* direct, char* storev, int* m, int* n, int* k, rocblas_double_complex* V, int* ldv, rocblas_double_complex* T, int* ldt, rocblas_double_complex* A, int* lda, rocblas_double_complex* W, int* ldw); void slatrd_(char* uplo, int* n, int* k, float* A, int* lda, float* E, float* tau, float* W, int* ldw); void dlatrd_(char* uplo, int* n, int* k, double* A, int* lda, double* E, double* tau, double* W, int* ldw); void clatrd_(char* uplo, int* n, int* k, rocblas_float_complex* A, int* lda, float* E, rocblas_float_complex* tau, rocblas_float_complex* W, int* ldw); void zlatrd_(char* uplo, int* n, int* k, rocblas_double_complex* A, int* lda, double* E, rocblas_double_complex* tau, rocblas_double_complex* W, int* ldw); void slabrd_(int* m, int* n, int* nb, float* A, int* lda, float* D, float* E, float* tauq, float* taup, float* X, int* ldx, float* Y, int* ldy); void dlabrd_(int* m, 
int* n, int* nb, double* A, int* lda, double* D, double* E, double* tauq, double* taup, double* X, int* ldx, double* Y, int* ldy); void clabrd_(int* m, int* n, int* nb, rocblas_float_complex* A, int* lda, float* D, float* E, rocblas_float_complex* tauq, rocblas_float_complex* taup, rocblas_float_complex* X, int* ldx, rocblas_float_complex* Y, int* ldy); void zlabrd_(int* m, int* n, int* nb, rocblas_double_complex* A, int* lda, double* D, double* E, rocblas_double_complex* tauq, rocblas_double_complex* taup, rocblas_double_complex* X, int* ldx, rocblas_double_complex* Y, int* ldy); void sgeqr2_(int* m, int* n, float* A, int* lda, float* ipiv, float* work, int* info); void dgeqr2_(int* m, int* n, double* A, int* lda, double* ipiv, double* work, int* info); void cgeqr2_(int* m, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* info); void zgeqr2_(int* m, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* info); void sgeqrf_(int* m, int* n, float* A, int* lda, float* ipiv, float* work, int* lwork, int* info); void dgeqrf_(int* m, int* n, double* A, int* lda, double* ipiv, double* work, int* lwork, int* info); void cgeqrf_(int* m, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* lwork, int* info); void zgeqrf_(int* m, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* lwork, int* info); void sgerq2_(int* m, int* n, float* A, int* lda, float* ipiv, float* work, int* info); void dgerq2_(int* m, int* n, double* A, int* lda, double* ipiv, double* work, int* info); void cgerq2_(int* m, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* info); void zgerq2_(int* m, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* info); void 
sgerqf_(int* m, int* n, float* A, int* lda, float* ipiv, float* work, int* lwork, int* info); void dgerqf_(int* m, int* n, double* A, int* lda, double* ipiv, double* work, int* lwork, int* info); void cgerqf_(int* m, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* lwork, int* info); void zgerqf_(int* m, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* lwork, int* info); void sgeql2_(int* m, int* n, float* A, int* lda, float* ipiv, float* work, int* info); void dgeql2_(int* m, int* n, double* A, int* lda, double* ipiv, double* work, int* info); void cgeql2_(int* m, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* info); void zgeql2_(int* m, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* info); void sgeqlf_(int* m, int* n, float* A, int* lda, float* ipiv, float* work, int* lwork, int* info); void dgeqlf_(int* m, int* n, double* A, int* lda, double* ipiv, double* work, int* lwork, int* info); void cgeqlf_(int* m, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* lwork, int* info); void zgeqlf_(int* m, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* lwork, int* info); void sgelq2_(int* m, int* n, float* A, int* lda, float* ipiv, float* work, int* info); void dgelq2_(int* m, int* n, double* A, int* lda, double* ipiv, double* work, int* info); void cgelq2_(int* m, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* info); void zgelq2_(int* m, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* info); void sgelqf_(int* m, int* n, float* A, int* lda, float* ipiv, float* work, int* lwork, int* info); void 
dgelqf_(int* m, int* n, double* A, int* lda, double* ipiv, double* work, int* lwork, int* info); void cgelqf_(int* m, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* lwork, int* info); void zgelqf_(int* m, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* lwork, int* info); void clacgv_(int* n, rocblas_float_complex* x, int* incx); void zlacgv_(int* n, rocblas_double_complex* x, int* incx); void slaswp_(int* n, float* A, int* lda, int* k1, int* k2, int* ipiv, int* inc); void dlaswp_(int* n, double* A, int* lda, int* k1, int* k2, int* ipiv, int* inc); void claswp_(int* n, rocblas_float_complex* A, int* lda, int* k1, int* k2, int* ipiv, int* inc); void zlaswp_(int* n, rocblas_double_complex* A, int* lda, int* k1, int* k2, int* ipiv, int* inc); void slauum_(char* uplo, int* n, float* A, int* lda, int* info); void dlauum_(char* uplo, int* n, double* A, int* lda, int* info); void clauum_(char* uplo, int* n, rocblas_float_complex* A, int* lda, int* info); void zlauum_(char* uplo, int* n, rocblas_double_complex* A, int* lda, int* info); void sorg2r_(int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* work, int* info); void dorg2r_(int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* work, int* info); void cung2r_(int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* info); void zung2r_(int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* info); void sorgqr_(int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* work, int* lwork, int* info); void dorgqr_(int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* work, int* lwork, int* info); void cungqr_(int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* 
lwork, int* info); void zungqr_(int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* lwork, int* info); void sorgl2_(int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* work, int* info); void dorgl2_(int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* work, int* info); void cungl2_(int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* info); void zungl2_(int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* info); void sorglq_(int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* work, int* lwork, int* info); void dorglq_(int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* work, int* lwork, int* info); void cunglq_(int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* lwork, int* info); void zunglq_(int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* lwork, int* info); void sorg2l_(int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* work, int* info); void dorg2l_(int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* work, int* info); void cung2l_(int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, int* info); void zung2l_(int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* info); void sorgql_(int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* work, int* lwork, int* info); void dorgql_(int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* work, int* lwork, int* info); void cungql_(int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, 
rocblas_float_complex* work, int* lwork, int* info); void zungql_(int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, int* lwork, int* info); void sorgbr_(char* vect, int* m, int* n, int* k, float* A, int* lda, float* Ipiv, float* work, int* size_w, int* info); void dorgbr_(char* vect, int* m, int* n, int* k, double* A, int* lda, double* Ipiv, double* work, int* size_w, int* info); void cungbr_(char* vect, int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* Ipiv, rocblas_float_complex* work, int* size_w, int* info); void zungbr_(char* vect, int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* Ipiv, rocblas_double_complex* work, int* size_w, int* info); void sorgtr_(char* uplo, int* n, float* A, int* lda, float* Ipiv, float* work, int* size_w, int* info); void dorgtr_(char* uplo, int* n, double* A, int* lda, double* Ipiv, double* work, int* size_w, int* info); void cungtr_(char* uplo, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* Ipiv, rocblas_float_complex* work, int* size_w, int* info); void zungtr_(char* uplo, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* Ipiv, rocblas_double_complex* work, int* size_w, int* info); void sorm2r_(char* side, char* trans, int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* C, int* ldc, float* work, int* info); void dorm2r_(char* side, char* trans, int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* C, int* ldc, double* work, int* info); void cunm2r_(char* side, char* trans, int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, int* ldc, rocblas_float_complex* work, int* info); void zunm2r_(char* side, char* trans, int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, int* ldc, rocblas_double_complex* work, int* 
info);

// ormqr & unmqr
// All arguments are pointers; sizeW follows work (presumably the workspace length -- TODO confirm).
void sormqr_(char* side, char* trans, int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* C, int* ldc, float* work, int* sizeW, int* info);
void dormqr_(char* side, char* trans, int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* C, int* ldc, double* work, int* sizeW, int* info);
void cunmqr_(char* side, char* trans, int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, int* ldc, rocblas_float_complex* work, int* sizeW, int* info);
void zunmqr_(char* side, char* trans, int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, int* ldc, rocblas_double_complex* work, int* sizeW, int* info);

// orml2 & unml2 (work is followed directly by info)
void sorml2_(char* side, char* trans, int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* C, int* ldc, float* work, int* info);
void dorml2_(char* side, char* trans, int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* C, int* ldc, double* work, int* info);
void cunml2_(char* side, char* trans, int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, int* ldc, rocblas_float_complex* work, int* info);
void zunml2_(char* side, char* trans, int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, int* ldc, rocblas_double_complex* work, int* info);

// ormlq & unmlq
void sormlq_(char* side, char* trans, int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* C, int* ldc, float* work, int* sizeW, int* info);
void dormlq_(char* side, char* trans, int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* C, int* ldc, double* work, int* sizeW, int* info);
void cunmlq_(char* side, char* trans, int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, int* ldc, rocblas_float_complex* work, int* sizeW, int* info);
void zunmlq_(char* side,
char* trans, int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, int* ldc, rocblas_double_complex* work, int* sizeW, int* info);

// orm2l & unm2l (work is followed directly by info)
void sorm2l_(char* side, char* trans, int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* C, int* ldc, float* work, int* info);
void dorm2l_(char* side, char* trans, int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* C, int* ldc, double* work, int* info);
void cunm2l_(char* side, char* trans, int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, int* ldc, rocblas_float_complex* work, int* info);
void zunm2l_(char* side, char* trans, int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, int* ldc, rocblas_double_complex* work, int* info);

// ormql & unmql
// sizeW follows work (presumably the workspace length -- TODO confirm).
void sormql_(char* side, char* trans, int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* C, int* ldc, float* work, int* sizeW, int* info);
void dormql_(char* side, char* trans, int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* C, int* ldc, double* work, int* sizeW, int* info);
void cunmql_(char* side, char* trans, int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, int* ldc, rocblas_float_complex* work, int* sizeW, int* info);
void zunmql_(char* side, char* trans, int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, int* ldc, rocblas_double_complex* work, int* sizeW, int* info);

// ormbr & unmbr (same list as ormql/unmql with a leading vect character)
void sormbr_(char* vect, char* side, char* trans, int* m, int* n, int* k, float* A, int* lda, float* ipiv, float* C, int* ldc, float* work, int* sizeW, int* info);
void dormbr_(char* vect, char* side, char* trans, int* m, int* n, int* k, double* A, int* lda, double* ipiv, double* C, int* ldc, double* work, int* sizeW, int* info);
void cunmbr_(char* vect, char*
side, char* trans, int* m, int* n, int* k, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, int* ldc, rocblas_float_complex* work, int* sizeW, int* info);
void zunmbr_(char* vect, char* side, char* trans, int* m, int* n, int* k, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, int* ldc, rocblas_double_complex* work, int* sizeW, int* info);

// ormtr & unmtr (takes side, uplo and trans characters; there is no k argument)
void sormtr_(char* side, char* uplo, char* trans, int* m, int* n, float* A, int* lda, float* ipiv, float* C, int* ldc, float* work, int* sizeW, int* info);
void dormtr_(char* side, char* uplo, char* trans, int* m, int* n, double* A, int* lda, double* ipiv, double* C, int* ldc, double* work, int* sizeW, int* info);
void cunmtr_(char* side, char* uplo, char* trans, int* m, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, int* ldc, rocblas_float_complex* work, int* sizeW, int* info);
void zunmtr_(char* side, char* uplo, char* trans, int* m, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, int* ldc, rocblas_double_complex* work, int* sizeW, int* info);

// gebd2
// In the complex variants D and E stay real (float/double) while A, tauq, taup and work are complex.
void sgebd2_(int* m, int* n, float* A, int* lda, float* D, float* E, float* tauq, float* taup, float* work, int* info);
void dgebd2_(int* m, int* n, double* A, int* lda, double* D, double* E, double* tauq, double* taup, double* work, int* info);
void cgebd2_(int* m, int* n, rocblas_float_complex* A, int* lda, float* D, float* E, rocblas_float_complex* tauq, rocblas_float_complex* taup, rocblas_float_complex* work, int* info);
void zgebd2_(int* m, int* n, rocblas_double_complex* A, int* lda, double* D, double* E, rocblas_double_complex* tauq, rocblas_double_complex* taup, rocblas_double_complex* work, int* info);

// gebrd (gebd2 argument list plus size_w after work)
void sgebrd_(int* m, int* n, float* A, int* lda, float* D, float* E, float* tauq, float* taup, float* work, int* size_w, int* info);
void dgebrd_(int* m, int* n, double* A, int* lda, double*
D, double* E, double* tauq, double* taup, double* work, int* size_w, int* info);
void cgebrd_(int* m, int* n, rocblas_float_complex* A, int* lda, float* D, float* E, rocblas_float_complex* tauq, rocblas_float_complex* taup, rocblas_float_complex* work, int* size_w, int* info);
void zgebrd_(int* m, int* n, rocblas_double_complex* A, int* lda, double* D, double* E, rocblas_double_complex* tauq, rocblas_double_complex* taup, rocblas_double_complex* work, int* size_w, int* info);

// sytrd & hetrd
// D and E are real in every variant; A, tau and work follow the matrix precision.
void ssytrd_(char* uplo, int* n, float* A, int* lda, float* D, float* E, float* tau, float* work, int* size_w, int* info);
void dsytrd_(char* uplo, int* n, double* A, int* lda, double* D, double* E, double* tau, double* work, int* size_w, int* info);
void chetrd_(char* uplo, int* n, rocblas_float_complex* A, int* lda, float* D, float* E, rocblas_float_complex* tau, rocblas_float_complex* work, int* size_w, int* info);
void zhetrd_(char* uplo, int* n, rocblas_double_complex* A, int* lda, double* D, double* E, rocblas_double_complex* tau, rocblas_double_complex* work, int* size_w, int* info);

// sytd2 & hetd2 (no work / size arguments)
void ssytd2_(char* uplo, int* n, float* A, int* lda, float* D, float* E, float* tau, int* info);
void dsytd2_(char* uplo, int* n, double* A, int* lda, double* D, double* E, double* tau, int* info);
void chetd2_(char* uplo, int* n, rocblas_float_complex* A, int* lda, float* D, float* E, rocblas_float_complex* tau, int* info);
void zhetd2_(char* uplo, int* n, rocblas_double_complex* A, int* lda, double* D, double* E, rocblas_double_complex* tau, int* info);

// gesvd
// NOTE(review): in the real variants the array placed before lwork is named E, whereas the complex
// variants name that position work and use E for an extra real array before info. Parameter names
// do not affect linkage; presumably a naming leftover -- TODO confirm.
void sgesvd_(char* jobu, char* jobv, int* m, int* n, float* A, int* lda, float* S, float* U, int* ldu, float* V, int* ldv, float* E, int* lwork, int* info);
void dgesvd_(char* jobu, char* jobv, int* m, int* n, double* A, int* lda, double* S, double* U, int* ldu, double* V, int* ldv, double* E, int* lwork, int* info);
void cgesvd_(char* jobu, char* jobv, int* m, int* n, rocblas_float_complex* A, int* lda, float* S, rocblas_float_complex* U, int* ldu,
rocblas_float_complex* V, int* ldv, rocblas_float_complex* work, int* lwork, float* E, int* info);
void zgesvd_(char* jobu, char* jobv, int* m, int* n, rocblas_double_complex* A, int* lda, double* S, rocblas_double_complex* U, int* ldu, rocblas_double_complex* V, int* ldv, rocblas_double_complex* work, int* lwork, double* E, int* info);

// gesvdx
// vl/vu and S are real in every variant; the complex variants take an extra real rwork array
// between lwork and iwork.
void sgesvdx_(char* jobu, char* jobv, char* srange, int* m, int* n, float* A, int* lda, float* vl, float* vu, int* il, int* iu, int* nsv, float* S, float* U, int* ldu, float* V, int* ldv, float* work, int* lwork, int* iwork, int* info);
void dgesvdx_(char* jobu, char* jobv, char* srange, int* m, int* n, double* A, int* lda, double* vl, double* vu, int* il, int* iu, int* nsv, double* S, double* U, int* ldu, double* V, int* ldv, double* work, int* lwork, int* iwork, int* info);
void cgesvdx_(char* jobu, char* jobv, char* srange, int* m, int* n, rocblas_float_complex* A, int* lda, float* vl, float* vu, int* il, int* iu, int* nsv, float* S, rocblas_float_complex* U, int* ldu, rocblas_float_complex* V, int* ldv, rocblas_float_complex* work, int* lwork, float* rwork, int* iwork, int* info);
void zgesvdx_(char* jobu, char* jobv, char* srange, int* m, int* n, rocblas_double_complex* A, int* lda, double* vl, double* vu, int* il, int* iu, int* nsv, double* S, rocblas_double_complex* U, int* ldu, rocblas_double_complex* V, int* ldv, rocblas_double_complex* work, int* lwork, double* rwork, int* iwork, int* info);

// sterf (real variants only)
void ssterf_(int* n, float* D, float* E, int* info);
void dsterf_(int* n, double* D, double* E, int* info);

// steqr
// D, E and work are real in every variant; only C is complex in the c/z variants.
void ssteqr_(char* evect, int* n, float* D, float* E, float* C, int* ldc, float* work, int* info);
void dsteqr_(char* evect, int* n, double* D, double* E, double* C, int* ldc, double* work, int* info);
void csteqr_(char* evect, int* n, float* D, float* E, rocblas_float_complex* C, int* ldc, float* work, int* info);
void zsteqr_(char* evect, int* n, double* D, double* E, rocblas_double_complex* C, int* ldc, double* work, int*
info);

// stedc
// The complex variants add a real rwork/lrwork pair between lwork and iwork.
void sstedc_(char* evect, int* n, float* D, float* E, float* C, int* ldc, float* work, int* lwork, int* iwork, int* liwork, int* info);
void dstedc_(char* evect, int* n, double* D, double* E, double* C, int* ldc, double* work, int* lwork, int* iwork, int* liwork, int* info);
void cstedc_(char* evect, int* n, float* D, float* E, rocblas_float_complex* C, int* ldc, rocblas_float_complex* work, int* lwork, float* rwork, int* lrwork, int* iwork, int* liwork, int* info);
void zstedc_(char* evect, int* n, double* D, double* E, rocblas_double_complex* C, int* ldc, rocblas_double_complex* work, int* lwork, double* rwork, int* lrwork, int* iwork, int* liwork, int* info);

// stebz (real variants only)
void sstebz_(char* erange, char* order, int* n, float* vl, float* vu, int* il, int* iu, float* abstol, float* D, float* E, int* nev, int* nsplit, float* W, int* iblock, int* isplit, float* work, int* iwork, int* info);
void dstebz_(char* erange, char* order, int* n, double* vl, double* vu, int* il, int* iu, double* abstol, double* D, double* E, int* nev, int* nsplit, double* W, int* iblock, int* isplit, double* work, int* iwork, int* info);

// stein
// Only Z is complex in the c/z variants; D, E, W and work remain real.
void sstein_(int* n, float* D, float* E, int* nev, float* W, int* iblock, int* isplit, float* Z, int* ldz, float* work, int* iwork, int* ifail, int* info);
void dstein_(int* n, double* D, double* E, int* nev, double* W, int* iblock, int* isplit, double* Z, int* ldz, double* work, int* iwork, int* ifail, int* info);
void cstein_(int* n, float* D, float* E, int* nev, float* W, int* iblock, int* isplit, rocblas_float_complex* Z, int* ldz, float* work, int* iwork, int* ifail, int* info);
void zstein_(int* n, double* D, double* E, int* nev, double* W, int* iblock, int* isplit, rocblas_double_complex* Z, int* ldz, double* work, int* iwork, int* ifail, int* info);

// sygs2 & hegs2
void ssygs2_(int* itype, char* uplo, int* n, float* A, int* lda, float* B, int* ldb, int* info);
void dsygs2_(int* itype, char* uplo, int* n, double* A, int* lda, double* B, int* ldb, int* info);
void
chegs2_(int* itype, char* uplo, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb, int* info);
void zhegs2_(int* itype, char* uplo, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb, int* info);

// sygst & hegst (same argument list as sygs2/hegs2)
void ssygst_(int* itype, char* uplo, int* n, float* A, int* lda, float* B, int* ldb, int* info);
void dsygst_(int* itype, char* uplo, int* n, double* A, int* lda, double* B, int* ldb, int* info);
void chegst_(int* itype, char* uplo, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb, int* info);
void zhegst_(int* itype, char* uplo, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb, int* info);

// syev & heev
// W is real in every variant; the complex variants add a real rwork array before info.
void ssyev_(char* evect, char* uplo, int* n, float* A, int* lda, float* W, float* work, int* lwork, int* info);
void dsyev_(char* evect, char* uplo, int* n, double* A, int* lda, double* W, double* work, int* lwork, int* info);
void cheev_(char* evect, char* uplo, int* n, rocblas_float_complex* A, int* lda, float* W, rocblas_float_complex* work, int* lwork, float* rwork, int* info);
void zheev_(char* evect, char* uplo, int* n, rocblas_double_complex* A, int* lda, double* W, rocblas_double_complex* work, int* lwork, double* rwork, int* info);

// syevd & heevd
// Adds iwork/liwork to the syev/heev lists; the complex variants also add rwork/lrwork.
void ssyevd_(char* evect, char* uplo, int* n, float* A, int* lda, float* W, float* work, int* lwork, int* iwork, int* liwork, int* info);
void dsyevd_(char* evect, char* uplo, int* n, double* A, int* lda, double* W, double* work, int* lwork, int* iwork, int* liwork, int* info);
void cheevd_(char* evect, char* uplo, int* n, rocblas_float_complex* A, int* lda, float* W, rocblas_float_complex* work, int* lwork, float* rwork, int* lrwork, int* iwork, int* liwork, int* info);
void zheevd_(char* evect, char* uplo, int* n, rocblas_double_complex* A, int* lda, double* W, rocblas_double_complex* work, int* lwork, double* rwork, int* lrwork, int* iwork, int* liwork, int* info);

// syevx & heevx
void ssyevx_(char* evect, char* erange, char* uplo, int* n,
float* A, int* lda, float* vl, float* vu, int* il, int* iu, float* abstol, int* nev, float* W, float* Z, int* ldz, float* work, int* lwork, int* iwork, int* ifail, int* info);
void dsyevx_(char* evect, char* erange, char* uplo, int* n, double* A, int* lda, double* vl, double* vu, int* il, int* iu, double* abstol, int* nev, double* W, double* Z, int* ldz, double* work, int* lwork, int* iwork, int* ifail, int* info);
// The complex variants keep vl, vu, abstol and W real and add a real rwork array before iwork.
void cheevx_(char* evect, char* erange, char* uplo, int* n, rocblas_float_complex* A, int* lda, float* vl, float* vu, int* il, int* iu, float* abstol, int* nev, float* W, rocblas_float_complex* Z, int* ldz, rocblas_float_complex* work, int* lwork, float* rwork, int* iwork, int* ifail, int* info);
void zheevx_(char* evect, char* erange, char* uplo, int* n, rocblas_double_complex* A, int* lda, double* vl, double* vu, int* il, int* iu, double* abstol, int* nev, double* W, rocblas_double_complex* Z, int* ldz, rocblas_double_complex* work, int* lwork, double* rwork, int* iwork, int* ifail, int* info);

// sygv & hegv
// Takes a leading itype integer and two matrices A and B; W is real in every variant.
void ssygv_(int* itype, char* evect, char* uplo, int* n, float* A, int* lda, float* B, int* ldb, float* W, float* work, int* lwork, int* info);
void dsygv_(int* itype, char* evect, char* uplo, int* n, double* A, int* lda, double* B, int* ldb, double* W, double* work, int* lwork, int* info);
void chegv_(int* itype, char* evect, char* uplo, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb, float* W, rocblas_float_complex* work, int* lwork, float* rwork, int* info);
void zhegv_(int* itype, char* evect, char* uplo, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb, double* W, rocblas_double_complex* work, int* lwork, double* rwork, int* info);

// sygvd & hegvd (sygv/hegv list plus iwork/liwork)
void ssygvd_(int* itype, char* evect, char* uplo, int* n, float* A, int* lda, float* B, int* ldb, float* W, float* work, int* lwork, int* iwork, int* liwork, int* info);
void dsygvd_(int* itype, char* evect, char* uplo, int* n, double* A, int* lda, double* B, int*
ldb, double* W, double* work, int* lwork, int* iwork, int* liwork, int* info);
// The complex variants add a real rwork/lrwork pair between lwork and iwork.
void chegvd_(int* itype, char* evect, char* uplo, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb, float* W, rocblas_float_complex* work, int* lwork, float* rwork, int* lrwork, int* iwork, int* liwork, int* info);
void zhegvd_(int* itype, char* evect, char* uplo, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb, double* W, rocblas_double_complex* work, int* lwork, double* rwork, int* lrwork, int* iwork, int* liwork, int* info);

// sygvx & hegvx
// Here the integer placed after abstol is named m (the syevx/heevx prototypes call that slot nev).
void ssygvx_(int* itype, char* evect, char* erange, char* uplo, int* n, float* A, int* lda, float* B, int* ldb, float* vl, float* vu, int* il, int* iu, float* abstol, int* m, float* W, float* Z, int* ldz, float* work, int* lwork, int* iwork, int* ifail, int* info);
void dsygvx_(int* itype, char* evect, char* erange, char* uplo, int* n, double* A, int* lda, double* B, int* ldb, double* vl, double* vu, int* il, int* iu, double* abstol, int* m, double* W, double* Z, int* ldz, double* work, int* lwork, int* iwork, int* ifail, int* info);
void chegvx_(int* itype, char* evect, char* erange, char* uplo, int* n, rocblas_float_complex* A, int* lda, rocblas_float_complex* B, int* ldb, float* vl, float* vu, int* il, int* iu, float* abstol, int* m, float* W, rocblas_float_complex* Z, int* ldz, rocblas_float_complex* work, int* lwork, float* rwork, int* iwork, int* ifail, int* info);
void zhegvx_(int* itype, char* evect, char* erange, char* uplo, int* n, rocblas_double_complex* A, int* lda, rocblas_double_complex* B, int* ldb, double* vl, double* vu, int* il, int* iu, double* abstol, int* m, double* W, rocblas_double_complex* Z, int* ldz, rocblas_double_complex* work, int* lwork, double* rwork, int* iwork, int* ifail, int* info);

// lasyf (ipiv is an integer array here, unlike the ipiv arrays of the orm*/unm* prototypes)
void slasyf_(char* uplo, int* n, int* nb, int* kb, float* A, int* lda, int* ipiv, float* W, int* ldw, int* info);
void dlasyf_(char* uplo, int* n, int* nb, int* kb, double* A, int* lda,
int* ipiv, double* W, int* ldw, int* info);
void clasyf_(char* uplo, int* n, int* nb, int* kb, rocblas_float_complex* A, int* lda, int* ipiv, rocblas_float_complex* W, int* ldw, int* info);
void zlasyf_(char* uplo, int* n, int* nb, int* kb, rocblas_double_complex* A, int* lda, int* ipiv, rocblas_double_complex* W, int* ldw, int* info);

// sytf2 (symmetric naming for all four precisions, including the complex c/z variants)
void ssytf2_(char* uplo, int* n, float* A, int* lda, int* ipiv, int* info);
void dsytf2_(char* uplo, int* n, double* A, int* lda, int* ipiv, int* info);
void csytf2_(char* uplo, int* n, rocblas_float_complex* A, int* lda, int* ipiv, int* info);
void zsytf2_(char* uplo, int* n, rocblas_double_complex* A, int* lda, int* ipiv, int* info);

// sytrf (sytf2 argument list plus work/lwork)
void ssytrf_(char* uplo, int* n, float* A, int* lda, int* ipiv, float* work, int* lwork, int* info);
void dsytrf_(char* uplo, int* n, double* A, int* lda, int* ipiv, double* work, int* lwork, int* info);
void csytrf_(char* uplo, int* n, rocblas_float_complex* A, int* lda, int* ipiv, rocblas_float_complex* work, int* lwork, int* info);
void zsytrf_(char* uplo, int* n, rocblas_double_complex* A, int* lda, int* ipiv, rocblas_double_complex* work, int* lwork, int* info);

// bdsvdx (real variants only)
void sbdsvdx_(char* uplo, char* svect, char* srange, int* n, float* D, float* E, float* vl, float* vu, int* il, int* iu, int* nsv, float* S, float* Z, int* ldz, float* work, int* iwork, int* info);
void dbdsvdx_(char* uplo, char* svect, char* srange, int* n, double* D, double* E, double* vl, double* vu, int* il, int* iu, int* nsv, double* S, double* Z, int* ldz, double* work, int* iwork, int* info);

// Closes a brace opened before this point, only when compiled as C++
// (presumably the matching extern "C" block -- TODO confirm against the top of the file).
#ifdef __cplusplus
}
#endif

/************************************************************************/
/************************************************************************/

// These are templated functions used in rocSOLVER clients code

// lacgv
// Explicit specializations that pass n and incx by address to the ?lacgv_ routine.
template <>
void cpu_lacgv(rocblas_int n, rocblas_float_complex* x, rocblas_int incx)
{
    clacgv_(&n, x, &incx);
}

template <> void cpu_lacgv(rocblas_int n, rocblas_double_complex* x, rocblas_int
incx) { zlacgv_(&n, x, &incx); } // laswp template <> void cpu_laswp(rocblas_int n, float* A, rocblas_int lda, rocblas_int k1, rocblas_int k2, rocblas_int* ipiv, rocblas_int inc) { slaswp_(&n, A, &lda, &k1, &k2, ipiv, &inc); } template <> void cpu_laswp(rocblas_int n, double* A, rocblas_int lda, rocblas_int k1, rocblas_int k2, rocblas_int* ipiv, rocblas_int inc) { dlaswp_(&n, A, &lda, &k1, &k2, ipiv, &inc); } template <> void cpu_laswp(rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_int k1, rocblas_int k2, rocblas_int* ipiv, rocblas_int inc) { claswp_(&n, A, &lda, &k1, &k2, ipiv, &inc); } template <> void cpu_laswp(rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_int k1, rocblas_int k2, rocblas_int* ipiv, rocblas_int inc) { zlaswp_(&n, A, &lda, &k1, &k2, ipiv, &inc); } // larfg template <> void cpu_larfg(rocblas_int n, float* alpha, float* x, rocblas_int incx, float* tau) { slarfg_(&n, alpha, x, &incx, tau); } template <> void cpu_larfg(rocblas_int n, double* alpha, double* x, rocblas_int incx, double* tau) { dlarfg_(&n, alpha, x, &incx, tau); } template <> void cpu_larfg(rocblas_int n, rocblas_float_complex* alpha, rocblas_float_complex* x, rocblas_int incx, rocblas_float_complex* tau) { clarfg_(&n, alpha, x, &incx, tau); } template <> void cpu_larfg(rocblas_int n, rocblas_double_complex* alpha, rocblas_double_complex* x, rocblas_int incx, rocblas_double_complex* tau) { zlarfg_(&n, alpha, x, &incx, tau); } // larf template <> void cpu_larf(rocblas_side sideR, rocblas_int m, rocblas_int n, float* x, rocblas_int incx, float* alpha, float* A, rocblas_int lda, float* work) { char side = rocblas2char_side(sideR); slarf_(&side, &m, &n, x, &incx, alpha, A, &lda, work); } template <> void cpu_larf(rocblas_side sideR, rocblas_int m, rocblas_int n, double* x, rocblas_int incx, double* alpha, double* A, rocblas_int lda, double* work) { char side = rocblas2char_side(sideR); dlarf_(&side, &m, &n, x, &incx, alpha, A, &lda, work); } template 
<> void cpu_larf(rocblas_side sideR, rocblas_int m, rocblas_int n, rocblas_float_complex* x, rocblas_int incx, rocblas_float_complex* alpha, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* work) { char side = rocblas2char_side(sideR); clarf_(&side, &m, &n, x, &incx, alpha, A, &lda, work); } template <> void cpu_larf(rocblas_side sideR, rocblas_int m, rocblas_int n, rocblas_double_complex* x, rocblas_int incx, rocblas_double_complex* alpha, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* work) { char side = rocblas2char_side(sideR); zlarf_(&side, &m, &n, x, &incx, alpha, A, &lda, work); } // larft template <> void cpu_larft(rocblas_direct directR, rocblas_storev storevR, rocblas_int n, rocblas_int k, float* V, rocblas_int ldv, float* tau, float* T, rocblas_int ldt) { char direct = rocblas2char_direct(directR); char storev = rocblas2char_storev(storevR); slarft_(&direct, &storev, &n, &k, V, &ldv, tau, T, &ldt); } template <> void cpu_larft(rocblas_direct directR, rocblas_storev storevR, rocblas_int n, rocblas_int k, double* V, rocblas_int ldv, double* tau, double* T, rocblas_int ldt) { char direct = rocblas2char_direct(directR); char storev = rocblas2char_storev(storevR); dlarft_(&direct, &storev, &n, &k, V, &ldv, tau, T, &ldt); } template <> void cpu_larft(rocblas_direct directR, rocblas_storev storevR, rocblas_int n, rocblas_int k, rocblas_float_complex* V, rocblas_int ldv, rocblas_float_complex* tau, rocblas_float_complex* T, rocblas_int ldt) { char direct = rocblas2char_direct(directR); char storev = rocblas2char_storev(storevR); clarft_(&direct, &storev, &n, &k, V, &ldv, tau, T, &ldt); } template <> void cpu_larft(rocblas_direct directR, rocblas_storev storevR, rocblas_int n, rocblas_int k, rocblas_double_complex* V, rocblas_int ldv, rocblas_double_complex* tau, rocblas_double_complex* T, rocblas_int ldt) { char direct = rocblas2char_direct(directR); char storev = rocblas2char_storev(storevR); zlarft_(&direct, &storev, &n, 
&k, V, &ldv, tau, T, &ldt); } // larfb template <> void cpu_larfb(rocblas_side sideR, rocblas_operation transR, rocblas_direct directR, rocblas_storev storevR, rocblas_int m, rocblas_int n, rocblas_int k, float* V, rocblas_int ldv, float* T, rocblas_int ldt, float* A, rocblas_int lda, float* W, rocblas_int ldw) { char side = rocblas2char_side(sideR); char trans = rocblas2char_operation(transR); char direct = rocblas2char_direct(directR); char storev = rocblas2char_storev(storevR); slarfb_(&side, &trans, &direct, &storev, &m, &n, &k, V, &ldv, T, &ldt, A, &lda, W, &ldw); } template <> void cpu_larfb(rocblas_side sideR, rocblas_operation transR, rocblas_direct directR, rocblas_storev storevR, rocblas_int m, rocblas_int n, rocblas_int k, double* V, rocblas_int ldv, double* T, rocblas_int ldt, double* A, rocblas_int lda, double* W, rocblas_int ldw) { char side = rocblas2char_side(sideR); char trans = rocblas2char_operation(transR); char direct = rocblas2char_direct(directR); char storev = rocblas2char_storev(storevR); dlarfb_(&side, &trans, &direct, &storev, &m, &n, &k, V, &ldv, T, &ldt, A, &lda, W, &ldw); } template <> void cpu_larfb(rocblas_side sideR, rocblas_operation transR, rocblas_direct directR, rocblas_storev storevR, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* V, rocblas_int ldv, rocblas_float_complex* T, rocblas_int ldt, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* W, rocblas_int ldw) { char side = rocblas2char_side(sideR); char trans = rocblas2char_operation(transR); char direct = rocblas2char_direct(directR); char storev = rocblas2char_storev(storevR); clarfb_(&side, &trans, &direct, &storev, &m, &n, &k, V, &ldv, T, &ldt, A, &lda, W, &ldw); } template <> void cpu_larfb(rocblas_side sideR, rocblas_operation transR, rocblas_direct directR, rocblas_storev storevR, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* V, rocblas_int ldv, rocblas_double_complex* T, rocblas_int ldt, 
rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* W, rocblas_int ldw) { char side = rocblas2char_side(sideR); char trans = rocblas2char_operation(transR); char direct = rocblas2char_direct(directR); char storev = rocblas2char_storev(storevR); zlarfb_(&side, &trans, &direct, &storev, &m, &n, &k, V, &ldv, T, &ldt, A, &lda, W, &ldw); } // lauum template <> void cpu_lauum(rocblas_fill uploR, rocblas_int n, float* A, rocblas_int lda) { rocblas_int info; char uplo = rocblas2char_fill(uploR); slauum_(&uplo, &n, A, &lda, &info); } template <> void cpu_lauum(rocblas_fill uploR, rocblas_int n, double* A, rocblas_int lda) { rocblas_int info; char uplo = rocblas2char_fill(uploR); dlauum_(&uplo, &n, A, &lda, &info); } template <> void cpu_lauum(rocblas_fill uploR, rocblas_int n, rocblas_float_complex* A, rocblas_int lda) { rocblas_int info; char uplo = rocblas2char_fill(uploR); clauum_(&uplo, &n, A, &lda, &info); } template <> void cpu_lauum(rocblas_fill uploR, rocblas_int n, rocblas_double_complex* A, rocblas_int lda) { rocblas_int info; char uplo = rocblas2char_fill(uploR); zlauum_(&uplo, &n, A, &lda, &info); } // bdsqr template <> void cpu_bdsqr(rocblas_fill uplo, rocblas_int n, rocblas_int nv, rocblas_int nu, rocblas_int nc, float* D, float* E, float* V, rocblas_int ldv, float* U, rocblas_int ldu, float* C, rocblas_int ldc, float* work, rocblas_int* info) { char uploC = (uplo == rocblas_fill_upper) ? 'U' : 'L'; sbdsqr_(&uploC, &n, &nv, &nu, &nc, D, E, V, &ldv, U, &ldu, C, &ldc, work, info); } template <> void cpu_bdsqr(rocblas_fill uplo, rocblas_int n, rocblas_int nv, rocblas_int nu, rocblas_int nc, double* D, double* E, double* V, rocblas_int ldv, double* U, rocblas_int ldu, double* C, rocblas_int ldc, double* work, rocblas_int* info) { char uploC = (uplo == rocblas_fill_upper) ? 
'U' : 'L'; dbdsqr_(&uploC, &n, &nv, &nu, &nc, D, E, V, &ldv, U, &ldu, C, &ldc, work, info); } template <> void cpu_bdsqr(rocblas_fill uplo, rocblas_int n, rocblas_int nv, rocblas_int nu, rocblas_int nc, float* D, float* E, rocblas_float_complex* V, rocblas_int ldv, rocblas_float_complex* U, rocblas_int ldu, rocblas_float_complex* C, rocblas_int ldc, float* work, rocblas_int* info) { char uploC = (uplo == rocblas_fill_upper) ? 'U' : 'L'; cbdsqr_(&uploC, &n, &nv, &nu, &nc, D, E, V, &ldv, U, &ldu, C, &ldc, work, info); } template <> void cpu_bdsqr(rocblas_fill uplo, rocblas_int n, rocblas_int nv, rocblas_int nu, rocblas_int nc, double* D, double* E, rocblas_double_complex* V, rocblas_int ldv, rocblas_double_complex* U, rocblas_int ldu, rocblas_double_complex* C, rocblas_int ldc, double* work, rocblas_int* info) { char uploC = (uplo == rocblas_fill_upper) ? 'U' : 'L'; zbdsqr_(&uploC, &n, &nv, &nu, &nc, D, E, V, &ldv, U, &ldu, C, &ldc, work, info); } // gesvd template <> void cpu_gesvd(rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* S, float* U, rocblas_int ldu, float* V, rocblas_int ldv, float* work, rocblas_int lwork, float* rwork, rocblas_int* info) { char jobu = rocblas2char_svect(leftv); char jobv = rocblas2char_svect(rightv); sgesvd_(&jobu, &jobv, &m, &n, A, &lda, S, U, &ldu, V, &ldv, work, &lwork, info); } template <> void cpu_gesvd(rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* S, double* U, rocblas_int ldu, double* V, rocblas_int ldv, double* work, rocblas_int lwork, double* rwork, rocblas_int* info) { char jobu = rocblas2char_svect(leftv); char jobv = rocblas2char_svect(rightv); dgesvd_(&jobu, &jobv, &m, &n, A, &lda, S, U, &ldu, V, &ldv, work, &lwork, info); } template <> void cpu_gesvd(rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, float* S, rocblas_float_complex* U, 
rocblas_int ldu, rocblas_float_complex* V, rocblas_int ldv, rocblas_float_complex* work, rocblas_int lwork, float* rwork, rocblas_int* info) { char jobu = rocblas2char_svect(leftv); char jobv = rocblas2char_svect(rightv); cgesvd_(&jobu, &jobv, &m, &n, A, &lda, S, U, &ldu, V, &ldv, work, &lwork, rwork, info); } template <> void cpu_gesvd(rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, double* S, rocblas_double_complex* U, rocblas_int ldu, rocblas_double_complex* V, rocblas_int ldv, rocblas_double_complex* work, rocblas_int lwork, double* rwork, rocblas_int* info) { char jobu = rocblas2char_svect(leftv); char jobv = rocblas2char_svect(rightv); zgesvd_(&jobu, &jobv, &m, &n, A, &lda, S, U, &ldu, V, &ldv, work, &lwork, rwork, info); } // gesvdx template <> void cpu_gesvdx(rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float vl, float vu, rocblas_int il, rocblas_int iu, rocblas_int* nsv, float* S, float* U, rocblas_int ldu, float* V, rocblas_int ldv, float* work, rocblas_int lwork, float* rwork, rocblas_int* iwork, rocblas_int* info) { char jobu = rocblas2char_svect(leftv, true); char jobv = rocblas2char_svect(rightv, true); char srangeC = rocblas2char_srange(srange); sgesvdx_(&jobu, &jobv, &srangeC, &m, &n, A, &lda, &vl, &vu, &il, &iu, nsv, S, U, &ldu, V, &ldv, work, &lwork, iwork, info); } template <> void cpu_gesvdx(rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double vl, double vu, rocblas_int il, rocblas_int iu, rocblas_int* nsv, double* S, double* U, rocblas_int ldu, double* V, rocblas_int ldv, double* work, rocblas_int lwork, double* rwork, rocblas_int* iwork, rocblas_int* info) { char jobu = rocblas2char_svect(leftv, true); char jobv = rocblas2char_svect(rightv, true); char srangeC = rocblas2char_srange(srange); dgesvdx_(&jobu, &jobv, &srangeC, 
&m, &n, A, &lda, &vl, &vu, &il, &iu, nsv, S, U, &ldu, V, &ldv, work, &lwork, iwork, info); } template <> void cpu_gesvdx(rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, float vl, float vu, rocblas_int il, rocblas_int iu, rocblas_int* nsv, float* S, rocblas_float_complex* U, rocblas_int ldu, rocblas_float_complex* V, rocblas_int ldv, rocblas_float_complex* work, rocblas_int lwork, float* rwork, rocblas_int* iwork, rocblas_int* info) { char jobu = rocblas2char_svect(leftv, true); char jobv = rocblas2char_svect(rightv, true); char srangeC = rocblas2char_srange(srange); cgesvdx_(&jobu, &jobv, &srangeC, &m, &n, A, &lda, &vl, &vu, &il, &iu, nsv, S, U, &ldu, V, &ldv, work, &lwork, rwork, iwork, info); } template <> void cpu_gesvdx(rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, double vl, double vu, rocblas_int il, rocblas_int iu, rocblas_int* nsv, double* S, rocblas_double_complex* U, rocblas_int ldu, rocblas_double_complex* V, rocblas_int ldv, rocblas_double_complex* work, rocblas_int lwork, double* rwork, rocblas_int* iwork, rocblas_int* info) { char jobu = rocblas2char_svect(leftv, true); char jobv = rocblas2char_svect(rightv, true); char srangeC = rocblas2char_srange(srange); zgesvdx_(&jobu, &jobv, &srangeC, &m, &n, A, &lda, &vl, &vu, &il, &iu, nsv, S, U, &ldu, V, &ldv, work, &lwork, rwork, iwork, info); } // latrd template <> void cpu_latrd(rocblas_fill uplo, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* E, float* tau, float* W, rocblas_int ldw) { char uploC = rocblas2char_fill(uplo); slatrd_(&uploC, &n, &k, A, &lda, E, tau, W, &ldw); } template <> void cpu_latrd(rocblas_fill uplo, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* E, double* tau, double* W, rocblas_int ldw) { char uploC = rocblas2char_fill(uplo); dlatrd_(&uploC, &n, &k, A, &lda, E, tau, 
W, &ldw); } template <> void cpu_latrd(rocblas_fill uplo, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, float* E, rocblas_float_complex* tau, rocblas_float_complex* W, rocblas_int ldw) { char uploC = rocblas2char_fill(uplo); clatrd_(&uploC, &n, &k, A, &lda, E, tau, W, &ldw); } template <> void cpu_latrd(rocblas_fill uplo, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, double* E, rocblas_double_complex* tau, rocblas_double_complex* W, rocblas_int ldw) { char uploC = rocblas2char_fill(uplo); zlatrd_(&uploC, &n, &k, A, &lda, E, tau, W, &ldw); } // labrd template <> void cpu_labrd(rocblas_int m, rocblas_int n, rocblas_int nb, float* A, rocblas_int lda, float* D, float* E, float* tauq, float* taup, float* X, rocblas_int ldx, float* Y, rocblas_int ldy) { slabrd_(&m, &n, &nb, A, &lda, D, E, tauq, taup, X, &ldx, Y, &ldy); } template <> void cpu_labrd(rocblas_int m, rocblas_int n, rocblas_int nb, double* A, rocblas_int lda, double* D, double* E, double* tauq, double* taup, double* X, rocblas_int ldx, double* Y, rocblas_int ldy) { dlabrd_(&m, &n, &nb, A, &lda, D, E, tauq, taup, X, &ldx, Y, &ldy); } template <> void cpu_labrd(rocblas_int m, rocblas_int n, rocblas_int nb, rocblas_float_complex* A, rocblas_int lda, float* D, float* E, rocblas_float_complex* tauq, rocblas_float_complex* taup, rocblas_float_complex* X, rocblas_int ldx, rocblas_float_complex* Y, rocblas_int ldy) { clabrd_(&m, &n, &nb, A, &lda, D, E, tauq, taup, X, &ldx, Y, &ldy); } template <> void cpu_labrd(rocblas_int m, rocblas_int n, rocblas_int nb, rocblas_double_complex* A, rocblas_int lda, double* D, double* E, rocblas_double_complex* tauq, rocblas_double_complex* taup, rocblas_double_complex* X, rocblas_int ldx, rocblas_double_complex* Y, rocblas_int ldy) { zlabrd_(&m, &n, &nb, A, &lda, D, E, tauq, taup, X, &ldx, Y, &ldy); } // orgqr & ungqr template <> void cpu_orgqr_ungqr(rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* 
ipiv, float* work, rocblas_int lwork) { int info; sorgqr_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_orgqr_ungqr(rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* work, rocblas_int lwork) { int info; dorgqr_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_orgqr_ungqr(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, rocblas_int lwork) { int info; cungqr_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_orgqr_ungqr(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, rocblas_int lwork) { int info; zungqr_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } // org2r & ung2r template <> void cpu_org2r_ung2r(rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* work) { int info; sorg2r_(&m, &n, &k, A, &lda, ipiv, work, &info); } template <> void cpu_org2r_ung2r(rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* work) { int info; dorg2r_(&m, &n, &k, A, &lda, ipiv, work, &info); } template <> void cpu_org2r_ung2r(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work) { int info; cung2r_(&m, &n, &k, A, &lda, ipiv, work, &info); } template <> void cpu_org2r_ung2r(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work) { int info; zung2r_(&m, &n, &k, A, &lda, ipiv, work, &info); } // orglq & unglq template <> void cpu_orglq_unglq(rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* work, rocblas_int lwork) { int info; sorglq_(&m, &n, &k, A, &lda, ipiv, work, &lwork, 
&info); } template <> void cpu_orglq_unglq(rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* work, rocblas_int lwork) { int info; dorglq_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_orglq_unglq(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, rocblas_int lwork) { int info; cunglq_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_orglq_unglq(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, rocblas_int lwork) { int info; zunglq_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } // orgl2 & ungl2 template <> void cpu_orgl2_ungl2(rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* work) { int info; sorgl2_(&m, &n, &k, A, &lda, ipiv, work, &info); } template <> void cpu_orgl2_ungl2(rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* work) { int info; dorgl2_(&m, &n, &k, A, &lda, ipiv, work, &info); } template <> void cpu_orgl2_ungl2(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work) { int info; cungl2_(&m, &n, &k, A, &lda, ipiv, work, &info); } template <> void cpu_orgl2_ungl2(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work) { int info; zungl2_(&m, &n, &k, A, &lda, ipiv, work, &info); } // orgql & ungql template <> void cpu_orgql_ungql(rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* work, rocblas_int lwork) { int info; sorgql_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_orgql_ungql(rocblas_int m, rocblas_int n, rocblas_int k, double* A, 
rocblas_int lda, double* ipiv, double* work, rocblas_int lwork) { int info; dorgql_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_orgql_ungql(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, rocblas_int lwork) { int info; cungql_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_orgql_ungql(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, rocblas_int lwork) { int info; zungql_(&m, &n, &k, A, &lda, ipiv, work, &lwork, &info); } // org2l & ung2l template <> void cpu_org2l_ung2l(rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* work) { int info; sorg2l_(&m, &n, &k, A, &lda, ipiv, work, &info); } template <> void cpu_org2l_ung2l(rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* work) { int info; dorg2l_(&m, &n, &k, A, &lda, ipiv, work, &info); } template <> void cpu_org2l_ung2l(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work) { int info; cung2l_(&m, &n, &k, A, &lda, ipiv, work, &info); } template <> void cpu_org2l_ung2l(rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work) { int info; zung2l_(&m, &n, &k, A, &lda, ipiv, work, &info); } // orgbr & ungbr template <> void cpu_orgbr_ungbr(rocblas_storev storev, rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* Ipiv, float* work, rocblas_int size_w) { int info; char vect; if(storev == rocblas_column_wise) vect = 'Q'; else vect = 'P'; sorgbr_(&vect, &m, &n, &k, A, &lda, Ipiv, work, &size_w, &info); } template <> void cpu_orgbr_ungbr(rocblas_storev storev, rocblas_int m, rocblas_int 
n, rocblas_int k, double* A, rocblas_int lda, double* Ipiv, double* work, rocblas_int size_w) { int info; char vect; if(storev == rocblas_column_wise) vect = 'Q'; else vect = 'P'; dorgbr_(&vect, &m, &n, &k, A, &lda, Ipiv, work, &size_w, &info); } template <> void cpu_orgbr_ungbr(rocblas_storev storev, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* Ipiv, rocblas_float_complex* work, rocblas_int size_w) { int info; char vect; if(storev == rocblas_column_wise) vect = 'Q'; else vect = 'P'; cungbr_(&vect, &m, &n, &k, A, &lda, Ipiv, work, &size_w, &info); } template <> void cpu_orgbr_ungbr(rocblas_storev storev, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* Ipiv, rocblas_double_complex* work, rocblas_int size_w) { int info; char vect; if(storev == rocblas_column_wise) vect = 'Q'; else vect = 'P'; zungbr_(&vect, &m, &n, &k, A, &lda, Ipiv, work, &size_w, &info); } // orgtr & ungtr template <> void cpu_orgtr_ungtr(rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float* Ipiv, float* work, rocblas_int size_w) { int info; char uploC = rocblas2char_fill(uplo); sorgtr_(&uploC, &n, A, &lda, Ipiv, work, &size_w, &info); } template <> void cpu_orgtr_ungtr(rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double* Ipiv, double* work, rocblas_int size_w) { int info; char uploC = rocblas2char_fill(uplo); dorgtr_(&uploC, &n, A, &lda, Ipiv, work, &size_w, &info); } template <> void cpu_orgtr_ungtr(rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* Ipiv, rocblas_float_complex* work, rocblas_int size_w) { int info; char uploC = rocblas2char_fill(uplo); cungtr_(&uploC, &n, A, &lda, Ipiv, work, &size_w, &info); } template <> void cpu_orgtr_ungtr(rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* Ipiv, rocblas_double_complex* work, 
rocblas_int size_w) { int info; char uploC = rocblas2char_fill(uplo); zungtr_(&uploC, &n, A, &lda, Ipiv, work, &size_w, &info); } // ormqr & unmqr template <> void cpu_ormqr_unmqr(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* C, rocblas_int ldc, float* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); sormqr_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormqr_unmqr(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* C, rocblas_int ldc, double* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); dormqr_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormqr_unmqr(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, rocblas_int ldc, rocblas_float_complex* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); cunmqr_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormqr_unmqr(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, rocblas_int ldc, rocblas_double_complex* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); zunmqr_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } // orm2r & unm2r template <> void cpu_orm2r_unm2r(rocblas_side side, rocblas_operation trans, rocblas_int m, 
rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* C, rocblas_int ldc, float* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); sorm2r_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } template <> void cpu_orm2r_unm2r(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* C, rocblas_int ldc, double* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); dorm2r_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } template <> void cpu_orm2r_unm2r(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, rocblas_int ldc, rocblas_float_complex* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); cunm2r_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } template <> void cpu_orm2r_unm2r(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, rocblas_int ldc, rocblas_double_complex* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); zunm2r_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } // ormlq & unmlq template <> void cpu_ormlq_unmlq(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* C, rocblas_int ldc, float* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); sormlq_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void 
cpu_ormlq_unmlq(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* C, rocblas_int ldc, double* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); dormlq_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormlq_unmlq(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, rocblas_int ldc, rocblas_float_complex* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); cunmlq_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormlq_unmlq(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, rocblas_int ldc, rocblas_double_complex* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); zunmlq_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } // orml2 & unml2 template <> void cpu_orml2_unml2(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* C, rocblas_int ldc, float* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); sorml2_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } template <> void cpu_orml2_unml2(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* C, rocblas_int ldc, double* work) { int info; char sideC = rocblas2char_side(side); char transC = 
rocblas2char_operation(trans); dorml2_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } template <> void cpu_orml2_unml2(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, rocblas_int ldc, rocblas_float_complex* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); cunml2_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } template <> void cpu_orml2_unml2(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, rocblas_int ldc, rocblas_double_complex* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); zunml2_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } // ormql & unmql template <> void cpu_ormql_unmql(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* C, rocblas_int ldc, float* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); sormql_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormql_unmql(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* C, rocblas_int ldc, double* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); dormql_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormql_unmql(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* 
ipiv, rocblas_float_complex* C, rocblas_int ldc, rocblas_float_complex* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); cunmql_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormql_unmql(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, rocblas_int ldc, rocblas_double_complex* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); zunmql_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } // orm2l & unm2l template <> void cpu_orm2l_unm2l(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* C, rocblas_int ldc, float* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); sorm2l_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } template <> void cpu_orm2l_unm2l(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* C, rocblas_int ldc, double* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); dorm2l_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } template <> void cpu_orm2l_unm2l(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, rocblas_int ldc, rocblas_float_complex* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); cunm2l_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } template <> void cpu_orm2l_unm2l(rocblas_side 
side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, rocblas_int ldc, rocblas_double_complex* work) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); zunm2l_(&sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &info); } // ormbr & unmbr template <> void cpu_ormbr_unmbr(rocblas_storev storev, rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* ipiv, float* C, rocblas_int ldc, float* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); char vect; if(storev == rocblas_column_wise) vect = 'Q'; else vect = 'P'; sormbr_(&vect, &sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormbr_unmbr(rocblas_storev storev, rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* ipiv, double* C, rocblas_int ldc, double* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); char vect; if(storev == rocblas_column_wise) vect = 'Q'; else vect = 'P'; dormbr_(&vect, &sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormbr_unmbr(rocblas_storev storev, rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, rocblas_int ldc, rocblas_float_complex* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); char vect; if(storev == rocblas_column_wise) vect = 'Q'; else vect = 'P'; cunmbr_(&vect, &sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } 
template <> void cpu_ormbr_unmbr(rocblas_storev storev, rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, rocblas_int ldc, rocblas_double_complex* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char transC = rocblas2char_operation(trans); char vect; if(storev == rocblas_column_wise) vect = 'Q'; else vect = 'P'; zunmbr_(&vect, &sideC, &transC, &m, &n, &k, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } // ormtr & unmtr template <> void cpu_ormtr_unmtr(rocblas_side side, rocblas_fill uplo, rocblas_operation trans, rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* ipiv, float* C, rocblas_int ldc, float* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(trans); sormtr_(&sideC, &uploC, &transC, &m, &n, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormtr_unmtr(rocblas_side side, rocblas_fill uplo, rocblas_operation trans, rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* ipiv, double* C, rocblas_int ldc, double* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(trans); dormtr_(&sideC, &uploC, &transC, &m, &n, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } template <> void cpu_ormtr_unmtr(rocblas_side side, rocblas_fill uplo, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* C, rocblas_int ldc, rocblas_float_complex* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(trans); cunmtr_(&sideC, &uploC, &transC, &m, &n, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } 
template <> void cpu_ormtr_unmtr(rocblas_side side, rocblas_fill uplo, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* C, rocblas_int ldc, rocblas_double_complex* work, rocblas_int lwork) { int info; char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(trans); zunmtr_(&sideC, &uploC, &transC, &m, &n, A, &lda, ipiv, C, &ldc, work, &lwork, &info); } // gemv template <> void cpu_gemv(rocblas_operation transA, rocblas_int m, rocblas_int n, float alpha, float* A, rocblas_int lda, float* x, rocblas_int incx, float beta, float* y, rocblas_int incy) { char transAC = rocblas2char_operation(transA); sgemv_(&transAC, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); } template <> void cpu_gemv(rocblas_operation transA, rocblas_int m, rocblas_int n, double alpha, double* A, rocblas_int lda, double* x, rocblas_int incx, double beta, double* y, rocblas_int incy) { char transAC = rocblas2char_operation(transA); dgemv_(&transAC, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); } template <> void cpu_gemv(rocblas_operation transA, rocblas_int m, rocblas_int n, rocblas_float_complex alpha, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* x, rocblas_int incx, rocblas_float_complex beta, rocblas_float_complex* y, rocblas_int incy) { char transAC = rocblas2char_operation(transA); cgemv_(&transAC, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); } template <> void cpu_gemv(rocblas_operation transA, rocblas_int m, rocblas_int n, rocblas_double_complex alpha, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* x, rocblas_int incx, rocblas_double_complex beta, rocblas_double_complex* y, rocblas_int incy) { char transAC = rocblas2char_operation(transA); zgemv_(&transAC, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); } // gemm template <> void cpu_gemm(rocblas_operation transA, 
rocblas_operation transB, rocblas_int m, rocblas_int n, rocblas_int k, float alpha, float* A, rocblas_int lda, float* B, rocblas_int ldb, float beta, float* C, rocblas_int ldc) { char transAC = rocblas2char_operation(transA); char transBC = rocblas2char_operation(transB); sgemm_(&transAC, &transBC, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } template <> void cpu_gemm(rocblas_operation transA, rocblas_operation transB, rocblas_int m, rocblas_int n, rocblas_int k, double alpha, double* A, rocblas_int lda, double* B, rocblas_int ldb, double beta, double* C, rocblas_int ldc) { char transAC = rocblas2char_operation(transA); char transBC = rocblas2char_operation(transB); dgemm_(&transAC, &transBC, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } template <> void cpu_gemm(rocblas_operation transA, rocblas_operation transB, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex alpha, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb, rocblas_float_complex beta, rocblas_float_complex* C, rocblas_int ldc) { char transAC = rocblas2char_operation(transA); char transBC = rocblas2char_operation(transB); cgemm_(&transAC, &transBC, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } template <> void cpu_gemm(rocblas_operation transA, rocblas_operation transB, rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex alpha, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb, rocblas_double_complex beta, rocblas_double_complex* C, rocblas_int ldc) { char transAC = rocblas2char_operation(transA); char transBC = rocblas2char_operation(transB); zgemm_(&transAC, &transBC, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } // symv & hemv template <> void cpu_symv_hemv(rocblas_fill uplo, rocblas_int n, float alpha, float* A, rocblas_int lda, float* x, rocblas_int incx, float beta, float* y, rocblas_int incy) { char uploC = rocblas2char_fill(uplo); ssymv_(&uploC, 
&n, &alpha, A, &lda, x, &incx, &beta, y, &incy); } template <> void cpu_symv_hemv(rocblas_fill uplo, rocblas_int n, double alpha, double* A, rocblas_int lda, double* x, rocblas_int incx, double beta, double* y, rocblas_int incy) { char uploC = rocblas2char_fill(uplo); dsymv_(&uploC, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); } template <> void cpu_symv_hemv(rocblas_fill uplo, rocblas_int n, rocblas_float_complex alpha, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* x, rocblas_int incx, rocblas_float_complex beta, rocblas_float_complex* y, rocblas_int incy) { char uploC = rocblas2char_fill(uplo); chemv_(&uploC, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); } template <> void cpu_symv_hemv(rocblas_fill uplo, rocblas_int n, rocblas_double_complex alpha, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* x, rocblas_int incx, rocblas_double_complex beta, rocblas_double_complex* y, rocblas_int incy) { char uploC = rocblas2char_fill(uplo); zhemv_(&uploC, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); } // symm & hemm template <> void cpu_symm_hemm(rocblas_side side, rocblas_fill uplo, rocblas_int m, rocblas_int n, float alpha, float* A, rocblas_int lda, float* B, rocblas_int ldb, float beta, float* C, rocblas_int ldc) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); ssymm_(&sideC, &uploC, &m, &n, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } template <> void cpu_symm_hemm(rocblas_side side, rocblas_fill uplo, rocblas_int m, rocblas_int n, double alpha, double* A, rocblas_int lda, double* B, rocblas_int ldb, double beta, double* C, rocblas_int ldc) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); dsymm_(&sideC, &uploC, &m, &n, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } template <> void cpu_symm_hemm(rocblas_side side, rocblas_fill uplo, rocblas_int m, rocblas_int n, rocblas_float_complex alpha, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, 
rocblas_int ldb, rocblas_float_complex beta, rocblas_float_complex* C, rocblas_int ldc) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); chemm_(&sideC, &uploC, &m, &n, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } template <> void cpu_symm_hemm(rocblas_side side, rocblas_fill uplo, rocblas_int m, rocblas_int n, rocblas_double_complex alpha, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb, rocblas_double_complex beta, rocblas_double_complex* C, rocblas_int ldc) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); zhemm_(&sideC, &uploC, &m, &n, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } // trsm template <> void cpu_trsm(rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, float alpha, float* A, rocblas_int lda, float* B, rocblas_int ldb) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = rocblas2char_diagonal(diag); strsm_(&sideC, &uploC, &transC, &diagC, &m, &n, &alpha, A, &lda, B, &ldb); } template <> void cpu_trsm(rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, double alpha, double* A, rocblas_int lda, double* B, rocblas_int ldb) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = rocblas2char_diagonal(diag); dtrsm_(&sideC, &uploC, &transC, &diagC, &m, &n, &alpha, A, &lda, B, &ldb); } template <> void cpu_trsm(rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, rocblas_float_complex alpha, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = 
rocblas2char_diagonal(diag); ctrsm_(&sideC, &uploC, &transC, &diagC, &m, &n, &alpha, A, &lda, B, &ldb); } template <> void cpu_trsm(rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, rocblas_double_complex alpha, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = rocblas2char_diagonal(diag); ztrsm_(&sideC, &uploC, &transC, &diagC, &m, &n, &alpha, A, &lda, B, &ldb); } /* // trsv template <> void cpu_trsv(rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int n, float* A, rocblas_int lda, float* x, rocblas_int incx) { char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = rocblas2char_diagonal(diag); strsv_(&uploC, &transC, &diagC, &n, A, &lda, x, &incx); } template <> void cpu_trsv(rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int n, double* A, rocblas_int lda, double* x, rocblas_int incx) { char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = rocblas2char_diagonal(diag); dtrsv_(&uploC, &transC, &diagC, &n, A, &lda, x, &incx); } template <> void cpu_trsv(rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* x, rocblas_int incx) { char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = rocblas2char_diagonal(diag); ctrsv_(&uploC, &transC, &diagC, &n, A, &lda, x, &incx); } template <> void cpu_trsv(rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* x, rocblas_int incx) { char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC 
= rocblas2char_diagonal(diag); ztrsv_(&uploC, &transC, &diagC, &n, A, &lda, x, &incx); } */ // trmm template <> void cpu_trmm(rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, float alpha, float* A, rocblas_int lda, float* B, rocblas_int ldb) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = rocblas2char_diagonal(diag); strmm_(&sideC, &uploC, &transC, &diagC, &m, &n, &alpha, A, &lda, B, &ldb); } template <> void cpu_trmm(rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, double alpha, double* A, rocblas_int lda, double* B, rocblas_int ldb) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = rocblas2char_diagonal(diag); dtrmm_(&sideC, &uploC, &transC, &diagC, &m, &n, &alpha, A, &lda, B, &ldb); } template <> void cpu_trmm(rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, rocblas_float_complex alpha, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = rocblas2char_diagonal(diag); ctrmm_(&sideC, &uploC, &transC, &diagC, &m, &n, &alpha, A, &lda, B, &ldb); } template <> void cpu_trmm(rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, rocblas_double_complex alpha, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb) { char sideC = rocblas2char_side(side); char uploC = rocblas2char_fill(uplo); char transC = rocblas2char_operation(transA); char diagC = rocblas2char_diagonal(diag); ztrmm_(&sideC, &uploC, &transC, &diagC, &m, &n, &alpha, A, 
&lda, B, &ldb); } // potf2 template <> void cpu_potf2(rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); spotf2_(&uploC, &n, A, &lda, info); } template <> void cpu_potf2(rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); dpotf2_(&uploC, &n, A, &lda, info); } template <> void cpu_potf2(rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); cpotf2_(&uploC, &n, A, &lda, info); } template <> void cpu_potf2(rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); zpotf2_(&uploC, &n, A, &lda, info); } // potrf template <> void cpu_potrf(rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); spotrf_(&uploC, &n, A, &lda, info); } template <> void cpu_potrf(rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); dpotrf_(&uploC, &n, A, &lda, info); } template <> void cpu_potrf(rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); cpotrf_(&uploC, &n, A, &lda, info); } template <> void cpu_potrf(rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); zpotrf_(&uploC, &n, A, &lda, info); } // potrs template <> void cpu_potrs(rocblas_fill uplo, rocblas_int n, rocblas_int nrhs, float* A, rocblas_int lda, float* B, rocblas_int ldb) { int info; char uploC = rocblas2char_fill(uplo); spotrs_(&uploC, &n, &nrhs, A, &lda, B, &ldb, &info); } template <> void cpu_potrs(rocblas_fill uplo, rocblas_int n, rocblas_int nrhs, double* A, rocblas_int lda, double* B, rocblas_int ldb) { int info; char uploC = 
rocblas2char_fill(uplo); dpotrs_(&uploC, &n, &nrhs, A, &lda, B, &ldb, &info); } template <> void cpu_potrs(rocblas_fill uplo, rocblas_int n, rocblas_int nrhs, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb) { int info; char uploC = rocblas2char_fill(uplo); cpotrs_(&uploC, &n, &nrhs, A, &lda, B, &ldb, &info); } template <> void cpu_potrs(rocblas_fill uplo, rocblas_int n, rocblas_int nrhs, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb) { int info; char uploC = rocblas2char_fill(uplo); zpotrs_(&uploC, &n, &nrhs, A, &lda, B, &ldb, &info); } // posv template <> void cpu_posv(rocblas_fill uplo, rocblas_int n, rocblas_int nrhs, float* A, rocblas_int lda, float* B, rocblas_int ldb, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); sposv_(&uploC, &n, &nrhs, A, &lda, B, &ldb, info); } template <> void cpu_posv(rocblas_fill uplo, rocblas_int n, rocblas_int nrhs, double* A, rocblas_int lda, double* B, rocblas_int ldb, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); dposv_(&uploC, &n, &nrhs, A, &lda, B, &ldb, info); } template <> void cpu_posv(rocblas_fill uplo, rocblas_int n, rocblas_int nrhs, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); cposv_(&uploC, &n, &nrhs, A, &lda, B, &ldb, info); } template <> void cpu_posv(rocblas_fill uplo, rocblas_int n, rocblas_int nrhs, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); zposv_(&uploC, &n, &nrhs, A, &lda, B, &ldb, info); } // potri template <> void cpu_potri(rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); spotri_(&uploC, &n, A, &lda, info); } template <> void cpu_potri(rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_int* info) { char uploC = 
rocblas2char_fill(uplo); dpotri_(&uploC, &n, A, &lda, info); } template <> void cpu_potri(rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); cpotri_(&uploC, &n, A, &lda, info); } template <> void cpu_potri(rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); zpotri_(&uploC, &n, A, &lda, info); } // getf2 template <> void cpu_getf2(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { sgetf2_(&m, &n, A, &lda, ipiv, info); } template <> void cpu_getf2(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { dgetf2_(&m, &n, A, &lda, ipiv, info); } template <> void cpu_getf2(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { cgetf2_(&m, &n, A, &lda, ipiv, info); } template <> void cpu_getf2(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { zgetf2_(&m, &n, A, &lda, ipiv, info); } // getrf template <> void cpu_getrf(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { sgetrf_(&m, &n, A, &lda, ipiv, info); } template <> void cpu_getrf(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { dgetrf_(&m, &n, A, &lda, ipiv, info); } template <> void cpu_getrf(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { cgetrf_(&m, &n, A, &lda, ipiv, info); } template <> void cpu_getrf(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { zgetrf_(&m, &n, A, &lda, ipiv, info); } // getrs template <> void cpu_getrs(rocblas_operation trans, rocblas_int n, rocblas_int nrhs, float* A, rocblas_int lda, rocblas_int* 
ipiv, float* B, rocblas_int ldb) { rocblas_int info; char transC = rocblas2char_operation(trans); sgetrs_(&transC, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); } template <> void cpu_getrs(rocblas_operation trans, rocblas_int n, rocblas_int nrhs, double* A, rocblas_int lda, rocblas_int* ipiv, double* B, rocblas_int ldb) { rocblas_int info; char transC = rocblas2char_operation(trans); dgetrs_(&transC, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); } template <> void cpu_getrs(rocblas_operation trans, rocblas_int n, rocblas_int nrhs, rocblas_float_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_float_complex* B, rocblas_int ldb) { rocblas_int info; char transC = rocblas2char_operation(trans); cgetrs_(&transC, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); } template <> void cpu_getrs(rocblas_operation trans, rocblas_int n, rocblas_int nrhs, rocblas_double_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_double_complex* B, rocblas_int ldb) { rocblas_int info; char transC = rocblas2char_operation(trans); zgetrs_(&transC, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info); } // gesv template <> void cpu_gesv(rocblas_int n, rocblas_int nrhs, float* A, rocblas_int lda, rocblas_int* ipiv, float* B, rocblas_int ldb, rocblas_int* info) { sgesv_(&n, &nrhs, A, &lda, ipiv, B, &ldb, info); } template <> void cpu_gesv(rocblas_int n, rocblas_int nrhs, double* A, rocblas_int lda, rocblas_int* ipiv, double* B, rocblas_int ldb, rocblas_int* info) { dgesv_(&n, &nrhs, A, &lda, ipiv, B, &ldb, info); } template <> void cpu_gesv(rocblas_int n, rocblas_int nrhs, rocblas_float_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_float_complex* B, rocblas_int ldb, rocblas_int* info) { cgesv_(&n, &nrhs, A, &lda, ipiv, B, &ldb, info); } template <> void cpu_gesv(rocblas_int n, rocblas_int nrhs, rocblas_double_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_double_complex* B, rocblas_int ldb, rocblas_int* info) { zgesv_(&n, &nrhs, A, &lda, ipiv, B, &ldb, info); } // gels template <> void 
cpu_gels(rocblas_operation transR, rocblas_int m, rocblas_int n, rocblas_int nrhs, float* A, rocblas_int lda, float* B, rocblas_int ldb, float* work, rocblas_int lwork, rocblas_int* info) { char trans = rocblas2char_operation(transR); sgels_(&trans, &m, &n, &nrhs, A, &lda, B, &ldb, work, &lwork, info); } template <> void cpu_gels(rocblas_operation transR, rocblas_int m, rocblas_int n, rocblas_int nrhs, double* A, rocblas_int lda, double* B, rocblas_int ldb, double* work, rocblas_int lwork, rocblas_int* info) { char trans = rocblas2char_operation(transR); dgels_(&trans, &m, &n, &nrhs, A, &lda, B, &ldb, work, &lwork, info); } template <> void cpu_gels(rocblas_operation transR, rocblas_int m, rocblas_int n, rocblas_int nrhs, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb, rocblas_float_complex* work, rocblas_int lwork, rocblas_int* info) { char trans = rocblas2char_operation(transR); cgels_(&trans, &m, &n, &nrhs, A, &lda, B, &ldb, work, &lwork, info); } template <> void cpu_gels(rocblas_operation transR, rocblas_int m, rocblas_int n, rocblas_int nrhs, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb, rocblas_double_complex* work, rocblas_int lwork, rocblas_int* info) { char trans = rocblas2char_operation(transR); zgels_(&trans, &m, &n, &nrhs, A, &lda, B, &ldb, work, &lwork, info); } // trtri template <> void cpu_trtri(rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, float* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); char diagC = rocblas2char_diagonal(diag); strtri_(&uploC, &diagC, &n, A, &lda, info); } template <> void cpu_trtri(rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, double* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); char diagC = rocblas2char_diagonal(diag); dtrtri_(&uploC, &diagC, &n, A, &lda, info); } template <> void cpu_trtri(rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, 
rocblas_float_complex* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); char diagC = rocblas2char_diagonal(diag); ctrtri_(&uploC, &diagC, &n, A, &lda, info); } template <> void cpu_trtri(rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); char diagC = rocblas2char_diagonal(diag); ztrtri_(&uploC, &diagC, &n, A, &lda, info); } // getri template <> void cpu_getri(rocblas_int n, float* A, rocblas_int lda, rocblas_int* ipiv, float* work, rocblas_int lwork, rocblas_int* info) { sgetri_(&n, A, &lda, ipiv, work, &lwork, info); } template <> void cpu_getri(rocblas_int n, double* A, rocblas_int lda, rocblas_int* ipiv, double* work, rocblas_int lwork, rocblas_int* info) { dgetri_(&n, A, &lda, ipiv, work, &lwork, info); } template <> void cpu_getri(rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_float_complex* work, rocblas_int lwork, rocblas_int* info) { cgetri_(&n, A, &lda, ipiv, work, &lwork, info); } template <> void cpu_getri(rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_double_complex* work, rocblas_int lwork, rocblas_int* info) { zgetri_(&n, A, &lda, ipiv, work, &lwork, info); } // geqrf template <> void cpu_geqrf(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* ipiv, float* work, rocblas_int lwork) { int info; sgeqrf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_geqrf(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* ipiv, double* work, rocblas_int lwork) { int info; dgeqrf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_geqrf(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, rocblas_int lwork) { int info; cgeqrf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_geqrf(rocblas_int m, 
rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, rocblas_int lwork) { int info; zgeqrf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } // geqr2 template <> void cpu_geqr2(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* ipiv, float* work) { int info; sgeqr2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_geqr2(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* ipiv, double* work) { int info; dgeqr2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_geqr2(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work) { int info; cgeqr2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_geqr2(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work) { int info; zgeqr2_(&m, &n, A, &lda, ipiv, work, &info); } // gerqf template <> void cpu_gerqf(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* ipiv, float* work, rocblas_int lwork) { int info; sgerqf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_gerqf(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* ipiv, double* work, rocblas_int lwork) { int info; dgerqf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_gerqf(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, rocblas_int lwork) { int info; cgerqf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_gerqf(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, rocblas_int lwork) { int info; zgerqf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } // gerq2 template <> void cpu_gerq2(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* ipiv, float* 
work) { int info; sgerq2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_gerq2(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* ipiv, double* work) { int info; dgerq2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_gerq2(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work) { int info; cgerq2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_gerq2(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work) { int info; zgerq2_(&m, &n, A, &lda, ipiv, work, &info); } // geqlf template <> void cpu_geqlf(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* ipiv, float* work, rocblas_int lwork) { int info; sgeqlf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_geqlf(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* ipiv, double* work, rocblas_int lwork) { int info; dgeqlf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_geqlf(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, rocblas_int lwork) { int info; cgeqlf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_geqlf(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, rocblas_int lwork) { int info; zgeqlf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } // geql2 template <> void cpu_geql2(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* ipiv, float* work) { int info; sgeql2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_geql2(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* ipiv, double* work) { int info; dgeql2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_geql2(rocblas_int m, rocblas_int n, rocblas_float_complex* 
A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work) { int info; cgeql2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_geql2(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work) { int info; zgeql2_(&m, &n, A, &lda, ipiv, work, &info); } // gelqf template <> void cpu_gelqf(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* ipiv, float* work, rocblas_int lwork) { int info; sgelqf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_gelqf(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* ipiv, double* work, rocblas_int lwork) { int info; dgelqf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_gelqf(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work, rocblas_int lwork) { int info; cgelqf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } template <> void cpu_gelqf(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work, rocblas_int lwork) { int info; zgelqf_(&m, &n, A, &lda, ipiv, work, &lwork, &info); } // gelq2 template <> void cpu_gelq2(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* ipiv, float* work) { int info; sgelq2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_gelq2(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* ipiv, double* work) { int info; dgelq2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_gelq2(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* ipiv, rocblas_float_complex* work) { int info; cgelq2_(&m, &n, A, &lda, ipiv, work, &info); } template <> void cpu_gelq2(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* ipiv, rocblas_double_complex* work) { int info; 
zgelq2_(&m, &n, A, &lda, ipiv, work, &info); } // gebd2 template <> void cpu_gebd2(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* D, float* E, float* tauq, float* taup, float* work) { int info; sgebd2_(&m, &n, A, &lda, D, E, tauq, taup, work, &info); } template <> void cpu_gebd2(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* D, double* E, double* tauq, double* taup, double* work) { int info; dgebd2_(&m, &n, A, &lda, D, E, tauq, taup, work, &info); } template <> void cpu_gebd2(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, float* D, float* E, rocblas_float_complex* tauq, rocblas_float_complex* taup, rocblas_float_complex* work) { int info; cgebd2_(&m, &n, A, &lda, D, E, tauq, taup, work, &info); } template <> void cpu_gebd2(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, double* D, double* E, rocblas_double_complex* tauq, rocblas_double_complex* taup, rocblas_double_complex* work) { int info; zgebd2_(&m, &n, A, &lda, D, E, tauq, taup, work, &info); } // gebrd template <> void cpu_gebrd(rocblas_int m, rocblas_int n, float* A, rocblas_int lda, float* D, float* E, float* tauq, float* taup, float* work, rocblas_int size_w) { int info; sgebrd_(&m, &n, A, &lda, D, E, tauq, taup, work, &size_w, &info); } template <> void cpu_gebrd(rocblas_int m, rocblas_int n, double* A, rocblas_int lda, double* D, double* E, double* tauq, double* taup, double* work, rocblas_int size_w) { int info; dgebrd_(&m, &n, A, &lda, D, E, tauq, taup, work, &size_w, &info); } template <> void cpu_gebrd(rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, float* D, float* E, rocblas_float_complex* tauq, rocblas_float_complex* taup, rocblas_float_complex* work, rocblas_int size_w) { int info; cgebrd_(&m, &n, A, &lda, D, E, tauq, taup, work, &size_w, &info); } template <> void cpu_gebrd(rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, double* D, double* E, 
rocblas_double_complex* tauq, rocblas_double_complex* taup, rocblas_double_complex* work, rocblas_int size_w) { int info; zgebrd_(&m, &n, A, &lda, D, E, tauq, taup, work, &size_w, &info); } // sytrd & hetrd template <> void cpu_sytrd_hetrd(rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float* D, float* E, float* tau, float* work, rocblas_int size_w) { int info; char uploC = rocblas2char_fill(uplo); ssytrd_(&uploC, &n, A, &lda, D, E, tau, work, &size_w, &info); } template <> void cpu_sytrd_hetrd(rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double* D, double* E, double* tau, double* work, rocblas_int size_w) { int info; char uploC = rocblas2char_fill(uplo); dsytrd_(&uploC, &n, A, &lda, D, E, tau, work, &size_w, &info); } template <> void cpu_sytrd_hetrd(rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, float* D, float* E, rocblas_float_complex* tau, rocblas_float_complex* work, rocblas_int size_w) { int info; char uploC = rocblas2char_fill(uplo); chetrd_(&uploC, &n, A, &lda, D, E, tau, work, &size_w, &info); } template <> void cpu_sytrd_hetrd(rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, double* D, double* E, rocblas_double_complex* tau, rocblas_double_complex* work, rocblas_int size_w) { int info; char uploC = rocblas2char_fill(uplo); zhetrd_(&uploC, &n, A, &lda, D, E, tau, work, &size_w, &info); } // sytd2 & hetd2 template <> void cpu_sytd2_hetd2(rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float* D, float* E, float* tau) { int info; char uploC = rocblas2char_fill(uplo); ssytd2_(&uploC, &n, A, &lda, D, E, tau, &info); } template <> void cpu_sytd2_hetd2(rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double* D, double* E, double* tau) { int info; char uploC = rocblas2char_fill(uplo); dsytd2_(&uploC, &n, A, &lda, D, E, tau, &info); } template <> void cpu_sytd2_hetd2(rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, float* 
D, float* E, rocblas_float_complex* tau) { int info; char uploC = rocblas2char_fill(uplo); chetd2_(&uploC, &n, A, &lda, D, E, tau, &info); } template <> void cpu_sytd2_hetd2(rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, double* D, double* E, rocblas_double_complex* tau) { int info; char uploC = rocblas2char_fill(uplo); zhetd2_(&uploC, &n, A, &lda, D, E, tau, &info); } // sterf template <> void cpu_sterf(rocblas_int n, float* D, float* E) { int info; ssterf_(&n, D, E, &info); } template <> void cpu_sterf(rocblas_int n, double* D, double* E) { int info; dsterf_(&n, D, E, &info); } // steqr template <> void cpu_steqr(rocblas_evect evect, rocblas_int n, float* D, float* E, float* C, rocblas_int ldc, float* work, rocblas_int* info) { char evectC = rocblas2char_evect(evect); ssteqr_(&evectC, &n, D, E, C, &ldc, work, info); } template <> void cpu_steqr(rocblas_evect evect, rocblas_int n, double* D, double* E, double* C, rocblas_int ldc, double* work, rocblas_int* info) { char evectC = rocblas2char_evect(evect); dsteqr_(&evectC, &n, D, E, C, &ldc, work, info); } template <> void cpu_steqr(rocblas_evect evect, rocblas_int n, float* D, float* E, rocblas_float_complex* C, rocblas_int ldc, float* work, rocblas_int* info) { char evectC = rocblas2char_evect(evect); csteqr_(&evectC, &n, D, E, C, &ldc, work, info); } template <> void cpu_steqr(rocblas_evect evect, rocblas_int n, double* D, double* E, rocblas_double_complex* C, rocblas_int ldc, double* work, rocblas_int* info) { char evectC = rocblas2char_evect(evect); zsteqr_(&evectC, &n, D, E, C, &ldc, work, info); } // stedc template <> void cpu_stedc(rocblas_evect evect, rocblas_int n, float* D, float* E, float* C, rocblas_int ldc, float* work, rocblas_int lwork, float* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); sstedc_(&evectC, &n, D, E, C, &ldc, rwork, &lrwork, iwork, &liwork, info); } template <> void 
cpu_stedc(rocblas_evect evect, rocblas_int n, double* D, double* E, double* C, rocblas_int ldc, double* work, rocblas_int lwork, double* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); dstedc_(&evectC, &n, D, E, C, &ldc, rwork, &lrwork, iwork, &liwork, info); } template <> void cpu_stedc(rocblas_evect evect, rocblas_int n, float* D, float* E, rocblas_float_complex* C, rocblas_int ldc, rocblas_float_complex* work, rocblas_int lwork, float* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); cstedc_(&evectC, &n, D, E, C, &ldc, work, &lwork, rwork, &lrwork, iwork, &liwork, info); } template <> void cpu_stedc(rocblas_evect evect, rocblas_int n, double* D, double* E, rocblas_double_complex* C, rocblas_int ldc, rocblas_double_complex* work, rocblas_int lwork, double* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); zstedc_(&evectC, &n, D, E, C, &ldc, work, &lwork, rwork, &lrwork, iwork, &liwork, info); } // stebz template <> void cpu_stebz(rocblas_erange erange, rocblas_eorder eorder, rocblas_int n, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, float* D, float* E, rocblas_int* m, rocblas_int* nsplit, float* W, rocblas_int* iblock, rocblas_int* isplit, float* work, rocblas_int* iwork, rocblas_int* info) { char erangeC = rocblas2char_erange(erange); char eorderC = rocblas2char_eorder(eorder); sstebz_(&erangeC, &eorderC, &n, &vl, &vu, &il, &iu, &abstol, D, E, m, nsplit, W, iblock, isplit, work, iwork, info); } template <> void cpu_stebz(rocblas_erange erange, rocblas_eorder eorder, rocblas_int n, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, double* D, double* E, rocblas_int* m, rocblas_int* nsplit, double* W, rocblas_int* iblock, rocblas_int* isplit, double* work, rocblas_int* iwork, rocblas_int* 
info) { char erangeC = rocblas2char_erange(erange); char eorderC = rocblas2char_eorder(eorder); dstebz_(&erangeC, &eorderC, &n, &vl, &vu, &il, &iu, &abstol, D, E, m, nsplit, W, iblock, isplit, work, iwork, info); } // stein template <> void cpu_stein(rocblas_int n, float* D, float* E, rocblas_int* nev, float* W, rocblas_int* iblock, rocblas_int* isplit, float* Z, rocblas_int ldz, float* work, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { sstein_(&n, D, E, nev, W, iblock, isplit, Z, &ldz, work, iwork, ifail, info); } template <> void cpu_stein(rocblas_int n, double* D, double* E, rocblas_int* nev, double* W, rocblas_int* iblock, rocblas_int* isplit, double* Z, rocblas_int ldz, double* work, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { dstein_(&n, D, E, nev, W, iblock, isplit, Z, &ldz, work, iwork, ifail, info); } template <> void cpu_stein(rocblas_int n, float* D, float* E, rocblas_int* nev, float* W, rocblas_int* iblock, rocblas_int* isplit, rocblas_float_complex* Z, rocblas_int ldz, float* work, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { cstein_(&n, D, E, nev, W, iblock, isplit, Z, &ldz, work, iwork, ifail, info); } template <> void cpu_stein(rocblas_int n, double* D, double* E, rocblas_int* nev, double* W, rocblas_int* iblock, rocblas_int* isplit, rocblas_double_complex* Z, rocblas_int ldz, double* work, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { zstein_(&n, D, E, nev, W, iblock, isplit, Z, &ldz, work, iwork, ifail, info); } // sygs2 & hegs2 template <> void cpu_sygs2_hegs2(rocblas_eform itype, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float* B, rocblas_int ldb) { rocblas_int info; int itypeI = rocblas2char_eform(itype) - '0'; char uploC = rocblas2char_fill(uplo); ssygs2_(&itypeI, &uploC, &n, A, &lda, B, &ldb, &info); } template <> void cpu_sygs2_hegs2(rocblas_eform itype, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double* B, rocblas_int ldb) { rocblas_int 
info; int itypeI = rocblas2char_eform(itype) - '0'; char uploC = rocblas2char_fill(uplo); dsygs2_(&itypeI, &uploC, &n, A, &lda, B, &ldb, &info); } template <> void cpu_sygs2_hegs2(rocblas_eform itype, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb) { rocblas_int info; int itypeI = rocblas2char_eform(itype) - '0'; char uploC = rocblas2char_fill(uplo); chegs2_(&itypeI, &uploC, &n, A, &lda, B, &ldb, &info); } template <> void cpu_sygs2_hegs2(rocblas_eform itype, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb) { rocblas_int info; int itypeI = rocblas2char_eform(itype) - '0'; char uploC = rocblas2char_fill(uplo); zhegs2_(&itypeI, &uploC, &n, A, &lda, B, &ldb, &info); } // sygst & hegst template <> void cpu_sygst_hegst(rocblas_eform itype, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float* B, rocblas_int ldb) { rocblas_int info; int itypeI = rocblas2char_eform(itype) - '0'; char uploC = rocblas2char_fill(uplo); ssygst_(&itypeI, &uploC, &n, A, &lda, B, &ldb, &info); } template <> void cpu_sygst_hegst(rocblas_eform itype, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double* B, rocblas_int ldb) { rocblas_int info; int itypeI = rocblas2char_eform(itype) - '0'; char uploC = rocblas2char_fill(uplo); dsygst_(&itypeI, &uploC, &n, A, &lda, B, &ldb, &info); } template <> void cpu_sygst_hegst(rocblas_eform itype, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb) { rocblas_int info; int itypeI = rocblas2char_eform(itype) - '0'; char uploC = rocblas2char_fill(uplo); chegst_(&itypeI, &uploC, &n, A, &lda, B, &ldb, &info); } template <> void cpu_sygst_hegst(rocblas_eform itype, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb) { rocblas_int info; int itypeI = 
rocblas2char_eform(itype) - '0'; char uploC = rocblas2char_fill(uplo); zhegst_(&itypeI, &uploC, &n, A, &lda, B, &ldb, &info); } // syev & heev template <> void cpu_syev_heev(rocblas_evect evect, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float* W, float* work, rocblas_int lwork, float* rwork, rocblas_int lrwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); ssyev_(&evectC, &uploC, &n, A, &lda, W, rwork, &lrwork, info); } template <> void cpu_syev_heev(rocblas_evect evect, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double* W, double* work, rocblas_int lwork, double* rwork, rocblas_int lrwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); dsyev_(&evectC, &uploC, &n, A, &lda, W, rwork, &lrwork, info); } template <> void cpu_syev_heev(rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, float* W, rocblas_float_complex* work, rocblas_int lwork, float* rwork, rocblas_int lrwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); cheev_(&evectC, &uploC, &n, A, &lda, W, work, &lwork, rwork, info); } template <> void cpu_syev_heev(rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, double* W, rocblas_double_complex* work, rocblas_int lwork, double* rwork, rocblas_int lrwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); zheev_(&evectC, &uploC, &n, A, &lda, W, work, &lwork, rwork, info); } // syevd & heevd template <> void cpu_syevd_heevd(rocblas_evect evect, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float* W, float* work, rocblas_int lwork, float* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); 
ssyevd_(&evectC, &uploC, &n, A, &lda, W, rwork, &lrwork, iwork, &liwork, info); } template <> void cpu_syevd_heevd(rocblas_evect evect, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double* W, double* work, rocblas_int lwork, double* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); dsyevd_(&evectC, &uploC, &n, A, &lda, W, rwork, &lrwork, iwork, &liwork, info); } template <> void cpu_syevd_heevd(rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, float* W, rocblas_float_complex* work, rocblas_int lwork, float* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); cheevd_(&evectC, &uploC, &n, A, &lda, W, work, &lwork, rwork, &lrwork, iwork, &liwork, info); } template <> void cpu_syevd_heevd(rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, double* W, rocblas_double_complex* work, rocblas_int lwork, double* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); zheevd_(&evectC, &uploC, &n, A, &lda, W, work, &lwork, rwork, &lrwork, iwork, &liwork, info); } // syevx & heevx template <> void cpu_syevx_heevx(rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, float* Z, rocblas_int ldz, float* work, rocblas_int lwork, float* rwork, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char erangeC = rocblas2char_erange(erange); char uploC = rocblas2char_fill(uplo); ssyevx_(&evectC, &erangeC, &uploC, &n, A, &lda, &vl, &vu, &il, &iu, 
&abstol, nev, W, Z, &ldz, work, &lwork, iwork, ifail, info); } template <> void cpu_syevx_heevx(rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, double* Z, rocblas_int ldz, double* work, rocblas_int lwork, double* rwork, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char erangeC = rocblas2char_erange(erange); char uploC = rocblas2char_fill(uplo); dsyevx_(&evectC, &erangeC, &uploC, &n, A, &lda, &vl, &vu, &il, &iu, &abstol, nev, W, Z, &ldz, work, &lwork, iwork, ifail, info); } template <> void cpu_syevx_heevx(rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, rocblas_float_complex* Z, rocblas_int ldz, rocblas_float_complex* work, rocblas_int lwork, float* rwork, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char erangeC = rocblas2char_erange(erange); char uploC = rocblas2char_fill(uplo); cheevx_(&evectC, &erangeC, &uploC, &n, A, &lda, &vl, &vu, &il, &iu, &abstol, nev, W, Z, &ldz, work, &lwork, rwork, iwork, ifail, info); } template <> void cpu_syevx_heevx(rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_double_complex* Z, rocblas_int ldz, rocblas_double_complex* work, rocblas_int lwork, double* rwork, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { char evectC = rocblas2char_evect(evect); char erangeC = rocblas2char_erange(erange); char uploC = rocblas2char_fill(uplo); zheevx_(&evectC, &erangeC, &uploC, &n, A, &lda, &vl, &vu, &il, &iu, &abstol, nev, W, Z, 
&ldz, work, &lwork, rwork, iwork, ifail, info); } // sygv & hegv template <> void cpu_sygv_hegv(rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float* B, rocblas_int ldb, float* W, float* work, rocblas_int lwork, float* rwork, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); ssygv_(&itypeI, &evectC, &uploC, &n, A, &lda, B, &ldb, W, work, &lwork, info); } template <> void cpu_sygv_hegv(rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double* B, rocblas_int ldb, double* W, double* work, rocblas_int lwork, double* rwork, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); dsygv_(&itypeI, &evectC, &uploC, &n, A, &lda, B, &ldb, W, work, &lwork, info); } template <> void cpu_sygv_hegv(rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb, float* W, rocblas_float_complex* work, rocblas_int lwork, float* rwork, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); chegv_(&itypeI, &evectC, &uploC, &n, A, &lda, B, &ldb, W, work, &lwork, rwork, info); } template <> void cpu_sygv_hegv(rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb, double* W, rocblas_double_complex* work, rocblas_int lwork, double* rwork, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); zhegv_(&itypeI, &evectC, &uploC, &n, A, &lda, B, &ldb, W, work, &lwork, rwork, info); } // sygvd & hegvd template <> void 
cpu_sygvd_hegvd(rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float* B, rocblas_int ldb, float* W, float* work, rocblas_int lwork, float* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); ssygvd_(&itypeI, &evectC, &uploC, &n, A, &lda, B, &ldb, W, rwork, &lrwork, iwork, &liwork, info); } template <> void cpu_sygvd_hegvd(rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double* B, rocblas_int ldb, double* W, double* work, rocblas_int lwork, double* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); dsygvd_(&itypeI, &evectC, &uploC, &n, A, &lda, B, &ldb, W, rwork, &lrwork, iwork, &liwork, info); } template <> void cpu_sygvd_hegvd(rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb, float* W, rocblas_float_complex* work, rocblas_int lwork, float* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); chegvd_(&itypeI, &evectC, &uploC, &n, A, &lda, B, &ldb, W, work, &lwork, rwork, &lrwork, iwork, &liwork, info); } template <> void cpu_sygvd_hegvd(rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb, double* W, rocblas_double_complex* work, rocblas_int lwork, double* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info) { int itypeI = 
rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char uploC = rocblas2char_fill(uplo); zhegvd_(&itypeI, &evectC, &uploC, &n, A, &lda, B, &ldb, W, work, &lwork, rwork, &lrwork, iwork, &liwork, info); } // sygvx & hegvx template <> void cpu_sygvx_hegvx(rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, float* B, rocblas_int ldb, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* m, float* W, float* Z, rocblas_int ldz, float* work, rocblas_int lwork, float* rwork, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char erangeC = rocblas2char_erange(erange); char uploC = rocblas2char_fill(uplo); ssygvx_(&itypeI, &evectC, &erangeC, &uploC, &n, A, &lda, B, &ldb, &vl, &vu, &il, &iu, &abstol, m, W, Z, &ldz, work, &lwork, iwork, ifail, info); } template <> void cpu_sygvx_hegvx(rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, double* B, rocblas_int ldb, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* m, double* W, double* Z, rocblas_int ldz, double* work, rocblas_int lwork, double* rwork, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char erangeC = rocblas2char_erange(erange); char uploC = rocblas2char_fill(uplo); dsygvx_(&itypeI, &evectC, &erangeC, &uploC, &n, A, &lda, B, &ldb, &vl, &vu, &il, &iu, &abstol, m, W, Z, &ldz, work, &lwork, iwork, ifail, info); } template <> void cpu_sygvx_hegvx(rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* B, rocblas_int ldb, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, 
rocblas_int* m, float* W, rocblas_float_complex* Z, rocblas_int ldz, rocblas_float_complex* work, rocblas_int lwork, float* rwork, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char erangeC = rocblas2char_erange(erange); char uploC = rocblas2char_fill(uplo); chegvx_(&itypeI, &evectC, &erangeC, &uploC, &n, A, &lda, B, &ldb, &vl, &vu, &il, &iu, &abstol, m, W, Z, &ldz, work, &lwork, rwork, iwork, ifail, info); } template <> void cpu_sygvx_hegvx(rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* B, rocblas_int ldb, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* m, double* W, rocblas_double_complex* Z, rocblas_int ldz, rocblas_double_complex* work, rocblas_int lwork, double* rwork, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info) { int itypeI = rocblas2char_eform(itype) - '0'; char evectC = rocblas2char_evect(evect); char erangeC = rocblas2char_erange(erange); char uploC = rocblas2char_fill(uplo); zhegvx_(&itypeI, &evectC, &erangeC, &uploC, &n, A, &lda, B, &ldb, &vl, &vu, &il, &iu, &abstol, m, W, Z, &ldz, work, &lwork, rwork, iwork, ifail, info); } // lasyf template <> void cpu_lasyf(rocblas_fill uplo, rocblas_int n, rocblas_int nb, rocblas_int* kb, float* A, rocblas_int lda, rocblas_int* ipiv, float* W, rocblas_int ldw, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); slasyf_(&uploC, &n, &nb, kb, A, &lda, ipiv, W, &ldw, info); } template <> void cpu_lasyf(rocblas_fill uplo, rocblas_int n, rocblas_int nb, rocblas_int* kb, double* A, rocblas_int lda, rocblas_int* ipiv, double* W, rocblas_int ldw, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); dlasyf_(&uploC, &n, &nb, kb, A, &lda, ipiv, W, &ldw, info); } template <> void cpu_lasyf(rocblas_fill uplo, rocblas_int n, rocblas_int nb, rocblas_int* 
kb, rocblas_float_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_float_complex* W, rocblas_int ldw, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); clasyf_(&uploC, &n, &nb, kb, A, &lda, ipiv, W, &ldw, info); } template <> void cpu_lasyf(rocblas_fill uplo, rocblas_int n, rocblas_int nb, rocblas_int* kb, rocblas_double_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_double_complex* W, rocblas_int ldw, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); zlasyf_(&uploC, &n, &nb, kb, A, &lda, ipiv, W, &ldw, info); } // sytf2 template <> void cpu_sytf2(rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); ssytf2_(&uploC, &n, A, &lda, ipiv, info); } template <> void cpu_sytf2(rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); dsytf2_(&uploC, &n, A, &lda, ipiv, info); } template <> void cpu_sytf2(rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); csytf2_(&uploC, &n, A, &lda, ipiv, info); } template <> void cpu_sytf2(rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); zsytf2_(&uploC, &n, A, &lda, ipiv, info); } // sytrf template <> void cpu_sytrf(rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_int* ipiv, float* work, rocblas_int lwork, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); ssytrf_(&uploC, &n, A, &lda, ipiv, work, &lwork, info); } template <> void cpu_sytrf(rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_int* ipiv, double* work, rocblas_int lwork, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); dsytrf_(&uploC, &n, A, &lda, ipiv, work, &lwork, info); } template <> void cpu_sytrf(rocblas_fill 
uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_float_complex* work, rocblas_int lwork, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); csytrf_(&uploC, &n, A, &lda, ipiv, work, &lwork, info); } template <> void cpu_sytrf(rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_int* ipiv, rocblas_double_complex* work, rocblas_int lwork, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); zsytrf_(&uploC, &n, A, &lda, ipiv, work, &lwork, info); } // bdsvdx template <> void cpu_bdsvdx(rocblas_fill uplo, rocblas_svect svect, rocblas_srange srange, rocblas_int n, float* D, float* E, float vl, float vu, rocblas_int il, rocblas_int iu, rocblas_int* nsv, float* S, float* Z, rocblas_int ldz, float* work, rocblas_int* iwork, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); char svectC = rocblas2char_svect(svect, true); char srangeC = rocblas2char_srange(srange); sbdsvdx_(&uploC, &svectC, &srangeC, &n, D, E, &vl, &vu, &il, &iu, nsv, S, Z, &ldz, work, iwork, info); } template <> void cpu_bdsvdx(rocblas_fill uplo, rocblas_svect svect, rocblas_srange srange, rocblas_int n, double* D, double* E, double vl, double vu, rocblas_int il, rocblas_int iu, rocblas_int* nsv, double* S, double* Z, rocblas_int ldz, double* work, rocblas_int* iwork, rocblas_int* info) { char uploC = rocblas2char_fill(uplo); char svectC = rocblas2char_svect(svect, true); char srangeC = rocblas2char_srange(srange); dbdsvdx_(&uploC, &svectC, &srangeC, &n, D, E, &vl, &vu, &il, &iu, nsv, S, Z, &ldz, work, iwork, info); } rocSOLVER-rocm-5.5.1/clients/common/testing_bdsqr.cpp000066400000000000000000000005741436600607200225150ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_BDSQR(...) 
template void testing_bdsqr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_BDSQR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_bdsvdx.cpp000066400000000000000000000005761436600607200226760ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_BDSVDX(...) template void testing_bdsvdx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_BDSVDX, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_gebd2_gebrd.cpp000066400000000000000000000007711436600607200235270ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GEBD2_GEBRD(...) template void testing_gebd2_gebrd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEBD2_GEBRD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_geblttrf_npvt.cpp000066400000000000000000000006701436600607200242570ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GEBLTTRF_NPVT(...) template void testing_geblttrf_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEBLTTRF_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_geblttrs_npvt.cpp000066400000000000000000000006701436600607200242740ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include #define TESTING_GEBLTTRS_NPVT(...) template void testing_geblttrs_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEBLTTRS_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_gelq2_gelqf.cpp000066400000000000000000000007711436600607200235710ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GELQ2_GELQF(...) template void testing_gelq2_gelqf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GELQ2_GELQF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_gels.cpp000066400000000000000000000006241436600607200223300ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GELS(...) template void testing_gels<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GELS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_geql2_geqlf.cpp000066400000000000000000000007711436600607200235710ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GEQL2_GEQLF(...) 
template void testing_geql2_geqlf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEQL2_GEQLF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_geqr2_geqrf.cpp000066400000000000000000000007711436600607200236050ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GEQR2_GEQRF(...) template void testing_geqr2_geqrf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEQR2_GEQRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_gerq2_gerqf.cpp000066400000000000000000000007711436600607200236050ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GERQ2_GERQF(...) template void testing_gerq2_gerqf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GERQ2_GERQF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_gesv.cpp000066400000000000000000000006241436600607200223420ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GESV(...) 
template void testing_gesv<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GESV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_gesvd.cpp000066400000000000000000000006301436600607200225030ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GESVD(...) template void testing_gesvd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GESVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_gesvdj.cpp000066400000000000000000000006341436600607200226610ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GESVDJ(...) template void testing_gesvdj<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GESVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_gesvdx.cpp000066400000000000000000000006341436600607200226770ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GESVDX(...) template void testing_gesvdx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GESVDX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_getf2_getrf.cpp000066400000000000000000000007711436600607200235770ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include #define TESTING_GETF2_GETRF(...) template void testing_getf2_getrf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETF2_GETRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_getf2_getrf_npvt.cpp000066400000000000000000000010231436600607200246350ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GETF2_GETRF_NPVT(...) \ template void testing_getf2_getrf_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETF2_GETRF_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_getri.cpp000066400000000000000000000006301436600607200225050ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GETRI(...) template void testing_getri<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_getri_npvt.cpp000066400000000000000000000006541436600607200235620ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GETRI_NPVT(...) 
template void testing_getri_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETRI_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_getri_npvt_outofplace.cpp000066400000000000000000000007361436600607200260040ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GETRI_NPVT_OUTOFPLACE(...) \ template void testing_getri_npvt_outofplace<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETRI_NPVT_OUTOFPLACE, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_getri_outofplace.cpp000066400000000000000000000007121436600607200247270ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GETRI_OUTOFPLACE(...) \ template void testing_getri_outofplace<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETRI_OUTOFPLACE, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_getrs.cpp000066400000000000000000000006301436600607200225170ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_GETRS(...) 
template void testing_getrs<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETRS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_labrd.cpp000066400000000000000000000005741436600607200224660ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_LABRD(...) template void testing_labrd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LABRD, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_lacgv.cpp000066400000000000000000000005751436600607200224770ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_LACGV(...) template void testing_lacgv<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LACGV, FOREACH_COMPLEX_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_larf.cpp000066400000000000000000000005701436600607200223220ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_LARF(...) template void testing_larf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LARF, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_larfb.cpp000066400000000000000000000005741436600607200224700ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_LARFB(...) 
template void testing_larfb<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LARFB, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_larfg.cpp000066400000000000000000000005741436600607200224750ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_LARFG(...) template void testing_larfg<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LARFG, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_larft.cpp000066400000000000000000000005741436600607200225120ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_LARFT(...) template void testing_larft<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LARFT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_laswp.cpp000066400000000000000000000005741436600607200225300ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_LASWP(...) template void testing_laswp<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LASWP, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_lasyf.cpp000066400000000000000000000005741436600607200225200ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_LASYF(...) 
template void testing_lasyf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LASYF, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_latrd.cpp000066400000000000000000000005741436600607200225100ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_LATRD(...) template void testing_latrd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LATRD, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_lauum.cpp000066400000000000000000000005721436600607200225230ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_LAUUM(...) template void testing_lauum<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LAUUM, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_orgbr_ungbr.cpp000066400000000000000000000006241436600607200237060ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_ORGBR_UNGBR(...) template void testing_orgbr_ungbr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORGBR_UNGBR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_orglx_unglx.cpp000066400000000000000000000006551436600607200237520ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_ORGLX_UNGLX(...) 
template void testing_orglx_unglx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORGLX_UNGLX, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_orgtr_ungtr.cpp000066400000000000000000000006241436600607200237520ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_ORGTR_UNGTR(...) template void testing_orgtr_ungtr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORGTR_UNGTR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_orgxl_ungxl.cpp000066400000000000000000000006551436600607200237520ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_ORGXL_UNGXL(...) template void testing_orgxl_ungxl<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORGXL_UNGXL, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_orgxr_ungxr.cpp000066400000000000000000000006551436600607200237660ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_ORGXR_UNGXR(...) 
template void testing_orgxr_ungxr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORGXR_UNGXR, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_ormbr_unmbr.cpp000066400000000000000000000006241436600607200237220ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_ORMBR_UNMBR(...) template void testing_ormbr_unmbr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORMBR_UNMBR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_ormlx_unmlx.cpp000066400000000000000000000006551436600607200237660ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_ORMLX_UNMLX(...) template void testing_ormlx_unmlx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORMLX_UNMLX, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_ormtr_unmtr.cpp000066400000000000000000000006241436600607200237660ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_ORMTR_UNMTR(...) template void testing_ormtr_unmtr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORMTR_UNMTR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_ormxl_unmxl.cpp000066400000000000000000000006551436600607200237660ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include #define TESTING_ORMXL_UNMXL(...) template void testing_ormxl_unmxl<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORMXL_UNMXL, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_ormxr_unmxr.cpp000066400000000000000000000006551436600607200240020ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_ORMXR_UNMXR(...) template void testing_ormxr_unmxr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORMXR_UNMXR, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_posv.cpp000066400000000000000000000006241436600607200223650ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_POSV(...) template void testing_posv<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_POSV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_potf2_potrf.cpp000066400000000000000000000007711436600607200236450ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_POTF2_POTRF(...) 
template void testing_potf2_potrf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_POTF2_POTRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_potri.cpp000066400000000000000000000006301436600607200225300ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_POTRI(...) template void testing_potri<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_POTRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_potrs.cpp000066400000000000000000000006301436600607200225420ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_POTRS(...) template void testing_potrs<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_POTRS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_stebz.cpp000066400000000000000000000005721436600607200225270ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_STEBZ(...) template void testing_stebz<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STEBZ, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_stedc.cpp000066400000000000000000000005741436600607200225040ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include #define TESTING_STEDC(...) template void testing_stedc<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STEDC, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_stein.cpp000066400000000000000000000005741436600607200225240ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_STEIN(...) template void testing_stein<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STEIN, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_steqr.cpp000066400000000000000000000005741436600607200225400ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_STEQR(...) template void testing_steqr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STEQR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_sterf.cpp000066400000000000000000000005721436600607200225230ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_STERF(...) template void testing_sterf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STERF, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_syev_heev.cpp000066400000000000000000000006501436600607200233720ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include #define TESTING_SYEV_HEEV(...) template void testing_syev_heev<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYEV_HEEV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_syevd_heevd.cpp000066400000000000000000000006601436600607200237030ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_SYEVD_HEEVD(...) template void testing_syevd_heevd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYEVD_HEEVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_syevj_heevj.cpp000066400000000000000000000006601436600607200237170ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_SYEVJ_HEEVJ(...) template void testing_syevj_heevj<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYEVJ_HEEVJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_syevx_heevx.cpp000066400000000000000000000006601436600607200237530ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_SYEVX_HEEVX(...) 
template void testing_syevx_heevx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYEVX_HEEVX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_sygsx_hegsx.cpp000066400000000000000000000007711436600607200237540ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_SYGSX_HEGSX(...) template void testing_sygsx_hegsx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGSX_HEGSX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_sygv_hegv.cpp000066400000000000000000000006501436600607200233760ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_SYGV_HEGV(...) template void testing_sygv_hegv<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGV_HEGV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_sygvd_hegvd.cpp000066400000000000000000000006601436600607200237070ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_SYGVD_HEGVD(...) 
template void testing_sygvd_hegvd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGVD_HEGVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_sygvj_hegvj.cpp000066400000000000000000000006601436600607200237230ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_SYGVJ_HEGVJ(...) template void testing_sygvj_hegvj<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGVJ_HEGVJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_sygvx_hegvx.cpp000066400000000000000000000006601436600607200237570ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_SYGVX_HEGVX(...) template void testing_sygvx_hegvx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGVX_HEGVX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_sytf2_sytrf.cpp000066400000000000000000000007711436600607200236770ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_SYTF2_SYTRF(...) 
template void testing_sytf2_sytrf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYTF2_SYTRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_sytxx_hetxx.cpp000066400000000000000000000007711436600607200240200ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_SYTXX_HETXX(...) template void testing_sytxx_hetxx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYTXX_HETXX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/common/testing_trtri.cpp000066400000000000000000000006301436600607200225370ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #define TESTING_TRTRI(...) template void testing_trtri<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_TRTRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/extras/000077500000000000000000000000001436600607200171515ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/clients/extras/CMakeLists.txt000066400000000000000000000023341436600607200217130ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
# ######################################################################## find_package(GTest REQUIRED) add_executable(test-header test_header.c ) target_link_libraries(test-header PRIVATE roc::rocsolver) set_target_properties(test-header PROPERTIES C_STANDARD 99 C_STANDARD_REQUIRED ON C_EXTENSIONS OFF ) if(UNIX AND BUILD_SHARED_LIBS) if(TARGET rocsolver) add_executable(test-rocsolver-dlopen test_dlopen_main.cpp ) target_compile_definitions(test-rocsolver-dlopen PRIVATE ROCSOLVER_LIB_NAME="$" ) target_link_libraries(test-rocsolver-dlopen PRIVATE GTest::GTest ${CMAKE_DL_LIBS} ) add_test( NAME test-rocsolver-dlopen COMMAND test-rocsolver-dlopen ) endif() if(TARGET rocsolver-bench) find_package(Python3 COMPONENTS Interpreter) if(Python3_FOUND) add_test( NAME test-rocsolver-bench COMMAND "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/test_rocsolver_bench.py" WORKING_DIRECTORY "$" ) endif() endif() endif() rocSOLVER-rocm-5.5.1/clients/extras/test_dlopen_main.cpp000066400000000000000000000011511436600607200231770ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #include // Tensorflow uses dlopen to load the ROCm libraries. // https://github.com/ROCmSoftwarePlatform/rocSOLVER/issues/230 TEST(TestDynamicLinking, AllSymbolsResolved) { ASSERT_NE(dlopen(ROCSOLVER_LIB_NAME, RTLD_NOW | RTLD_LOCAL), nullptr) << dlerror(); } int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } rocSOLVER-rocm-5.5.1/clients/extras/test_header.c000066400000000000000000000003751436600607200216110ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include int main() {} rocSOLVER-rocm-5.5.1/clients/extras/test_rocsolver_bench.py000066400000000000000000000774741436600607200237620ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2022 Advanced Micro Devices, Inc. # ######################################################################## import collections import os import re import shlex import unittest from subprocess import Popen, PIPE def call_rocsolver_bench(*args): cmd = ['./rocsolver-bench'] for arg in args: if isinstance(arg, str): cmd.extend(shlex.split(arg, False, False)) elif isinstance(arg, collections.Sequence): cmd.extend(arg) else: cmd.push(str(arg)) process = Popen(cmd, stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() return (str(stdout, encoding='utf-8', errors='surrogateescape'), str(stderr, encoding='utf-8', errors='surrogateescape'), process.returncode) class TestRocsolverBench(unittest.TestCase): def parse_arguments(self, bench_output): m = re.search(r"\n=+\nArguments:\s*\n=+\n(?P.*)\n(?P.*)\n", bench_output, re.MULTILINE) self.assertTrue(m) arg_names = m.group('arg_names').split() arg_values = m.group('arg_values').split() self.assertEqual(len(arg_names), len(arg_values)) return dict(zip(arg_names, arg_values)) def parse_results(self, bench_output): m = re.search(r"\n=+\nResults:\s*\n=+\n(?P.*)\n(?P.*)\n", bench_output, re.MULTILINE) self.assertTrue(m) arg_names = m.group('arg_names').split() arg_values = m.group('arg_values').split() self.assertEqual(len(arg_names), len(arg_values)) return dict(zip(arg_names, arg_values)) def test_help(self): out, err, exitcode = call_rocsolver_bench('--help') self.assertEqual(err, '') self.assertEqual(exitcode, 0) self.assertNotEqual(out, '') def test_validate_precision(self): for precision in 'sdcz': with self.subTest(precision=precision): out, err, exitcode = call_rocsolver_bench(f'-f gels -r 
{precision} -n 10 -m 15') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f gels -r 0 -n 10 -m 15') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_operation(self): for trans in 'NTC': with self.subTest(trans=trans): out, err, exitcode = call_rocsolver_bench(f'-f gels --trans {trans} -n 10 -m 15') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f gels --trans 0 -n 10 -m 15') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_side(self): for side in 'LRB': with self.subTest(side=side): out, err, exitcode = call_rocsolver_bench(f'-f larf --side {side} -n 10 -m 15') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f larf --side 0 -n 10 -m 15') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_fill(self): for fill in 'ULF': with self.subTest(fill=fill): out, err, exitcode = call_rocsolver_bench(f'-f bdsqr --uplo {fill} -n 15 --nu 10 --nv 10') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f bdsqr --uplo 0 -n 15 --nu 10 --nv 10') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_diagonal(self): for diag in 'NU': with self.subTest(diag=diag): out, err, exitcode = call_rocsolver_bench(f'-f trtri --diag {diag} -n 10') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f trtri --diag 0 -n 10') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_direct(self): for direct in 'FB': with self.subTest(direct=direct): out, err, exitcode = call_rocsolver_bench(f'-f larfb --direct {direct} --side L --storev C -n 10 -m 15 -k 1') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f larfb --direct 0 --side L --storev C -k 1 -n 10 -m 15 -k 
1') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_storev(self): for storev in 'CR': with self.subTest(storev=storev): out, err, exitcode = call_rocsolver_bench(f'-f larft --storev {storev} -n 10 -k 1') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f larft --storev 0 -n 10 -k 1') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_left_svect(self): for svect in 'ASON': with self.subTest(svect=svect): out, err, exitcode = call_rocsolver_bench(f'-f gesvd --left_svect {svect} -n 10 -m 15') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f gesvd --left_svect 0 -n 10 -m 15') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_right_svect(self): for svect in 'ASON': with self.subTest(svect=svect): out, err, exitcode = call_rocsolver_bench(f'-f gesvd --right_svect {svect} -n 10 -m 15') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f gesvd --right_svect 0 -n 10 -m 15') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_workmode(self): for workmode in 'IO': with self.subTest(workmode=workmode): out, err, exitcode = call_rocsolver_bench(f'-f gesvd --fast_alg {workmode} -n 10 -m 15') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f gesvd --fast_alg 0 -n 10 -m 15') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_evect(self): for evect in 'VIN': with self.subTest(evect=evect): out, err, exitcode = call_rocsolver_bench(f'-f syev --evect {evect} -n 10') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f syev --evect 0 -n 10') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_erange(self): for erange in 'AVI': with self.subTest(erange=erange): 
out, err, exitcode = call_rocsolver_bench(f'-f stebz --erange {erange} -n 10') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f stebz --erange 0 -n 10') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_eorder(self): for eorder in 'BE': with self.subTest(eorder=eorder): out, err, exitcode = call_rocsolver_bench(f'-f stebz --eorder {eorder} -n 10') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f stebz --eorder 0 -n 10') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_validate_itype(self): for itype in '123': with self.subTest(itype=itype): out, err, exitcode = call_rocsolver_bench(f'-f sygv --itype {itype} -n 10') self.assertEqual(err, '') self.assertEqual(exitcode, 0) out, err, exitcode = call_rocsolver_bench('-f sygv --itype 0 -n 10') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_unused_arg(self): out, err, exitcode = call_rocsolver_bench('-f gels --itype 1 -n 10 -m 15') self.assertNotEqual(err, '') self.assertNotEqual(exitcode, 0) def test_perf_returns_number(self): out, err, exitcode = call_rocsolver_bench('-f gels -n 10 -m 15 --perf 1') self.assertEqual(err, '') self.assertEqual(exitcode, 0) self.assertGreaterEqual(float(out), 0) def generate_parameterized_test(command_options, expected_args): def test_function_output(self): out, err, exitcode = call_rocsolver_bench(command_options) self.assertEqual(err, '') self.assertEqual(exitcode, 0) args = self.parse_arguments(out) self.assertEqual(args, expected_args) results = self.parse_results(out) self.assertGreaterEqual(float(results['cpu_time_us']), 0) self.assertGreaterEqual(float(results['gpu_time_us']), 0) return test_function_output parameters = [ ( 'laswp', '-f laswp -n 10 --k1 15 --k2 20', { 'n': '10', 'lda': '20', 'k1': '15', 'k2': '20', 'inc': '1', } ), ( 'larfg', '-f larfg -n 10', { 'n': '10', 'inc': '1', } ), ( 'larf', '-f 
larf -m 10 --side L', { 'side': 'L', 'm': '10', 'n': '10', 'inc': '1', 'lda': '10', } ), ( 'larft', '-f larft -n 10 --storev C -k 5', { 'direct': 'F', 'storev': 'C', 'n': '10', 'k': '5', 'ldv': '10', 'ldt': '5', } ), ( 'larfb', '-f larfb -n 10 -m 15 --direct F --side L --storev C -k 5', { 'side': 'L', 'trans': 'N', 'direct': 'F', 'storev': 'C', 'm': '15', 'n': '10', 'k': '5', 'ldv': '15', 'ldt': '5', 'lda': '15', } ), ( 'latrd', '-f latrd -n 10 -k 5', { 'uplo': 'U', 'n': '10', 'k': '5', 'lda': '10', 'ldw': '10', } ), ( 'labrd', '-f labrd -n 10 -m 15', { 'm': '15', 'n': '10', 'nb': '10', 'lda': '15', 'ldx': '15', 'ldy': '10', } ), ( 'bdsqr', '-f bdsqr --uplo U -n 15 --nu 10 --nv 10', { 'uplo': 'U', 'n': '15', 'nv': '10', 'nu': '10', 'nc': '0', 'ldv': '15', 'ldu': '10', 'ldc': '1', } ), ( 'steqr', '-f steqr -n 15', { 'evect': 'N', 'n': '15', 'ldc': '15', } ), ( 'stedc', '-f stedc -n 15', { 'evect': 'N', 'n': '15', 'ldc': '15', } ), ( 'stein', '-f stein -n 15', { 'n': '15', 'nev': '5', 'ldz': '15', } ), ( 'lasyf', '-f lasyf -n 10', { 'uplo': 'U', 'n': '10', 'nb': '10', 'lda': '10', } ), ( 'potf2', '-f potf2 -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', } ), ( 'potf2_batched', '-f potf2_batched -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', 'batch_c': '1', } ), ( 'potf2_strided_batched', '-f potf2_strided_batched -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', 'strideA': '100', 'batch_c': '1', } ), ( 'potrf', '-f potrf -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', } ), ( 'potrf_batched', '-f potrf_batched -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', 'batch_c': '1', } ), ( 'potrf_strided_batched', '-f potrf_strided_batched -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', 'strideA': '100', 'batch_c': '1', } ), ( 'potrs', '-f potrs -n 10', { 'uplo': 'U', 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', } ), ( 'potrs_batched', '-f potrs_batched -n 10', { 'uplo': 'U', 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', 'batch_c': '1', } ), ( 'potrs_strided_batched', '-f 
potrs_strided_batched -n 10', { 'uplo': 'U', 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', 'strideA': '100', 'strideB': '100', 'batch_c': '1', } ), ( 'posv', '-f posv -n 10', { 'uplo': 'U', 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', } ), ( 'posv_batched', '-f posv_batched -n 10', { 'uplo': 'U', 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', 'batch_c': '1', } ), ( 'posv_strided_batched', '-f posv_strided_batched -n 10', { 'uplo': 'U', 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', 'strideA': '100', 'strideB': '100', 'batch_c': '1', } ), ( 'potri', '-f potri -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', } ), ( 'potri_batched', '-f potri_batched -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', 'batch_c': '1', } ), ( 'potri_strided_batched', '-f potri_strided_batched -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', 'strideA': '100', 'batch_c': '1', } ), ( 'getf2_npvt', '-f getf2_npvt -m 10', { 'm': '10', 'n': '10', 'lda': '10', } ), ( 'getf2_npvt_batched', '-f getf2_npvt_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'batch_c': '1', } ), ( 'getf2_npvt_strided_batched', '-f getf2_npvt_strided_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideA': '100', 'batch_c': '1', } ), ( 'getrf_npvt', '-f getrf_npvt -m 10', { 'm': '10', 'n': '10', 'lda': '10', } ), ( 'getrf_npvt_batched', '-f getrf_npvt_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'batch_c': '1', } ), ( 'getrf_npvt_strided_batched', '-f getrf_npvt_strided_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideA': '100', 'batch_c': '1', } ), ( 'getrf', '-f getrf -m 10', { 'm': '10', 'n': '10', 'lda': '10', } ), ( 'getrf_batched', '-f getrf_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'getrf_strided_batched', '-f getrf_strided_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ), ( 'getf2', '-f getf2 -m 10', { 'm': '10', 'n': '10', 'lda': '10', } ), ( 'getf2_batched', 
'-f getf2_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'getf2_strided_batched', '-f getf2_strided_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ), ( 'geqr2', '-f geqr2 -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', } ), ( 'geqr2_batched', '-f geqr2_batched -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', 'strideP': '10', 'batch_c': '1', } ), ( 'geqr2_strided_batched', '-f geqr2_strided_batched -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', 'strideA': '150', 'strideP': '10', 'batch_c': '1', } ), ( 'geqrf', '-f geqrf -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', } ), ( 'geqrf_batched', '-f geqrf_batched -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', 'strideP': '10', 'batch_c': '1', } ), ( 'geqrf_strided_batched', '-f geqrf_strided_batched -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', 'strideA': '150', 'strideP': '10', 'batch_c': '1', } ), ( 'geqrf_ptr_batched', '-f geqrf_ptr_batched -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', 'strideP': '10', 'batch_c': '1', } ), ( 'gerq2', '-f gerq2 -m 10', { 'm': '10', 'n': '10', 'lda': '10', } ), ( 'gerq2_batched', '-f gerq2_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'gerq2_strided_batched', '-f gerq2_strided_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ), ( 'gerqf', '-f gerqf -m 10', { 'm': '10', 'n': '10', 'lda': '10', } ), ( 'gerqf_batched', '-f gerqf_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'gerqf_strided_batched', '-f gerqf_strided_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ), ( 'geql2', '-f geql2 -m 10', { 'm': '10', 'n': '10', 'lda': '10', } ), ( 'geql2_batched', '-f geql2_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 
'geql2_strided_batched', '-f geql2_strided_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ), ( 'geqlf', '-f geqlf -m 10', { 'm': '10', 'n': '10', 'lda': '10', } ), ( 'geqlf_batched', '-f geqlf_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'geqlf_strided_batched', '-f geqlf_strided_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ), ( 'gelq2', '-f gelq2 -m 10', { 'm': '10', 'n': '10', 'lda': '10', } ), ( 'gelq2_batched', '-f gelq2_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'gelq2_strided_batched', '-f gelq2_strided_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ), ( 'gelqf', '-f gelqf -m 10', { 'm': '10', 'n': '10', 'lda': '10', } ), ( 'gelqf_batched', '-f gelqf_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'gelqf_strided_batched', '-f gelqf_strided_batched -m 10', { 'm': '10', 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ), ( 'getrs', '-f getrs -n 10', { 'trans': 'N', 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', } ), ( 'getrs_batched', '-f getrs_batched -n 10', { 'trans': 'N', 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'getrs_strided_batched', '-f getrs_strided_batched -n 10', { 'trans': 'N', 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', 'strideA': '100', 'strideP': '10', 'strideB': '100', 'batch_c': '1', } ), ( 'gesv', '-f gesv -n 10', { 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', } ), ( 'gesv_batched', '-f gesv_batched -n 10', { 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'gesv_strided_batched', '-f gesv_strided_batched -n 10', { 'n': '10', 'nrhs': '10', 'lda': '10', 'ldb': '10', 'strideA': '100', 'strideP': 
'10', 'strideB': '100', 'batch_c': '1', } ), ( 'gesvd', '-f gesvd -n 10 -m 15', { 'left_svect': 'N', 'right_svect': 'N', 'm': '15', 'n': '10', 'lda': '15', 'ldu': '15', 'ldv': '10', } ), ( 'gesvd_batched', '-f gesvd_batched -n 10 -m 15', { 'left_svect': 'N', 'right_svect': 'N', 'm': '15', 'n': '10', 'lda': '15', 'strideS': '10', 'ldu': '15', 'strideU': '225', 'ldv': '10', 'strideV': '100', 'strideE': '9', 'batch_c': '1', } ), ( 'gesvd_strided_batched', '-f gesvd_strided_batched -n 10 -m 15', { 'left_svect': 'N', 'right_svect': 'N', 'm': '15', 'n': '10', 'lda': '15', 'strideA': '150', 'strideS': '10', 'ldu': '15', 'strideU': '225', 'ldv': '10', 'strideV': '100', 'strideE': '9', 'batch_c': '1', } ), ( 'trtri', '-f trtri -n 10', { 'uplo': 'U', 'diag': 'N', 'n': '10', 'lda': '10', } ), ( 'trtri_batched', '-f trtri_batched -n 10', { 'uplo': 'U', 'diag': 'N', 'n': '10', 'lda': '10', 'batch_c': '1', } ), ( 'trtri_strided_batched', '-f trtri_strided_batched -n 10', { 'uplo': 'U', 'diag': 'N', 'n': '10', 'lda': '10', 'strideA': '100', 'batch_c': '1', } ), ( 'getri', '-f getri -n 10', { 'n': '10', 'lda': '10', } ), ( 'getri_batched', '-f getri_batched -n 10', { 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'getri_strided_batched', '-f getri_strided_batched -n 10', { 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ), ( 'getri_npvt', '-f getri_npvt -n 10', { 'n': '10', 'lda': '10', } ), ( 'getri_npvt_batched', '-f getri_npvt_batched -n 10', { 'n': '10', 'lda': '10', 'batch_c': '1', } ), ( 'getri_npvt_strided_batched', '-f getri_npvt_strided_batched -n 10', { 'n': '10', 'lda': '10', 'strideA': '100', 'batch_c': '1', } ), ( 'getri_outofplace', '-f getri_outofplace -n 10', { 'n': '10', 'lda': '10', 'ldc': '10', } ), ( 'getri_outofplace_batched', '-f getri_outofplace_batched -n 10', { 'n': '10', 'lda': '10', 'strideP': '10', 'ldc': '10', 'batch_c': '1', } ), ( 'getri_outofplace_strided_batched', '-f getri_outofplace_strided_batched -n 
10', { 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'ldc': '10', 'strideC': '100', 'batch_c': '1', } ), ( 'getri_npvt_outofplace', '-f getri_npvt_outofplace -n 10', { 'n': '10', 'lda': '10', 'ldc': '10', } ), ( 'getri_npvt_outofplace_batched', '-f getri_npvt_outofplace_batched -n 10', { 'n': '10', 'lda': '10', 'ldc': '10', 'batch_c': '1', } ), ( 'getri_npvt_outofplace_strided_batched', '-f getri_npvt_outofplace_strided_batched -n 10', { 'n': '10', 'lda': '10', 'strideA': '100', 'ldc': '10', 'strideC': '100', 'batch_c': '1', } ), ( 'gels', '-f gels -n 10 -m 15', { 'trans': 'N', 'm': '15', 'n': '10', 'nrhs': '10', 'lda': '15', 'ldb': '15', } ), ( 'gels_batched', '-f gels_batched -n 10 -m 15', { 'trans': 'N', 'm': '15', 'n': '10', 'nrhs': '10', 'lda': '15', 'ldb': '15', 'batch_c': '1', } ), ( 'gels_strided_batched', '-f gels_strided_batched -n 10 -m 15', { 'trans': 'N', 'm': '15', 'n': '10', 'nrhs': '10', 'lda': '15', 'ldb': '15', 'strideA': '150', 'strideB': '150', 'batch_c': '1', } ), ( 'gebd2', '-f gebd2 -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', } ), ( 'gebd2_batched', '-f gebd2_batched -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', 'strideP': '10', 'batch_c': '1', } ), ( 'gebd2_strided_batched', '-f gebd2_strided_batched -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', 'strideA': '150', 'strideP': '10', 'batch_c': '1', } ), ( 'gebrd', '-f gebrd -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', } ), ( 'gebrd_batched', '-f gebrd_batched -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', 'strideP': '10', 'batch_c': '1', } ), ( 'gebrd_strided_batched', '-f gebrd_strided_batched -n 10 -m 15', { 'm': '15', 'n': '10', 'lda': '15', 'strideA': '150', 'strideP': '10', 'batch_c': '1', } ), ( 'sytf2', '-f sytf2 -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', } ), ( 'sytf2_batched', '-f sytf2_batched -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'sytf2_strided_batched', '-f sytf2_strided_batched -n 10', { 
'uplo': 'U', 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ), ( 'sytrf', '-f sytrf -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', } ), ( 'sytrf_batched', '-f sytrf_batched -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', 'strideP': '10', 'batch_c': '1', } ), ( 'sytrf_strided_batched', '-f sytrf_strided_batched -n 10', { 'uplo': 'U', 'n': '10', 'lda': '10', 'strideA': '100', 'strideP': '10', 'batch_c': '1', } ) ] if __name__ == '__main__': for name, command_options, expected_args in parameters: test = generate_parameterized_test(command_options, expected_args) setattr(TestRocsolverBench, f'test_{name}', test) unittest.main() rocSOLVER-rocm-5.5.1/clients/gtest/000077500000000000000000000000001436600607200167715ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/clients/gtest/CMakeLists.txt000077500000000000000000000070451436600607200215420ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2019-2022 Advanced Micro Devices, Inc. 
# ######################################################################## find_package(GTest REQUIRED) set(roclapack_test_source # linear systems solvers getri_gtest.cpp getrs_gtest.cpp gesv_gtest.cpp potrs_gtest.cpp posv_gtest.cpp potri_gtest.cpp trtri_gtest.cpp geblttrs_gtest.cpp # least squares solvers gels_gtest.cpp # triangular factorizations getf2_getrf_gtest.cpp potf2_potrf_gtest.cpp sytf2_sytrf_gtest.cpp geblttrf_gtest.cpp # orthogonal factorizations geqr2_geqrf_gtest.cpp gerq2_gerqf_gtest.cpp geql2_geqlf_gtest.cpp gelq2_gelqf_gtest.cpp # problem and matrix reductions (diagonalizations) gebd2_gebrd_gtest.cpp sytxx_hetxx_gtest.cpp sygsx_hegsx_gtest.cpp # singular value decomposition gesvd_gtest.cpp gesvdj_gtest.cpp gesvdx_gtest.cpp # symmetric eigensolvers syev_heev_gtest.cpp syevd_heevd_gtest.cpp syevj_heevj_gtest.cpp syevx_heevx_gtest.cpp syevdx_heevdx_gtest.cpp sygv_hegv_gtest.cpp sygvd_hegvd_gtest.cpp sygvj_hegvj_gtest.cpp sygvx_hegvx_gtest.cpp sygvdx_hegvdx_gtest.cpp ) set(rocauxiliary_test_source # vector & matrix manipulations lacgv_gtest.cpp laswp_gtest.cpp # householder reflections larf_gtest.cpp larfg_gtest.cpp larft_gtest.cpp larfb_gtest.cpp # orthonormal/unitary matrices orgxr_ungxr_gtest.cpp orglx_unglx_gtest.cpp orgxl_ungxl_gtest.cpp orgbr_ungbr_gtest.cpp orgtr_ungtr_gtest.cpp ormxr_unmxr_gtest.cpp ormlx_unmlx_gtest.cpp ormxl_unmxl_gtest.cpp ormbr_unmbr_gtest.cpp ormtr_unmtr_gtest.cpp # bidiagonal matrices labrd_gtest.cpp bdsqr_gtest.cpp bdsvdx_gtest.cpp # tridiagonal matrices sterf_gtest.cpp steqr_gtest.cpp stedc_gtest.cpp stebz_gtest.cpp stein_gtest.cpp latrd_gtest.cpp # symmetric matrices lasyf_gtest.cpp # triangular matrices lauum_gtest.cpp ) set(others_test_source # unified memory model managed_malloc_gtest.cpp # rocblas memory model memory_model_gtest.cpp # rocsolver logging logging_gtest.cpp # helpers client_environment_helpers.cpp ) set(rocsolver_test_source rocsolver_gtest_main.cpp ) add_executable(rocsolver-test 
${roclapack_test_source} ${rocauxiliary_test_source} ${others_test_source} ${rocsolver_test_source} ) add_armor_flags(rocsolver-test "${ARMOR_LEVEL}") if(WIN32) file(GLOB third_party_dlls LIST_DIRECTORIES OFF CONFIGURE_DEPENDS ${ROCSOLVER_LAPACK_PATH}/bin/*.dll ${GTest_DIR}/bin/*.dll $ENV{rocblas_DIR}/bin/*.dll $ENV{HIP_DIR}/bin/*.dll $ENV{HIP_DIR}/bin/hipinfo.exe ${CMAKE_SOURCE_DIR}/rtest.* ) foreach(file_i ${third_party_dlls}) add_custom_command(TARGET rocsolver-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} ${PROJECT_BINARY_DIR}/staging/ ) endforeach() add_custom_command(TARGET rocsolver-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy_directory $ENV{rocblas_DIR}/bin/rocblas/library ${PROJECT_BINARY_DIR}/staging/library ) endif() target_link_libraries(rocsolver-test PRIVATE GTest::GTest hip::device rocsolver-common clients-common $<$:stdc++fs> $<$:m> roc::rocsolver ) # Turn on f16c intrinsics target_compile_options(rocsolver-test PRIVATE -mf16c) target_compile_definitions(rocsolver-test PRIVATE ROCM_USE_FLOAT16 ROCSOLVER_CLIENTS_TEST ) add_test( NAME rocsolver-test COMMAND rocsolver-test ) rocm_install(TARGETS rocsolver-test COMPONENT tests) rocSOLVER-rocm-5.5.1/clients/gtest/bdsqr_gtest.cpp000066400000000000000000000064021436600607200220200ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_bdsqr.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> bdsqr_tuple; // each size_range is a {n, nv, nu, nc} // each opt_range is a {uplo, ldv, ldu, ldc} // if uplo = 0, then is upper bidiagonal // if uplo = 1, then is lower bidiagonal // if ldx = -1, then ldx < limit (invalid size) // if ldx = 0, then ldx = limit // if ldx = 1, then ldx > limit // case when n = 0 and uplo = 'L' will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> size_range = { // quick return {0, 1, 1, 1}, // invalid {-1, 1, 1, 1}, {1, -1, 1, 1}, {1, 1, -1, 1}, {1, 1, 1, -1}, // normal (valid) samples {15, 10, 10, 10}, {20, 0, 0, 15}, {30, 30, 50, 0}, {50, 60, 20, 0}, {70, 0, 0, 0}}; const vector> opt_range = { // invalid {0, -1, 0, 0}, {0, 0, -1, 0}, {0, 0, 0, -1}, // normal (valid) samples {0, 0, 0, 0}, {1, 0, 0, 0}, {0, 1, 0, 0}, {0, 0, 1, 0}, {0, 0, 0, 1}}; // for daily_lapack tests const vector> large_size_range = {{152, 152, 152, 152}, {640, 640, 656, 700}, {1000, 1024, 1000, 80}, {2000, 0, 0, 0}}; const vector> large_opt_range = {{0, 0, 0, 0}, {1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 0, 0}}; Arguments bdsqr_setup_arguments(bdsqr_tuple tup) { vector size = std::get<0>(tup); vector opt = std::get<1>(tup); Arguments arg; rocblas_int n = size[0]; rocblas_int nv = size[1]; rocblas_int nu = size[2]; rocblas_int nc = size[3]; arg.set("n", n); arg.set("nv", nv); arg.set("nu", nu); arg.set("nc", nc); arg.set("uplo", opt[0] ? 'L' : 'U'); arg.set("ldv", (nv > 0 ? n : 1) + opt[1] * 10); arg.set("ldu", (nu > 0 ? nu : 1) + opt[2] * 10); arg.set("ldc", (nc > 0 ? 
n : 1) + opt[3] * 10); arg.timing = 0; return arg; } class BDSQR : public ::TestWithParam { protected: BDSQR() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = bdsqr_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("uplo") == 'L') testing_bdsqr_bad_arg(); testing_bdsqr(arg); } }; // non-batch tests TEST_P(BDSQR, __float) { run_tests(); } TEST_P(BDSQR, __double) { run_tests(); } TEST_P(BDSQR, __float_complex) { run_tests(); } TEST_P(BDSQR, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, BDSQR, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, BDSQR, Combine(ValuesIn(size_range), ValuesIn(opt_range))); rocSOLVER-rocm-5.5.1/clients/gtest/bdsvdx_gtest.cpp000066400000000000000000000074431436600607200222050ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include "testing_bdsvdx.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> bdsvdx_tuple; // each size_range vector is a {n, ldz, vect} // if vect = 0, then don't find singular vectors // if vect = 1, then find singular vectors // each ops_range vector is a {rng, vl, vu, il, iu} // if rng = 0, then find all singular values // if rng = 1, then find singular values in (vl, vu] // if rng = 2, then find the il-th to the iu-th singular value // Note: all tests are prepared with diagonally dominant matrices that have random diagonal // elements in [-20, -11] U [11, 20], and off-diagonal elements in [-0.4, 0.5]. This // guarantees that all singular values will be in [0, 20]. 
// case when n == 0, vect = 0, and rng == 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector uplo_range = {'U', 'L'}; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1, 0}, // invalid {-1, 1, 0}, {10, 10, 1}, // normal (valid) samples {1, 2, 1}, {15, 32, 0}, {20, 42, 1}, {64, 128, 0}, }; const vector> ops_range = { // always invalid {1, 2, 1, 0, 0}, {1, -1, 1, 0, 0}, {2, 0, 0, 0, -1}, {2, 0, 0, 1, 80}, // valid only when n=0 {2, 0, 0, 1, 0}, // valid only when n>0 {2, 0, 0, 1, 5}, {2, 0, 0, 1, 15}, {2, 0, 0, 7, 12}, // always valid samples {0, 0, 0, 0, 0}, {1, 5, 15, 0, 0}, {1, 0, 15, 0, 0}, {1, 15, 20, 0, 0}, {1, 35, 55, 0, 0}}; // for daily_lapack tests const vector> large_size_range = {{120, 240, 1}, {256, 520, 0}, {350, 700, 1}, {512, 1024, 0}, {1024, 2100, 1}}; const vector> large_ops_range = {{0, 0, 0, 0, 0}, {1, 5, 15, 0, 0}, {1, 0, 25, 0, 0}, {1, 15, 20, 0, 0}, {2, 0, 0, 50, 75}, {2, 0, 0, 1, 25}}; Arguments bdsvdx_setup_arguments(bdsvdx_tuple tup) { Arguments arg; char uplo = std::get<0>(tup); vector size = std::get<1>(tup); vector op = std::get<2>(tup); arg.set("uplo", uplo); arg.set("n", size[0]); arg.set("ldz", size[1]); arg.set("svect", (size[2] == 0 ? 'N' : 'V')); arg.set("srange", (op[0] == 0 ? 'A' : (op[0] == 1 ? 
'V' : 'I'))); arg.set("vl", op[1]); arg.set("vu", op[2]); arg.set("il", op[3]); arg.set("iu", op[4]); arg.timing = 0; return arg; } class BDSVDX : public ::TestWithParam { protected: BDSVDX() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = bdsvdx_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("svect") == 'N' && arg.peek("srange") == 'A') testing_bdsvdx_bad_arg(); testing_bdsvdx(arg); } }; // non-batch tests TEST_P(BDSVDX, __float) { run_tests(); } TEST_P(BDSVDX, __double) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, BDSVDX, Combine(ValuesIn(uplo_range), ValuesIn(large_size_range), ValuesIn(large_ops_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, BDSVDX, Combine(ValuesIn(uplo_range), ValuesIn(size_range), ValuesIn(ops_range))); rocSOLVER-rocm-5.5.1/clients/gtest/client_environment_helpers.cpp000066400000000000000000000022151436600607200251210ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include #include #include #include "client_environment_helpers.hpp" bool set_environment_variable(const char* name, const char* value) { #ifdef _WIN32 return _putenv_s(name, value) == 0; #else return setenv(name, value, 1) == 0; #endif } bool unset_environment_variable(const char* name) { #ifdef _WIN32 return _putenv_s(name, "") == 0; #else return unsetenv(name) == 0; #endif } scoped_envvar::scoped_envvar(const char* name, const char* value) : m_name(name) { if(const char* old_value = std::getenv(name)) { m_old_value.emplace(old_value); } if(!set_environment_variable(name, value)) throw environment_error(fmt::format("failed to set {:s}={:s}", name, value)); } scoped_envvar::~scoped_envvar() { if(m_old_value) set_environment_variable(m_name.c_str(), m_old_value->c_str()); else unset_environment_variable(m_name.c_str()); } rocSOLVER-rocm-5.5.1/clients/gtest/gebd2_gebrd_gtest.cpp000066400000000000000000000116071436600607200230360ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_gebd2_gebrd.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, int> gebrd_tuple; // each matrix_size_range is a {m, lda} // case when m = n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1}, // invalid {-1, 1}, {20, 5}, // normal (valid) samples {50, 50}, {70, 100}, {130, 130}, {150, 200}}; const vector n_size_range = { // quick return 0, // invalid -1, // normal (valid) samples 16, 20, 120, 150}; const vector> large_matrix_size_range = { {152, 152}, {640, 640}, {1000, 1024}, }; const vector large_n_size_range = {64, 98, 130, 220}; Arguments gebrd_setup_arguments(gebrd_tuple tup) { vector matrix_size = std::get<0>(tup); int n_size = std::get<1>(tup); Arguments arg; arg.set("m", matrix_size[0]); arg.set("n", n_size); arg.set("lda", matrix_size[1]); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } template class GEBD2_GEBRD : public ::TestWithParam { protected: GEBD2_GEBRD() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = gebrd_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_gebd2_gebrd_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_gebd2_gebrd(arg); } }; class GEBD2 : public GEBD2_GEBRD { }; class GEBRD : public GEBD2_GEBRD { }; // non-batch tests TEST_P(GEBD2, __float) { run_tests(); } TEST_P(GEBD2, __double) { run_tests(); } TEST_P(GEBD2, __float_complex) { run_tests(); } TEST_P(GEBD2, __double_complex) { run_tests(); } TEST_P(GEBRD, __float) { run_tests(); } TEST_P(GEBRD, __double) { run_tests(); } TEST_P(GEBRD, __float_complex) { run_tests(); } TEST_P(GEBRD, __double_complex) { run_tests(); } // batched tests TEST_P(GEBD2, batched__float) { run_tests(); } TEST_P(GEBD2, batched__double) { run_tests(); } TEST_P(GEBD2, batched__float_complex) { run_tests(); } TEST_P(GEBD2, batched__double_complex) { run_tests(); } TEST_P(GEBRD, batched__float) { run_tests(); } TEST_P(GEBRD, batched__double) { run_tests(); } TEST_P(GEBRD, batched__float_complex) { run_tests(); } TEST_P(GEBRD, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(GEBD2, strided_batched__float) { run_tests(); } TEST_P(GEBD2, strided_batched__double) { run_tests(); } TEST_P(GEBD2, strided_batched__float_complex) { run_tests(); } TEST_P(GEBD2, strided_batched__double_complex) { run_tests(); } TEST_P(GEBRD, strided_batched__float) { run_tests(); } TEST_P(GEBRD, strided_batched__double) { run_tests(); } TEST_P(GEBRD, strided_batched__float_complex) { run_tests(); } TEST_P(GEBRD, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GEBD2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GEBD2, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GEBRD, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GEBRD, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); 
rocSOLVER-rocm-5.5.1/clients/gtest/geblttrf_gtest.cpp000066400000000000000000000074551436600607200225270ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_geblttrf_npvt.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef vector geblttrf_tuple; // each matrix_size_range vector is a {nb, nblocks, lda, ldb, ldc, singular} // if singular = 1, then the used matrix for the tests is singular // case when nb = 0 and nblocks = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 1, 1, 1, 0}, {1, 0, 1, 1, 1, 0}, // invalid {-1, 1, 1, 1, 1, 0}, {1, -1, 1, 1, 1, 0}, {10, 2, 1, 1, 1, 0}, // normal (valid) samples {32, 1, 32, 32, 32, 0}, {16, 2, 20, 16, 16, 1}, {10, 7, 10, 20, 10, 0}, {10, 10, 10, 10, 20, 1}, }; // for daily_lapack tests const vector> large_matrix_size_range = {{32, 6, 32, 32, 32, 0}, {50, 10, 60, 50, 50, 1}, {32, 10, 32, 40, 32, 0}, {32, 20, 32, 32, 40, 0}}; Arguments geblttrf_setup_arguments(geblttrf_tuple tup) { Arguments arg; arg.set("nb", tup[0]); arg.set("nblocks", tup[1]); arg.set("lda", tup[2]); arg.set("ldb", tup[3]); arg.set("ldc", tup[4]); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = tup[5]; return arg; } class GEBLTTRF_NPVT : public ::TestWithParam { protected: GEBLTTRF_NPVT() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = geblttrf_setup_arguments(GetParam()); if(arg.peek("nb") == 0 && arg.peek("nblocks") == 0) testing_geblttrf_npvt_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); if(arg.singular == 1) testing_geblttrf_npvt(arg); arg.singular = 0; testing_geblttrf_npvt(arg); } }; // non-batch tests TEST_P(GEBLTTRF_NPVT, __float) { run_tests(); } TEST_P(GEBLTTRF_NPVT, __double) { run_tests(); } TEST_P(GEBLTTRF_NPVT, __float_complex) { run_tests(); } TEST_P(GEBLTTRF_NPVT, __double_complex) { run_tests(); } // batched tests TEST_P(GEBLTTRF_NPVT, batched__float) { run_tests(); } TEST_P(GEBLTTRF_NPVT, batched__double) { run_tests(); } TEST_P(GEBLTTRF_NPVT, batched__float_complex) { run_tests(); } TEST_P(GEBLTTRF_NPVT, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(GEBLTTRF_NPVT, strided_batched__float) { run_tests(); } TEST_P(GEBLTTRF_NPVT, strided_batched__double) { run_tests(); } TEST_P(GEBLTTRF_NPVT, strided_batched__float_complex) { run_tests(); } TEST_P(GEBLTTRF_NPVT, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GEBLTTRF_NPVT, ValuesIn(large_matrix_size_range)); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GEBLTTRF_NPVT, ValuesIn(matrix_size_range)); rocSOLVER-rocm-5.5.1/clients/gtest/geblttrs_gtest.cpp000066400000000000000000000075271436600607200225440ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_geblttrs_npvt.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef vector geblttrs_tuple; // each matrix_size_range vector is a {nb, nblocks, nrhs, lda, ldb, ldc, ldx} // case when nb = 0, nblocks = 0, and nrhs = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1, 1}, // invalid {-1, 1, 1, 1, 1, 1, 1}, {1, -1, 1, 1, 1, 1, 1}, {1, 1, -1, 1, 1, 1, 1}, {10, 2, 1, 1, 1, 1, 1}, // normal (valid) samples {32, 1, 10, 32, 32, 32, 32}, {16, 2, 10, 20, 16, 16, 16}, {10, 7, 20, 10, 20, 10, 10}, {10, 10, 20, 10, 10, 20, 20}, }; // for daily_lapack tests const vector> large_matrix_size_range = {{32, 6, 10, 32, 32, 32, 32}, {50, 10, 10, 60, 50, 50, 50}, {32, 10, 20, 32, 40, 32, 40}, {32, 20, 20, 32, 32, 40, 32}}; Arguments geblttrs_setup_arguments(geblttrs_tuple tup) { Arguments arg; arg.set("nb", tup[0]); arg.set("nblocks", tup[1]); arg.set("nrhs", tup[2]); arg.set("lda", tup[3]); arg.set("ldb", tup[4]); arg.set("ldc", tup[5]); arg.set("ldx", tup[6]); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class GEBLTTRS_NPVT : public ::TestWithParam { protected: GEBLTTRS_NPVT() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = geblttrs_setup_arguments(GetParam()); if(arg.peek("nb") == 0 && arg.peek("nblocks") == 0 && arg.peek("nrhs") == 0) testing_geblttrs_npvt_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_geblttrs_npvt(arg); } }; // non-batch tests TEST_P(GEBLTTRS_NPVT, __float) { run_tests(); } TEST_P(GEBLTTRS_NPVT, __double) { run_tests(); } TEST_P(GEBLTTRS_NPVT, __float_complex) { run_tests(); } TEST_P(GEBLTTRS_NPVT, __double_complex) { run_tests(); } // batched tests TEST_P(GEBLTTRS_NPVT, batched__float) { run_tests(); } TEST_P(GEBLTTRS_NPVT, batched__double) { run_tests(); } TEST_P(GEBLTTRS_NPVT, batched__float_complex) { run_tests(); } TEST_P(GEBLTTRS_NPVT, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(GEBLTTRS_NPVT, strided_batched__float) { run_tests(); } TEST_P(GEBLTTRS_NPVT, strided_batched__double) { run_tests(); } TEST_P(GEBLTTRS_NPVT, strided_batched__float_complex) { run_tests(); } TEST_P(GEBLTTRS_NPVT, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GEBLTTRS_NPVT, ValuesIn(large_matrix_size_range)); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GEBLTTRS_NPVT, ValuesIn(matrix_size_range)); rocSOLVER-rocm-5.5.1/clients/gtest/gelq2_gelqf_gtest.cpp000066400000000000000000000116461436600607200231030ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_gelq2_gelqf.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, int> gelqf_tuple; // each matrix_size_range is a {m, lda} // case when m = n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1}, // invalid {-1, 1}, {20, 5}, // normal (valid) samples {50, 50}, {70, 100}, {130, 130}, {150, 200}}; const vector n_size_range = { // quick return 0, // invalid -1, // normal (valid) samples 16, 20, 130, 150}; // for daily_lapack tests const vector> large_matrix_size_range = { {152, 152}, {640, 640}, {1000, 1024}, }; const vector large_n_size_range = {64, 98, 130, 220, 400}; Arguments gelqf_setup_arguments(gelqf_tuple tup) { vector matrix_size = std::get<0>(tup); int n_size = std::get<1>(tup); Arguments arg; arg.set("m", matrix_size[0]); arg.set("n", n_size); arg.set("lda", matrix_size[1]); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } template class GELQ2_GELQF : public ::TestWithParam { protected: GELQ2_GELQF() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = gelqf_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_gelq2_gelqf_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_gelq2_gelqf(arg); } }; class GELQ2 : public GELQ2_GELQF { }; class GELQF : public GELQ2_GELQF { }; // non-batch tests TEST_P(GELQ2, __float) { run_tests(); } TEST_P(GELQ2, __double) { run_tests(); } TEST_P(GELQ2, __float_complex) { run_tests(); } TEST_P(GELQ2, __double_complex) { run_tests(); } TEST_P(GELQF, __float) { run_tests(); } TEST_P(GELQF, __double) { run_tests(); } TEST_P(GELQF, __float_complex) { run_tests(); } TEST_P(GELQF, __double_complex) { run_tests(); } // batched tests TEST_P(GELQ2, batched__float) { run_tests(); } TEST_P(GELQ2, batched__double) { run_tests(); } TEST_P(GELQ2, batched__float_complex) { run_tests(); } TEST_P(GELQ2, batched__double_complex) { run_tests(); } TEST_P(GELQF, batched__float) { run_tests(); } TEST_P(GELQF, batched__double) { run_tests(); } TEST_P(GELQF, batched__float_complex) { run_tests(); } TEST_P(GELQF, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(GELQ2, strided_batched__float) { run_tests(); } TEST_P(GELQ2, strided_batched__double) { run_tests(); } TEST_P(GELQ2, strided_batched__float_complex) { run_tests(); } TEST_P(GELQ2, strided_batched__double_complex) { run_tests(); } TEST_P(GELQF, strided_batched__float) { run_tests(); } TEST_P(GELQF, strided_batched__double) { run_tests(); } TEST_P(GELQF, strided_batched__float_complex) { run_tests(); } TEST_P(GELQF, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GELQ2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GELQ2, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GELQF, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GELQF, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); 
rocSOLVER-rocm-5.5.1/clients/gtest/gels_gtest.cpp000066400000000000000000000141411436600607200216360ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_gels.hpp" #include "testing_gels_outofplace.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple gels_params_A; typedef std::tuple gels_params_B; typedef std::tuple gels_tuple; // each A_range tuple is a {M, N, lda, ldb, singular}; // if singular = 1, then the used matrix for the tests is singular // each B_range tuple is a {nrhs, trans}; // case when N = nrhs = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector matrix_sizeA_range = { // quick return {0, 0, 0, 0, 0}, // invalid {-1, 1, 1, 1, 0}, {1, -1, 1, 1, 0}, {10, 10, 10, 1, 0}, {10, 10, 1, 10, 0}, // normal (valid) samples {20, 20, 20, 20, 1}, {30, 20, 40, 30, 0}, {20, 30, 30, 40, 0}, {40, 20, 40, 40, 1}, {20, 40, 40, 40, 1}, }; const vector matrix_sizeB_range = { // quick return {0, 'N'}, // invalid {-1, 'N'}, // normal (valid) samples {10, 'N'}, {20, 'N'}, {30, 'N'}, // invalid for complex precision {10, 'T'}, {30, 'T'}, // invalid for real precision {20, 'C'}, }; // for daily_lapack tests const vector large_matrix_sizeA_range = { {75, 25, 75, 75, 1}, {25, 75, 75, 75, 1}, {150, 150, 150, 150, 1}, {500, 50, 600, 600, 0}, {50, 500, 600, 600, 0}, }; const vector large_matrix_sizeB_range = { {100, 'N'}, {200, 'T'}, {500, 'C'}, {1000, 'N'}, }; Arguments gels_setup_arguments(gels_tuple tup, bool outofplace) { gels_params_A matrix_sizeA = std::get<0>(tup); gels_params_B matrix_sizeB = std::get<1>(tup); Arguments arg; arg.set("m", std::get<0>(matrix_sizeA)); arg.set("n", 
std::get<1>(matrix_sizeA)); arg.set("lda", std::get<2>(matrix_sizeA)); arg.set("ldb", std::get<3>(matrix_sizeA)); if(outofplace) arg.set("ldx", std::get<3>(matrix_sizeA)); arg.set("nrhs", std::get<0>(matrix_sizeB)); arg.set("trans", std::get<1>(matrix_sizeB)); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = std::get<4>(matrix_sizeA); return arg; } class GELS : public ::TestWithParam { protected: GELS() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = gels_setup_arguments(GetParam(), false); if(arg.peek("n") == 0 && arg.peek("nrhs") == 0) testing_gels_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_gels(arg); arg.singular = 0; testing_gels(arg); } }; class GELS_OUTOFPLACE : public ::TestWithParam { protected: GELS_OUTOFPLACE() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = gels_setup_arguments(GetParam(), true); if(arg.peek("n") == 0 && arg.peek("nrhs") == 0) testing_gels_outofplace_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); if(arg.singular == 1) testing_gels_outofplace(arg); arg.singular = 0; testing_gels_outofplace(arg); } }; // non-batch tests TEST_P(GELS, __float) { run_tests(); } TEST_P(GELS, __double) { run_tests(); } TEST_P(GELS, __float_complex) { run_tests(); } TEST_P(GELS, __double_complex) { run_tests(); } TEST_P(GELS_OUTOFPLACE, __float) { run_tests(); } TEST_P(GELS_OUTOFPLACE, __double) { run_tests(); } TEST_P(GELS_OUTOFPLACE, __float_complex) { run_tests(); } TEST_P(GELS_OUTOFPLACE, __double_complex) { run_tests(); } // batched tests TEST_P(GELS, batched__float) { run_tests(); } TEST_P(GELS, batched__double) { run_tests(); } TEST_P(GELS, batched__float_complex) { run_tests(); } TEST_P(GELS, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(GELS, strided_batched__float) { run_tests(); } TEST_P(GELS, strided_batched__double) { run_tests(); } TEST_P(GELS, strided_batched__float_complex) { run_tests(); } TEST_P(GELS, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GELS, Combine(ValuesIn(large_matrix_sizeA_range), ValuesIn(large_matrix_sizeB_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GELS, Combine(ValuesIn(matrix_sizeA_range), ValuesIn(matrix_sizeB_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GELS_OUTOFPLACE, Combine(ValuesIn(large_matrix_sizeA_range), ValuesIn(large_matrix_sizeB_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GELS_OUTOFPLACE, Combine(ValuesIn(matrix_sizeA_range), ValuesIn(matrix_sizeB_range))); rocSOLVER-rocm-5.5.1/clients/gtest/geql2_geqlf_gtest.cpp000066400000000000000000000116461436600607200231030ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_geql2_geqlf.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, int> geqlf_tuple; // each matrix_size_range is a {m, lda} // case when m = n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1}, // invalid {-1, 1}, {20, 5}, // normal (valid) samples {50, 50}, {70, 100}, {130, 130}, {150, 200}}; const vector n_size_range = { // quick return 0, // invalid -1, // normal (valid) samples 16, 20, 130, 150}; // for daily_lapack tests const vector> large_matrix_size_range = { {152, 152}, {640, 640}, {1000, 1024}, }; const vector large_n_size_range = {64, 98, 130, 220, 400}; Arguments geqlf_setup_arguments(geqlf_tuple tup) { vector matrix_size = std::get<0>(tup); int n_size = std::get<1>(tup); Arguments arg; arg.set("m", matrix_size[0]); arg.set("n", n_size); arg.set("lda", matrix_size[1]); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } template class GEQL2_GEQLF : public ::TestWithParam { protected: GEQL2_GEQLF() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = geqlf_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_geql2_geqlf_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_geql2_geqlf(arg); } }; class GEQL2 : public GEQL2_GEQLF { }; class GEQLF : public GEQL2_GEQLF { }; // non-batch tests TEST_P(GEQL2, __float) { run_tests(); } TEST_P(GEQL2, __double) { run_tests(); } TEST_P(GEQL2, __float_complex) { run_tests(); } TEST_P(GEQL2, __double_complex) { run_tests(); } TEST_P(GEQLF, __float) { run_tests(); } TEST_P(GEQLF, __double) { run_tests(); } TEST_P(GEQLF, __float_complex) { run_tests(); } TEST_P(GEQLF, __double_complex) { run_tests(); } // batched tests TEST_P(GEQL2, batched__float) { run_tests(); } TEST_P(GEQL2, batched__double) { run_tests(); } TEST_P(GEQL2, batched__float_complex) { run_tests(); } TEST_P(GEQL2, batched__double_complex) { run_tests(); } TEST_P(GEQLF, batched__float) { run_tests(); } TEST_P(GEQLF, batched__double) { run_tests(); } TEST_P(GEQLF, batched__float_complex) { run_tests(); } TEST_P(GEQLF, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(GEQL2, strided_batched__float) { run_tests(); } TEST_P(GEQL2, strided_batched__double) { run_tests(); } TEST_P(GEQL2, strided_batched__float_complex) { run_tests(); } TEST_P(GEQL2, strided_batched__double_complex) { run_tests(); } TEST_P(GEQLF, strided_batched__float) { run_tests(); } TEST_P(GEQLF, strided_batched__double) { run_tests(); } TEST_P(GEQLF, strided_batched__float_complex) { run_tests(); } TEST_P(GEQLF, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GEQL2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GEQL2, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GEQLF, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GEQLF, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); 
rocSOLVER-rocm-5.5.1/clients/gtest/geqr2_geqrf_gtest.cpp000066400000000000000000000124401436600607200231100ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_geqr2_geqrf.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, int> geqrf_tuple; // each matrix_size_range is a {m, lda} // case when m = n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1}, // invalid {-1, 1}, {20, 5}, // normal (valid) samples {50, 50}, {70, 100}, {130, 130}, {150, 200}}; const vector n_size_range = { // quick return 0, // invalid -1, // normal (valid) samples 16, 20, 130, 150}; // for daily_lapack tests const vector> large_matrix_size_range = { {152, 152}, {640, 640}, {1000, 1024}, }; const vector large_n_size_range = {64, 98, 130, 220, 400}; Arguments geqrf_setup_arguments(geqrf_tuple tup) { vector matrix_size = std::get<0>(tup); int n_size = std::get<1>(tup); Arguments arg; arg.set("m", matrix_size[0]); arg.set("n", n_size); arg.set("lda", matrix_size[1]); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } template class GEQR2_GEQRF : public ::TestWithParam { protected: GEQR2_GEQRF() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = geqrf_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_geqr2_geqrf_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_geqr2_geqrf(arg); } }; class GEQR2 : public GEQR2_GEQRF { }; class GEQRF : public GEQR2_GEQRF { }; // non-batch tests TEST_P(GEQR2, __float) { run_tests(); } TEST_P(GEQR2, __double) { run_tests(); } TEST_P(GEQR2, __float_complex) { run_tests(); } TEST_P(GEQR2, __double_complex) { run_tests(); } TEST_P(GEQRF, __float) { run_tests(); } TEST_P(GEQRF, __double) { run_tests(); } TEST_P(GEQRF, __float_complex) { run_tests(); } TEST_P(GEQRF, __double_complex) { run_tests(); } // batched tests TEST_P(GEQR2, batched__float) { run_tests(); } TEST_P(GEQR2, batched__double) { run_tests(); } TEST_P(GEQR2, batched__float_complex) { run_tests(); } TEST_P(GEQR2, batched__double_complex) { run_tests(); } TEST_P(GEQRF, batched__float) { run_tests(); } TEST_P(GEQRF, batched__double) { run_tests(); } TEST_P(GEQRF, batched__float_complex) { run_tests(); } TEST_P(GEQRF, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(GEQR2, strided_batched__float) { run_tests(); } TEST_P(GEQR2, strided_batched__double) { run_tests(); } TEST_P(GEQR2, strided_batched__float_complex) { run_tests(); } TEST_P(GEQR2, strided_batched__double_complex) { run_tests(); } TEST_P(GEQRF, strided_batched__float) { run_tests(); } TEST_P(GEQRF, strided_batched__double) { run_tests(); } TEST_P(GEQRF, strided_batched__float_complex) { run_tests(); } TEST_P(GEQRF, strided_batched__double_complex) { run_tests(); } // ptr_batched tests TEST_P(GEQRF, ptr_batched__float) { run_tests(); } TEST_P(GEQRF, ptr_batched__double) { run_tests(); } TEST_P(GEQRF, ptr_batched__float_complex) { run_tests(); } TEST_P(GEQRF, ptr_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GEQR2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GEQR2, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GEQRF, Combine(ValuesIn(large_matrix_size_range), 
ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GEQRF, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); rocSOLVER-rocm-5.5.1/clients/gtest/gerq2_gerqf_gtest.cpp000066400000000000000000000116461436600607200231170ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_gerq2_gerqf.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, int> gerqf_tuple; // each matrix_size_range is a {m, lda} // case when m = n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1}, // invalid {-1, 1}, {20, 5}, // normal (valid) samples {50, 50}, {70, 100}, {130, 130}, {150, 200}}; const vector n_size_range = { // quick return 0, // invalid -1, // normal (valid) samples 16, 20, 130, 150}; // for daily_lapack tests const vector> large_matrix_size_range = { {152, 152}, {640, 640}, {1000, 1024}, }; const vector large_n_size_range = {64, 98, 130, 220, 400}; Arguments gerqf_setup_arguments(gerqf_tuple tup) { vector matrix_size = std::get<0>(tup); int n_size = std::get<1>(tup); Arguments arg; arg.set("m", matrix_size[0]); arg.set("n", n_size); arg.set("lda", matrix_size[1]); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } template class GERQ2_GERQF : public ::TestWithParam { protected: GERQ2_GERQF() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = gerqf_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_gerq2_gerqf_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_gerq2_gerqf(arg); } }; class GERQ2 : public GERQ2_GERQF { }; class GERQF : public GERQ2_GERQF { }; // non-batch tests TEST_P(GERQ2, __float) { run_tests(); } TEST_P(GERQ2, __double) { run_tests(); } TEST_P(GERQ2, __float_complex) { run_tests(); } TEST_P(GERQ2, __double_complex) { run_tests(); } TEST_P(GERQF, __float) { run_tests(); } TEST_P(GERQF, __double) { run_tests(); } TEST_P(GERQF, __float_complex) { run_tests(); } TEST_P(GERQF, __double_complex) { run_tests(); } // batched tests TEST_P(GERQ2, batched__float) { run_tests(); } TEST_P(GERQ2, batched__double) { run_tests(); } TEST_P(GERQ2, batched__float_complex) { run_tests(); } TEST_P(GERQ2, batched__double_complex) { run_tests(); } TEST_P(GERQF, batched__float) { run_tests(); } TEST_P(GERQF, batched__double) { run_tests(); } TEST_P(GERQF, batched__float_complex) { run_tests(); } TEST_P(GERQF, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(GERQ2, strided_batched__float) { run_tests(); } TEST_P(GERQ2, strided_batched__double) { run_tests(); } TEST_P(GERQ2, strided_batched__float_complex) { run_tests(); } TEST_P(GERQ2, strided_batched__double_complex) { run_tests(); } TEST_P(GERQF, strided_batched__float) { run_tests(); } TEST_P(GERQF, strided_batched__double) { run_tests(); } TEST_P(GERQF, strided_batched__float_complex) { run_tests(); } TEST_P(GERQF, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GERQ2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GERQ2, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GERQF, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GERQF, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); 
rocSOLVER-rocm-5.5.1/clients/gtest/gesv_gtest.cpp000066400000000000000000000132751436600607200216570ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_gesv.hpp" #include "testing_gesv_outofplace.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, int> gesv_tuple; // each A_range vector is a {N, lda, ldb/ldx, singular}; // if singular = 1, then the used matrix for the tests is singular // each B_range vector is a {nrhs}; // case when N = nrhs = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_sizeA_range = { // quick return {0, 1, 1, 0}, // invalid {-1, 1, 1, 0}, {10, 2, 10, 0}, {10, 10, 2, 0}, /// normal (valid) samples {20, 20, 20, 0}, {30, 50, 30, 1}, {30, 30, 50, 0}, {50, 60, 60, 1}}; const vector matrix_sizeB_range = { // quick return 0, // invalid -1, // normal (valid) samples 10, 20, 30, }; // for daily_lapack tests const vector> large_matrix_sizeA_range = {{70, 70, 100, 0}, {192, 192, 192, 1}, {600, 700, 645, 0}, {1000, 1000, 1000, 1}, {1000, 2000, 2000, 0}}; const vector large_matrix_sizeB_range = { 100, 150, 200, 524, 1000, }; Arguments gesv_setup_arguments(gesv_tuple tup, bool outofplace) { vector matrix_sizeA = std::get<0>(tup); int matrix_sizeB = std::get<1>(tup); Arguments arg; arg.set("n", matrix_sizeA[0]); arg.set("nrhs", matrix_sizeB); arg.set("lda", matrix_sizeA[1]); arg.set("ldb", matrix_sizeA[2]); if(outofplace) arg.set("ldx", matrix_sizeA[2]); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_sizeA[3]; return arg; } class GESV : public ::TestWithParam { protected: GESV() {} virtual void SetUp() {} virtual 
void TearDown() {} template void run_tests() { Arguments arg = gesv_setup_arguments(GetParam(), false); if(arg.peek("n") == 0 && arg.peek("nrhs") == 0) testing_gesv_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_gesv(arg); arg.singular = 0; testing_gesv(arg); } }; class GESV_OUTOFPLACE : public ::TestWithParam { protected: GESV_OUTOFPLACE() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = gesv_setup_arguments(GetParam(), true); if(arg.peek("n") == 0 && arg.peek("nrhs") == 0) testing_gesv_outofplace_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_gesv_outofplace(arg); arg.singular = 0; testing_gesv_outofplace(arg); } }; // non-batch tests TEST_P(GESV, __float) { run_tests(); } TEST_P(GESV, __double) { run_tests(); } TEST_P(GESV, __float_complex) { run_tests(); } TEST_P(GESV, __double_complex) { run_tests(); } TEST_P(GESV_OUTOFPLACE, __float) { run_tests(); } TEST_P(GESV_OUTOFPLACE, __double) { run_tests(); } TEST_P(GESV_OUTOFPLACE, __float_complex) { run_tests(); } TEST_P(GESV_OUTOFPLACE, __double_complex) { run_tests(); } // batched tests TEST_P(GESV, batched__float) { run_tests(); } TEST_P(GESV, batched__double) { run_tests(); } TEST_P(GESV, batched__float_complex) { run_tests(); } TEST_P(GESV, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(GESV, strided_batched__float) { run_tests(); } TEST_P(GESV, strided_batched__double) { run_tests(); } TEST_P(GESV, strided_batched__float_complex) { run_tests(); } TEST_P(GESV, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GESV, Combine(ValuesIn(large_matrix_sizeA_range), ValuesIn(large_matrix_sizeB_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GESV, Combine(ValuesIn(matrix_sizeA_range), ValuesIn(matrix_sizeB_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GESV_OUTOFPLACE, Combine(ValuesIn(large_matrix_sizeA_range), 
ValuesIn(large_matrix_sizeB_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GESV_OUTOFPLACE, Combine(ValuesIn(matrix_sizeA_range), ValuesIn(matrix_sizeB_range))); rocSOLVER-rocm-5.5.1/clients/gtest/gesvd_gtest.cpp000066400000000000000000000132301436600607200220120ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_gesvd.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> gesvd_tuple; // each size_range vector is a {m, n, fa}; // if fa = 0 then no fast algorithm is allowed // if fa = 1 fast algorithm is used when possible // each opt_range vector is a {lda, ldu, ldv, leftsv, rightsv}; // if ldx = -1 then ldx < limit (invalid size) // if ldx = 0 then ldx = limit // if ldx = 1 then ldx > limit // if leftsv (rightsv) = 0 then overwrite singular vectors // if leftsv (rightsv) = 1 then compute singular vectors // if leftsv (rightsv) = 2 then compute all orthogonal matrix // if leftsv (rightsv) = 3 then no singular vectors are computed // case when m = n = 0 and rightsv = leftsv = 3 will also execute the bad // arguments test (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> size_range = { // quick return {0, 0, 0}, {0, 1, 0}, {1, 0, 0}, // invalid {-1, 1, 0}, {1, -1, 0}, // normal (valid) samples {1, 1, 0}, {20, 20, 0}, {40, 30, 0}, {60, 30, 0}, {60, 30, 1}, {30, 40, 0}, {30, 60, 0}, {30, 60, 1}}; const vector> opt_range = { // invalid {-1, 0, 0, 2, 2}, {0, -1, 0, 1, 2}, {0, 0, -1, 2, 1}, {0, 0, 0, 0, 0}, // normal (valid) samples {1, 1, 1, 3, 3}, {0, 0, 1, 3, 2}, {0, 1, 0, 3, 1}, {0, 1, 1, 3, 0}, {1, 0, 0, 2, 3}, {1, 0, 1, 2, 2}, {1, 1, 0, 2, 1}, {0, 0, 0, 2, 0}, {0, 0, 0, 1, 3}, {0, 0, 0, 1, 2}, {0, 0, 0, 1, 1}, 
{0, 0, 0, 1, 0}, {0, 0, 0, 0, 3}, {0, 0, 0, 0, 2}, {0, 0, 0, 0, 1}}; // for daily_lapack tests const vector> large_size_range = {{120, 100, 0}, {300, 120, 0}, {300, 120, 1}, {100, 120, 1}, {120, 300, 0}, {120, 300, 1}}; const vector> large_opt_range = {{0, 0, 0, 3, 3}, {1, 0, 0, 0, 1}, {0, 1, 0, 1, 0}, {0, 0, 1, 1, 1}, {0, 0, 0, 3, 0}, {0, 0, 0, 1, 3}, {0, 0, 0, 3, 2}}; Arguments gesvd_setup_arguments(gesvd_tuple tup) { vector size = std::get<0>(tup); vector opt = std::get<1>(tup); Arguments arg; // sizes rocblas_int m = size[0]; rocblas_int n = size[1]; arg.set("m", m); arg.set("n", n); // fast algorithm if(size[2] == 0) arg.set("fast_alg", 'I'); else arg.set("fast_alg", 'O'); // leading dimensions arg.set("lda", m + opt[0] * 10); arg.set("ldu", m + opt[1] * 10); if(opt[4] == 2) arg.set("ldv", n + opt[2] * 10); else arg.set("ldv", min(m, n) + opt[2] * 10); // vector options if(opt[3] == 0) arg.set("left_svect", 'O'); else if(opt[3] == 1) arg.set("left_svect", 'S'); else if(opt[3] == 2) arg.set("left_svect", 'A'); else arg.set("left_svect", 'N'); if(opt[4] == 0) arg.set("right_svect", 'O'); else if(opt[4] == 1) arg.set("right_svect", 'S'); else if(opt[4] == 2) arg.set("right_svect", 'A'); else arg.set("right_svect", 'N'); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class GESVD : public ::TestWithParam { protected: GESVD() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = gesvd_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0 && arg.peek("left_svect") == 'N' && arg.peek("right_svect") == 'N') testing_gesvd_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_gesvd(arg); } }; // non-batch tests TEST_P(GESVD, __float) { run_tests(); } TEST_P(GESVD, __double) { run_tests(); } TEST_P(GESVD, __float_complex) { run_tests(); } TEST_P(GESVD, __double_complex) { run_tests(); } // batched tests TEST_P(GESVD, batched__float) { run_tests(); } TEST_P(GESVD, batched__double) { run_tests(); } TEST_P(GESVD, batched__float_complex) { run_tests(); } TEST_P(GESVD, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(GESVD, strided_batched__float) { run_tests(); } TEST_P(GESVD, strided_batched__double) { run_tests(); } TEST_P(GESVD, strided_batched__float_complex) { run_tests(); } TEST_P(GESVD, strided_batched__double_complex) { run_tests(); } // daily_lapack tests normal execution with medium to large sizes INSTANTIATE_TEST_SUITE_P(daily_lapack, GESVD, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); // checkin_lapack tests normal execution with small sizes, invalid sizes, // quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, GESVD, Combine(ValuesIn(size_range), ValuesIn(opt_range))); rocSOLVER-rocm-5.5.1/clients/gtest/gesvdj_gtest.cpp000066400000000000000000000152211436600607200221660ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_gesvdj.hpp" #include "testing_gesvdj_notransv.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> gesvdj_tuple; // each size_range vector is a {m, n}; // each opt_range vector is a {lda, ldu, ldv, leftsv, rightsv}; // if ldx = -1 then ldx < limit (invalid size) // if ldx = 0 then ldx = limit // if ldx = 1 then ldx > limit // if leftsv (rightsv) = 0 then no singular vectors are computed // if leftsv (rightsv) = 1 then compute singular vectors // if leftsv (rightsv) = 2 then compute all orthogonal matrix // case when m = n = 0 and rightsv = leftsv = 0 will also execute the bad // arguments test (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> size_range = { // quick return {0, 0}, {0, 1}, {1, 0}, // invalid {-1, 1}, {1, -1}, // normal (valid) samples {1, 1}, {20, 20}, {40, 30}, {60, 30}, {30, 40}, {30, 60}, }; const vector> opt_range = { // invalid {-1, 0, 0, 0, 0}, {0, -1, 0, 0, 1}, {0, 0, -1, 1, 0}, // normal (valid) samples {1, 1, 1, 0, 0}, {0, 0, 1, 0, 1}, {0, 1, 0, 0, 2}, {0, 0, 0, 1, 0}, {0, 0, 0, 1, 1}, {0, 0, 0, 1, 2}, {1, 0, 0, 2, 0}, {1, 0, 1, 2, 1}, {1, 1, 0, 2, 2}, }; // for daily_lapack tests const vector> large_size_range = {{120, 100}, {300, 120}, {100, 120}, {120, 300}}; const vector> large_opt_range = {{0, 0, 0, 0, 0}, {1, 0, 0, 1, 1}, {0, 1, 0, 2, 0}, {0, 0, 1, 0, 2}}; Arguments gesvdj_setup_arguments(gesvdj_tuple tup, bool notransv) { vector size = std::get<0>(tup); vector opt = std::get<1>(tup); Arguments arg; // sizes rocblas_int m = size[0]; rocblas_int n = size[1]; arg.set("m", m); arg.set("n", n); // leading dimensions arg.set("lda", m + opt[0] * 10); arg.set("ldu", m + opt[1] * 10); if(notransv) arg.set("ldv", n + opt[2] * 10); else arg.set("ldv", min(m, n) + opt[2] * 10); // vector options 
if(opt[3] == 0) arg.set("left_svect", 'N'); else if(opt[3] == 1) arg.set("left_svect", 'S'); else arg.set("left_svect", 'A'); if(opt[4] == 0) arg.set("right_svect", 'N'); else if(opt[4] == 1) arg.set("right_svect", 'S'); else arg.set("right_svect", 'A'); arg.set("abstol", 0); arg.set("max_sweeps", 100); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class GESVDJ : public ::TestWithParam { protected: GESVDJ() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = gesvdj_setup_arguments(GetParam(), false); if(arg.peek("m") == 0 && arg.peek("n") == 0 && arg.peek("left_svect") == 'N' && arg.peek("right_svect") == 'N') testing_gesvdj_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); testing_gesvdj(arg); } }; class GESVDJ_NOTRANSV : public ::TestWithParam { protected: GESVDJ_NOTRANSV() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = gesvdj_setup_arguments(GetParam(), true); if(arg.peek("m") == 0 && arg.peek("n") == 0 && arg.peek("left_svect") == 'N' && arg.peek("right_svect") == 'N') testing_gesvdj_notransv_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_gesvdj_notransv(arg); } }; // non-batch tests TEST_P(GESVDJ, __float) { run_tests(); } TEST_P(GESVDJ, __double) { run_tests(); } TEST_P(GESVDJ, __float_complex) { run_tests(); } TEST_P(GESVDJ, __double_complex) { run_tests(); } TEST_P(GESVDJ_NOTRANSV, __float) { run_tests(); } TEST_P(GESVDJ_NOTRANSV, __double) { run_tests(); } TEST_P(GESVDJ_NOTRANSV, __float_complex) { run_tests(); } TEST_P(GESVDJ_NOTRANSV, __double_complex) { run_tests(); } // batched tests TEST_P(GESVDJ, batched__float) { run_tests(); } TEST_P(GESVDJ, batched__double) { run_tests(); } TEST_P(GESVDJ, batched__float_complex) { run_tests(); } TEST_P(GESVDJ, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(GESVDJ, strided_batched__float) { run_tests(); } TEST_P(GESVDJ, strided_batched__double) { run_tests(); } TEST_P(GESVDJ, strided_batched__float_complex) { run_tests(); } TEST_P(GESVDJ, strided_batched__double_complex) { run_tests(); } TEST_P(GESVDJ_NOTRANSV, strided_batched__float) { run_tests(); } TEST_P(GESVDJ_NOTRANSV, strided_batched__double) { run_tests(); } TEST_P(GESVDJ_NOTRANSV, strided_batched__float_complex) { run_tests(); } TEST_P(GESVDJ_NOTRANSV, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GESVDJ, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GESVDJ, Combine(ValuesIn(size_range), ValuesIn(opt_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GESVDJ_NOTRANSV, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GESVDJ_NOTRANSV, Combine(ValuesIn(size_range), ValuesIn(opt_range))); rocSOLVER-rocm-5.5.1/clients/gtest/gesvdx_gtest.cpp000066400000000000000000000132431436600607200222060ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_gesvdx.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> gesvdx_tuple; // each size_range vector is a {m, n, lda, ldu, ldv}; // if ldx = -1 then ldx < limit (invalid size) // if ldx = 0 then ldx = limit // if ldx = 1 then ldx > limit // each opt_range vector is a {leftsv, rightsv, rng, vl, vu, il, iu}; // if leftsv (rightsv) = 1 then compute singular vectors // if leftsv (rightsv) = 0 then no singular vectors are computed // if rng = 0, then find all singular values // if rng = 1, then find singular values in (vl, vu] // if rng = 2, then find the il-th to the iu-th singular value // case when m = n = 0, rightsv = leftsv = 0 and rng = 0 will also execute the bad // arguments test (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> size_range = { // quick return {0, 0, 0, 0, 0}, {0, 1, 0, 0, 0}, {1, 0, 0, 0, 0}, // invalid {-1, 1, 0, 0, 0}, {1, -1, 0, 0, 0}, {10, 10, -1, 0, 0}, {10, 10, 0, -1, 0}, {10, 10, 0, 0, -1}, // normal (valid) samples {1, 1, 0, 0, 0}, {20, 20, 0, 0, 0}, {40, 30, 0, 0, 0}, {30, 40, 0, 0, 0}, {30, 30, 1, 0, 0}, {60, 40, 0, 1, 0}, {40, 60, 0, 0, 1}, {50, 50, 1, 1, 1}}; const vector> opt_range = { // always invalid {0, 0, 1, 2, 1, 0, 0}, {0, 0, 1, -1, 1, 0, 0}, {0, 0, 2, 0, 0, 2, 1}, {0, 0, 2, 0, 0, 10, 80}, // valid only when n=0 {0, 0, 2, 0, 0, 1, 0}, // valid only when n>1 {0, 0, 2, 0, 0, 1, 5}, {0, 1, 2, 0, 0, 1, 15}, {1, 0, 2, 0, 0, 7, 12}, {1, 1, 2, 0, 0, 10, 15}, // always valid samples {0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 0}, {0, 1, 0, 0, 0, 0, 0}, {1, 1, 0, 0, 0, 0, 0}, {0, 0, 1, 0, 10, 0, 0}, {0, 1, 1, 5, 12, 0, 0}, {1, 0, 1, 0, 12, 0, 0}, {1, 1, 1, 12, 18, 0, 0}}; // for daily_lapack tests const vector> large_size_range = {{100, 100, 1, 0, 0}, {300, 120, 0, 0, 1}, {200, 300, 0, 0, 0}}; 
const vector> large_opt_range = {{0, 0, 1, 5, 10, 0, 0}, {1, 0, 1, 10, 20, 0, 0}, {0, 1, 2, 0, 0, 3, 10}, {1, 1, 0, 0, 0, 0, 0}}; Arguments gesvdx_setup_arguments(gesvdx_tuple tup) { vector size = std::get<0>(tup); vector opt = std::get<1>(tup); Arguments arg; // sizes rocblas_int m = size[0]; rocblas_int n = size[1]; arg.set("m", m); arg.set("n", n); // leading dimensions arg.set("lda", m + size[2] * 10); arg.set("ldu", m + size[3] * 10); arg.set("ldv", min(m, n) + size[4] * 10); // vector options if(opt[0] == 0) arg.set("left_svect", 'N'); else arg.set("left_svect", 'S'); if(opt[1] == 0) arg.set("right_svect", 'N'); else arg.set("right_svect", 'S'); // only testing standard use case/defaults for strides // ranges arg.set("srange", (opt[2] == 0 ? 'A' : (opt[2] == 1 ? 'V' : 'I'))); arg.set("vl", opt[3]); arg.set("vu", opt[4]); arg.set("il", opt[5]); arg.set("iu", opt[6]); arg.timing = 0; return arg; } class GESVDX : public ::TestWithParam { protected: GESVDX() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = gesvdx_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0 && arg.peek("left_svect") == 'N' && arg.peek("right_svect") == 'N' && arg.peek("srange") == 'A') testing_gesvdx_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_gesvdx(arg); } }; // non-batch tests TEST_P(GESVDX, __float) { run_tests(); } TEST_P(GESVDX, __double) { run_tests(); } TEST_P(GESVDX, __float_complex) { run_tests(); } TEST_P(GESVDX, __double_complex) { run_tests(); } // batched tests TEST_P(GESVDX, batched__float) { run_tests(); } TEST_P(GESVDX, batched__double) { run_tests(); } TEST_P(GESVDX, batched__float_complex) { run_tests(); } TEST_P(GESVDX, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(GESVDX, strided_batched__float) { run_tests(); } TEST_P(GESVDX, strided_batched__double) { run_tests(); } TEST_P(GESVDX, strided_batched__float_complex) { run_tests(); } TEST_P(GESVDX, strided_batched__double_complex) { run_tests(); } // daily_lapack tests normal execution with medium to large sizes INSTANTIATE_TEST_SUITE_P(daily_lapack, GESVDX, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); // checkin_lapack tests normal execution with small sizes, invalid sizes, // quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, GESVDX, Combine(ValuesIn(size_range), ValuesIn(opt_range))); rocSOLVER-rocm-5.5.1/clients/gtest/getf2_getrf_gtest.cpp000066400000000000000000000215411436600607200231040ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_getf2_getrf.hpp" #include "testing_getf2_getrf_npvt.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, int> getrf_tuple; // each matrix_size_range vector is a {m, lda, singular} // if singular = 1, then the used matrix for the tests is singular // case when m = n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 0}, // invalid {-1, 1, 0}, {20, 5, 0}, // normal (valid) samples {32, 32, 0}, {50, 50, 1}, {70, 100, 0}}; const vector n_size_range = { // quick return 0, // invalid -1, // normal (valid) samples 16, 20, 40, 100, }; // for daily_lapack tests const vector> large_matrix_size_range = { {192, 192, 0}, {640, 640, 1}, {1000, 1024, 0}, }; const vector large_n_size_range = { 45, 64, 520, 1024, 2000, }; Arguments getrf_setup_arguments(getrf_tuple tup) { vector matrix_size = std::get<0>(tup); int n_size = std::get<1>(tup); Arguments arg; arg.set("m", matrix_size[0]); arg.set("n", n_size); arg.set("lda", matrix_size[1]); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_size[2]; return arg; } template class GETF2_GETRF : public ::TestWithParam { protected: GETF2_GETRF() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = getrf_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_getf2_getrf_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); if(arg.singular == 1) testing_getf2_getrf(arg); arg.singular = 0; testing_getf2_getrf(arg); } }; template class GETF2_GETRF_NPVT : public ::TestWithParam { protected: GETF2_GETRF_NPVT() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = getrf_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_getf2_getrf_npvt_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_getf2_getrf_npvt(arg); arg.singular = 0; testing_getf2_getrf_npvt(arg); } }; class GETF2 : public GETF2_GETRF { }; class GETRF : public GETF2_GETRF { }; class GETF2_NPVT : public GETF2_GETRF_NPVT { }; class GETRF_NPVT : public GETF2_GETRF_NPVT { }; // non-batch tests TEST_P(GETF2_NPVT, __float) { run_tests(); } TEST_P(GETF2_NPVT, __double) { run_tests(); } TEST_P(GETF2_NPVT, __float_complex) { run_tests(); } TEST_P(GETF2_NPVT, __double_complex) { run_tests(); } TEST_P(GETRF_NPVT, __float) { run_tests(); } TEST_P(GETRF_NPVT, __double) { run_tests(); } TEST_P(GETRF_NPVT, __float_complex) { run_tests(); } TEST_P(GETRF_NPVT, __double_complex) { run_tests(); } TEST_P(GETF2, __float) { run_tests(); } TEST_P(GETF2, __double) { run_tests(); } TEST_P(GETF2, __float_complex) { run_tests(); } TEST_P(GETF2, __double_complex) { run_tests(); } TEST_P(GETRF, __float) { run_tests(); } TEST_P(GETRF, __double) { run_tests(); } TEST_P(GETRF, __float_complex) { run_tests(); } TEST_P(GETRF, __double_complex) { run_tests(); } // batched tests TEST_P(GETF2_NPVT, batched__float) { run_tests(); } TEST_P(GETF2_NPVT, batched__double) { run_tests(); } TEST_P(GETF2_NPVT, batched__float_complex) { run_tests(); } TEST_P(GETF2_NPVT, batched__double_complex) { run_tests(); } TEST_P(GETRF_NPVT, batched__float) { run_tests(); } TEST_P(GETRF_NPVT, batched__double) { run_tests(); } TEST_P(GETRF_NPVT, batched__float_complex) { run_tests(); } TEST_P(GETRF_NPVT, batched__double_complex) { run_tests(); } TEST_P(GETF2, 
batched__float) { run_tests(); } TEST_P(GETF2, batched__double) { run_tests(); } TEST_P(GETF2, batched__float_complex) { run_tests(); } TEST_P(GETF2, batched__double_complex) { run_tests(); } TEST_P(GETRF, batched__float) { run_tests(); } TEST_P(GETRF, batched__double) { run_tests(); } TEST_P(GETRF, batched__float_complex) { run_tests(); } TEST_P(GETRF, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(GETF2_NPVT, strided_batched__float) { run_tests(); } TEST_P(GETF2_NPVT, strided_batched__double) { run_tests(); } TEST_P(GETF2_NPVT, strided_batched__float_complex) { run_tests(); } TEST_P(GETF2_NPVT, strided_batched__double_complex) { run_tests(); } TEST_P(GETRF_NPVT, strided_batched__float) { run_tests(); } TEST_P(GETRF_NPVT, strided_batched__double) { run_tests(); } TEST_P(GETRF_NPVT, strided_batched__float_complex) { run_tests(); } TEST_P(GETRF_NPVT, strided_batched__double_complex) { run_tests(); } TEST_P(GETF2, strided_batched__float) { run_tests(); } TEST_P(GETF2, strided_batched__double) { run_tests(); } TEST_P(GETF2, strided_batched__float_complex) { run_tests(); } TEST_P(GETF2, strided_batched__double_complex) { run_tests(); } TEST_P(GETRF, strided_batched__float) { run_tests(); } TEST_P(GETRF, strided_batched__double) { run_tests(); } TEST_P(GETRF, strided_batched__float_complex) { run_tests(); } TEST_P(GETRF, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GETF2_NPVT, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GETF2_NPVT, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GETRF_NPVT, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GETRF_NPVT, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GETF2, Combine(ValuesIn(large_matrix_size_range), 
ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GETF2, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, GETRF, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GETRF, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); rocSOLVER-rocm-5.5.1/clients/gtest/getri_gtest.cpp000066400000000000000000000223221436600607200220160ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_getri.hpp" #include "testing_getri_npvt.hpp" #include "testing_getri_npvt_outofplace.hpp" #include "testing_getri_outofplace.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef vector getri_tuple; // each matrix_size_range vector is a {n, lda/ldc, singular} // if singular = 1, then the used matrix for the tests is singular // case when n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 0}, // invalid {-1, 1, 0}, {20, 5, 0}, // normal (valid) samples {32, 32, 0}, {50, 50, 1}, {70, 100, 0}, {100, 150, 1}}; // for daily_lapack tests const vector> large_matrix_size_range = {{192, 192, 1}, {500, 600, 1}, {640, 640, 0}, {1000, 1024, 0}, {1200, 1230, 0}}; Arguments getri_setup_arguments(getri_tuple tup, bool outofplace) { Arguments arg; arg.set("n", tup[0]); arg.set("lda", tup[1]); if(outofplace) arg.set("ldc", tup[1]); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = tup[2]; return arg; } class GETRI : public ::TestWithParam { protected: GETRI() {} virtual void SetUp() {} virtual void 
TearDown() {} template void run_tests() { Arguments arg = getri_setup_arguments(GetParam(), false); if(arg.peek("n") == 0) testing_getri_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_getri(arg); arg.singular = 0; testing_getri(arg); } }; class GETRI_NPVT : public ::TestWithParam { protected: GETRI_NPVT() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = getri_setup_arguments(GetParam(), false); if(arg.peek("n") == 0) testing_getri_npvt_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_getri_npvt(arg); arg.singular = 0; testing_getri_npvt(arg); } }; class GETRI_OUTOFPLACE : public ::TestWithParam { protected: GETRI_OUTOFPLACE() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = getri_setup_arguments(GetParam(), true); if(arg.peek("n") == 0) testing_getri_outofplace_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_getri_outofplace(arg); arg.singular = 0; testing_getri_outofplace(arg); } }; class GETRI_NPVT_OUTOFPLACE : public ::TestWithParam { protected: GETRI_NPVT_OUTOFPLACE() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = getri_setup_arguments(GetParam(), true); if(arg.peek("n") == 0) testing_getri_npvt_outofplace_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); if(arg.singular == 1) testing_getri_npvt_outofplace(arg); arg.singular = 0; testing_getri_npvt_outofplace(arg); } }; // non-batch tests TEST_P(GETRI, __float) { run_tests(); } TEST_P(GETRI, __double) { run_tests(); } TEST_P(GETRI, __float_complex) { run_tests(); } TEST_P(GETRI, __double_complex) { run_tests(); } TEST_P(GETRI_NPVT, __float) { run_tests(); } TEST_P(GETRI_NPVT, __double) { run_tests(); } TEST_P(GETRI_NPVT, __float_complex) { run_tests(); } TEST_P(GETRI_NPVT, __double_complex) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, __float) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, __double) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, __float_complex) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, __double_complex) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, __float) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, __double) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, __float_complex) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, __double_complex) { run_tests(); } // batched tests TEST_P(GETRI, batched__float) { run_tests(); } TEST_P(GETRI, batched__double) { run_tests(); } TEST_P(GETRI, batched__float_complex) { run_tests(); } TEST_P(GETRI, batched__double_complex) { run_tests(); } TEST_P(GETRI_NPVT, batched__float) { run_tests(); } TEST_P(GETRI_NPVT, batched__double) { run_tests(); } TEST_P(GETRI_NPVT, batched__float_complex) { run_tests(); } TEST_P(GETRI_NPVT, batched__double_complex) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, batched__float) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, batched__double) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, batched__float_complex) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, batched__double_complex) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, batched__float) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, batched__double) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, batched__float_complex) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(GETRI, 
strided_batched__float) { run_tests(); } TEST_P(GETRI, strided_batched__double) { run_tests(); } TEST_P(GETRI, strided_batched__float_complex) { run_tests(); } TEST_P(GETRI, strided_batched__double_complex) { run_tests(); } TEST_P(GETRI_NPVT, strided_batched__float) { run_tests(); } TEST_P(GETRI_NPVT, strided_batched__double) { run_tests(); } TEST_P(GETRI_NPVT, strided_batched__float_complex) { run_tests(); } TEST_P(GETRI_NPVT, strided_batched__double_complex) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, strided_batched__float) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, strided_batched__double) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, strided_batched__float_complex) { run_tests(); } TEST_P(GETRI_OUTOFPLACE, strided_batched__double_complex) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, strided_batched__float) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, strided_batched__double) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, strided_batched__float_complex) { run_tests(); } TEST_P(GETRI_NPVT_OUTOFPLACE, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, GETRI, ValuesIn(large_matrix_size_range)); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GETRI, ValuesIn(matrix_size_range)); INSTANTIATE_TEST_SUITE_P(daily_lapack, GETRI_NPVT, ValuesIn(large_matrix_size_range)); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GETRI_NPVT, ValuesIn(matrix_size_range)); INSTANTIATE_TEST_SUITE_P(daily_lapack, GETRI_OUTOFPLACE, ValuesIn(large_matrix_size_range)); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GETRI_OUTOFPLACE, ValuesIn(matrix_size_range)); INSTANTIATE_TEST_SUITE_P(daily_lapack, GETRI_NPVT_OUTOFPLACE, ValuesIn(large_matrix_size_range)); INSTANTIATE_TEST_SUITE_P(checkin_lapack, GETRI_NPVT_OUTOFPLACE, ValuesIn(matrix_size_range)); rocSOLVER-rocm-5.5.1/clients/gtest/getrs_gtest.cpp000066400000000000000000000103541436600607200220320ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 
2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_getrs.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> getrs_tuple; // each A_range vector is a {N, lda, ldb}; // each B_range vector is a {nrhs, trans}; // if trans = 0 then no transpose // if trans = 1 then transpose // if trans = 2 then conjugate transpose // case when N = nrhs = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_sizeA_range = { // quick return {0, 1, 1}, // invalid {-1, 1, 1}, {10, 2, 10}, {10, 10, 2}, /// normal (valid) samples {20, 20, 20}, {30, 50, 30}, {30, 30, 50}, {50, 60, 60}}; const vector> matrix_sizeB_range = { // quick return {0, 0}, // invalid {-1, 0}, // normal (valid) samples {10, 0}, {20, 1}, {30, 2}, }; // for daily_lapack tests const vector> large_matrix_sizeA_range = {{70, 70, 100}, {192, 192, 192}, {600, 700, 645}, {1000, 1000, 1000}, {1000, 2000, 2000}}; const vector> large_matrix_sizeB_range = { {100, 0}, {150, 0}, {200, 1}, {524, 2}, {1000, 2}, }; Arguments getrs_setup_arguments(getrs_tuple tup) { vector matrix_sizeA = std::get<0>(tup); vector matrix_sizeB = std::get<1>(tup); Arguments arg; arg.set("n", matrix_sizeA[0]); arg.set("nrhs", matrix_sizeB[0]); arg.set("lda", matrix_sizeA[1]); arg.set("ldb", matrix_sizeA[2]); if(matrix_sizeB[1] == 0) arg.set("trans", 'N'); else if(matrix_sizeB[1] == 1) arg.set("trans", 'T'); else arg.set("trans", 'C'); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class GETRS : public ::TestWithParam { protected: GETRS() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = getrs_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("nrhs") == 0) 
testing_getrs_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); testing_getrs(arg); } }; // non-batch tests TEST_P(GETRS, __float) { run_tests(); } TEST_P(GETRS, __double) { run_tests(); } TEST_P(GETRS, __float_complex) { run_tests(); } TEST_P(GETRS, __double_complex) { run_tests(); } // batched tests TEST_P(GETRS, batched__float) { run_tests(); } TEST_P(GETRS, batched__double) { run_tests(); } TEST_P(GETRS, batched__float_complex) { run_tests(); } TEST_P(GETRS, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(GETRS, strided_batched__float) { run_tests(); } TEST_P(GETRS, strided_batched__double) { run_tests(); } TEST_P(GETRS, strided_batched__float_complex) { run_tests(); } TEST_P(GETRS, strided_batched__double_complex) { run_tests(); } // daily_lapack tests normal execution with medium to large sizes INSTANTIATE_TEST_SUITE_P(daily_lapack, GETRS, Combine(ValuesIn(large_matrix_sizeA_range), ValuesIn(large_matrix_sizeB_range))); // checkin_lapack tests normal execution with small sizes, invalid sizes, // quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, GETRS, Combine(ValuesIn(matrix_sizeA_range), ValuesIn(matrix_sizeB_range))); rocSOLVER-rocm-5.5.1/clients/gtest/labrd_gtest.cpp000066400000000000000000000057531436600607200220010ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_labrd.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> labrd_tuple; // each matrix_size_range is a {m, lda, ldx} // each n_size_range is a {n, ldy, nb} // case when m = n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return (if nb = 0, else invalid) {0, 1, 1}, // invalid {1, 1, 0}, {-1, 1, 1}, {1, 1, -1}, {20, 5, 20}, {20, 20, 5}, // normal (valid) samples {50, 50, 50}, {70, 100, 70}, {130, 130, 150}, {150, 200, 200}}; const vector> n_size_range = { // quick return {0, 1, 0}, {1, 1, 0}, // invalid {-1, 1, 1}, {1, 1, -1}, {20, 5, 20}, {20, 20, 25}, // normal (valid) samples {16, 16, 10}, {20, 30, 10}, {120, 120, 30}, {150, 200, 30}}; // for daily_lapack tests const vector> large_matrix_size_range = { {152, 152, 152}, {640, 640, 656}, {1000, 1024, 1000}, }; const vector> large_n_size_range = {{64, 64, 60}, {98, 98, 60}, {130, 130, 100}, {220, 240, 100}, {400, 450, 100}}; Arguments labrd_setup_arguments(labrd_tuple tup) { vector matrix_size = std::get<0>(tup); vector n_size = std::get<1>(tup); Arguments arg; arg.set("m", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("ldx", matrix_size[2]); arg.set("n", n_size[0]); arg.set("ldy", n_size[1]); arg.set("k", n_size[2]); arg.timing = 0; return arg; } class LABRD : public ::TestWithParam { protected: LABRD() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = labrd_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_labrd_bad_arg(); testing_labrd(arg); } }; // non-batch tests TEST_P(LABRD, __float) { run_tests(); } TEST_P(LABRD, __double) { run_tests(); } TEST_P(LABRD, __float_complex) { run_tests(); } 
TEST_P(LABRD, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, LABRD, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, LABRD, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); rocSOLVER-rocm-5.5.1/clients/gtest/lacgv_gtest.cpp000066400000000000000000000033751436600607200220070ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_lacgv.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef vector lacgv_tuple; // each range is a {n,inc} // case when n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> range = { // quick return {0, 1}, // invalid {-1, 1}, {1, 0}, // normal (valid) samples {10, 1}, {10, -1}, {20, 2}, {30, 3}, {30, -3}}; // for daily_lapack tests const vector> large_range = {{192, 10}, {192, -10}, {250, 20}, {500, 30}, {1500, 40}, {1500, -40}}; Arguments lacgv_setup_arguments(lacgv_tuple tup) { Arguments arg; arg.set("n", tup[0]); arg.set("incx", tup[1]); return arg; } class LACGV : public ::TestWithParam { protected: LACGV() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = lacgv_setup_arguments(GetParam()); if(arg.peek("n") == 0) testing_lacgv_bad_arg(); testing_lacgv(arg); } }; // non-batch tests TEST_P(LACGV, __float_complex) { run_tests(); } TEST_P(LACGV, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, LACGV, ValuesIn(large_range)); INSTANTIATE_TEST_SUITE_P(checkin_lapack, LACGV, ValuesIn(range)); 
rocSOLVER-rocm-5.5.1/clients/gtest/larf_gtest.cpp000066400000000000000000000053071436600607200216340ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_larf.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> larf_tuple; // each size_range vector is a {M,N,lda} // each incx_range vector is a {incx,s} // if s = 0, then side = 'L' // if s = 1, then side = 'R' // case when M == 0 and incx == 0 also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> incx_range = { // invalid {0, 0}, // normal (valid) samples {-10, 0}, {-5, 1}, {-1, 0}, {1, 1}, {5, 0}, {10, 1}}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 10, 1}, {10, 0, 10}, // invalid {-1, 10, 1}, {10, -1, 10}, {10, 10, 5}, // normal (valid) samples {12, 20, 12}, {20, 15, 20}, {35, 35, 50}}; // for daily_lapack tests const vector> large_matrix_size_range = {{192, 192, 192}, {640, 300, 700}, {1024, 2000, 1024}, {2547, 2547, 2550}}; Arguments larf_setup_arguments(larf_tuple tup) { vector matrix_size = std::get<0>(tup); vector inc = std::get<1>(tup); Arguments arg; arg.set("m", matrix_size[0]); arg.set("n", matrix_size[1]); arg.set("lda", matrix_size[2]); arg.set("incx", inc[0]); arg.set("side", inc[1] == 1 ? 
'R' : 'L'); arg.timing = 0; return arg; } class LARF : public ::TestWithParam { protected: LARF() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = larf_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("incx") == 0) testing_larf_bad_arg(); testing_larf(arg); } }; // non-batch tests TEST_P(LARF, __float) { run_tests(); } TEST_P(LARF, __double) { run_tests(); } TEST_P(LARF, __float_complex) { run_tests(); } TEST_P(LARF, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, LARF, Combine(ValuesIn(large_matrix_size_range), ValuesIn(incx_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, LARF, Combine(ValuesIn(matrix_size_range), ValuesIn(incx_range))); rocSOLVER-rocm-5.5.1/clients/gtest/larfb_gtest.cpp000066400000000000000000000077311436600607200220010ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_larfb.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> larfb_tuple; // each matrix_size vector is a {M,N,lda,s,ldv,st} // if s = 0, then side = 'L' // if s = 1, then side = 'R' // if st = 0, then storev = 'C' // if st = 1, then storev = 'R' // each reflector_size vector is a {K,ldt,d,t} // if d = 0, then direct = 'F' // if d = 1, then direct = 'B' // if t = 0, then trans = 'N' // if t = 1, then trans = 'T' // if t = 2, then trans = 'C' // case when m = 0 and k = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 1, 0, 1, 0}, {1, 0, 1, 0, 1, 0}, // invalid {-1, 1, 1, 0, 1, 0}, {1, -1, 1, 0, 1, 0}, {15, 15, 5, 0, 15, 0}, {12, 5, 12, 0, 5, 0}, {5, 12, 
15, 1, 5, 0}, {15, 10, 15, 0, 5, 1}, // normal (valid) samples {15, 15, 15, 0, 15, 0}, {18, 20, 20, 1, 20, 0}, {20, 18, 20, 0, 20, 0}, {20, 30, 20, 1, 30, 0}, {50, 35, 50, 0, 50, 0}, {40, 40, 40, 0, 15, 1}, {40, 40, 40, 1, 25, 1}}; const vector> reflector_size_range = { // invalid {0, 1, 0, 0}, {5, 1, 0, 0}, // normal (valid) samples {7, 7, 0, 1}, {10, 10, 1, 1}, {12, 70, 0, 2}, {15, 15, 1, 2}}; // for daily_lapack tests const vector> large_matrix_size_range = {{192, 192, 192, 0, 192, 0}, {640, 640, 640, 1, 700, 0}, {640, 640, 700, 0, 640, 0}, {840, 1024, 840, 1, 1024, 0}, {2547, 1980, 2547, 0, 2547, 0}, {200, 200, 220, 0, 100, 1}, {240, 300, 240, 1, 100, 1}, {600, 200, 600, 1, 100, 1}}; const vector> large_reflector_size_range = {{35, 35, 0, 1}, {50, 70, 0, 0}, {85, 85, 1, 1}, {100, 150, 1, 0}, {100, 150, 0, 2}, {100, 150, 1, 2}}; Arguments larfb_setup_arguments(larfb_tuple tup) { vector order_size = std::get<0>(tup); vector reflector_size = std::get<1>(tup); Arguments arg; arg.set("m", order_size[0]); arg.set("n", order_size[1]); arg.set("lda", order_size[2]); arg.set("side", order_size[3] == 0 ? 'L' : 'R'); arg.set("ldv", order_size[4]); arg.set("storev", order_size[5] == 1 ? 'R' : 'C'); arg.set("k", reflector_size[0]); arg.set("ldt", reflector_size[1]); arg.set("direct", reflector_size[2] == 1 ? 'B' : 'F'); arg.set("trans", reflector_size[3] == 0 ? 'N' : (reflector_size[3] == 1 ? 
'T' : 'C')); arg.timing = 0; return arg; } class LARFB : public ::TestWithParam { protected: LARFB() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = larfb_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("k") == 0) testing_larfb_bad_arg(); testing_larfb(arg); } }; // non-batch tests TEST_P(LARFB, __float) { run_tests(); } TEST_P(LARFB, __double) { run_tests(); } TEST_P(LARFB, __float_complex) { run_tests(); } TEST_P(LARFB, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, LARFB, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_reflector_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, LARFB, Combine(ValuesIn(matrix_size_range), ValuesIn(reflector_size_range))); rocSOLVER-rocm-5.5.1/clients/gtest/larfg_gtest.cpp000066400000000000000000000041711436600607200220010ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_larfg.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple larfg_tuple; // case when n = 0 and incx = 0 also execute the bad arguments test // (null handle, null pointers and invalid values) const vector incx_range = { // invalid -1, 0, // normal (valid) samples 1, 5, 8, 10, }; // for checkin_lapack tests const vector n_size_range = { // quick return 0, // invalid -1, // normal (valid) samples 1, 12, 20, 35, }; // for daily_lapack tests const vector large_n_size_range = { 192, 640, 1024, 2547, }; Arguments larfg_setup_arguments(larfg_tuple tup) { int n_size = std::get<0>(tup); int inc = std::get<1>(tup); Arguments arg; arg.set("n", n_size); arg.set("incx", inc); arg.timing = 0; return arg; } class LARFG : public ::TestWithParam { protected: LARFG() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = larfg_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("incx") == 0) testing_larfg_bad_arg(); testing_larfg(arg); } }; // non-batch tests TEST_P(LARFG, __float) { run_tests(); } TEST_P(LARFG, __double) { run_tests(); } TEST_P(LARFG, __float_complex) { run_tests(); } TEST_P(LARFG, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, LARFG, Combine(ValuesIn(large_n_size_range), ValuesIn(incx_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, LARFG, Combine(ValuesIn(n_size_range), ValuesIn(incx_range))); rocSOLVER-rocm-5.5.1/clients/gtest/larft_gtest.cpp000066400000000000000000000060151436600607200220150ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_larft.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> larft_tuple; // each order_size_range vector is {N,ldv,s} // if s = 0, then storev = 'C' // if s = 1, then storev = 'R' // each reflector_size_range is {K,ldt,d} // if d = 0, then direct = 'F' // if d = 1, then direct = 'B' // case when n == 0 and k == 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> order_size_range = { // quick return {0, 1, 0}, // invalid {-1, 1, 0}, {10, 5, 0}, {10, 3, 1}, // normal (valid) samples {15, 15, 0}, {20, 20, 1}, {35, 50, 0}}; const vector> reflector_size_range = { // invalid {0, 1, 0}, {5, 1, 0}, // normal (valid) samples {5, 5, 0}, {10, 20, 1}, {15, 15, 0}}; // for daily_lapack tests const vector> large_order_size_range = {{192, 192, 0}, {640, 75, 1}, {1024, 1200, 0}, {2048, 100, 1}}; const vector> large_reflector_size_range = {{15, 15, 0}, {25, 40, 1}, {45, 45, 0}, {60, 70, 1}, {75, 75, 0}}; Arguments larft_setup_arguments(larft_tuple tup) { vector order_size = std::get<0>(tup); vector reflector_size = std::get<1>(tup); Arguments arg; arg.set("n", order_size[0]); arg.set("ldv", order_size[1]); arg.set("storev", order_size[2] == 1 ? 'R' : 'C'); arg.set("k", reflector_size[0]); arg.set("ldt", reflector_size[1]); arg.set("direct", reflector_size[2] == 1 ? 
'B' : 'F'); arg.timing = 0; return arg; } class LARFT : public ::TestWithParam { protected: LARFT() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = larft_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("k") == 0) testing_larft_bad_arg(); testing_larft(arg); } }; // non-batch tests TEST_P(LARFT, __float) { run_tests(); } TEST_P(LARFT, __double) { run_tests(); } TEST_P(LARFT, __float_complex) { run_tests(); } TEST_P(LARFT, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, LARFT, Combine(ValuesIn(large_order_size_range), ValuesIn(large_reflector_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, LARFT, Combine(ValuesIn(order_size_range), ValuesIn(reflector_size_range))); rocSOLVER-rocm-5.5.1/clients/gtest/laswp_gtest.cpp000066400000000000000000000051331436600607200220330ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_laswp.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> laswp_tuple; // each range1 vector is a {n,lda} // each range2 vector is a {k1,k2,inc} // case when n = 0, k1 = 1 and k2 = 3 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> range1 = { // quick return {0, 1}, // invalid {-1, 1}, {10, 0}, // normal (valid) samples {10, 100}, {20, 100}, {30, 100}}; const vector> range2 = { // invalid {0, 1, 1}, {1, 0, 1}, {1, 2, 0}, {2, 1, 1}, // normal (valid) samples {1, 3, 1}, {3, 5, 2}, {5, 10, -1}, {3, 12, -2}}; // for daily_lapack tests const vector> large_range1 = {{192, 100}, {250, 100}, {500, 100}, {1500, 100}}; const vector> large_range2 = {{1, 50, 1}, {5, 60, 2}, {3, 70, -1}, {20, 100, -2}}; Arguments laswp_setup_arguments(laswp_tuple tup) { vector matrix_size = std::get<0>(tup); vector pivots = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("k1", pivots[0]); arg.set("k2", pivots[1]); arg.set("incx", pivots[2]); arg.timing = 0; return arg; } class LASWP : public ::TestWithParam { protected: LASWP() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = laswp_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("k1") == 1 && arg.peek("k2") == 3) testing_laswp_bad_arg(); testing_laswp(arg); } }; // non-batch tests TEST_P(LASWP, __float) { run_tests(); } TEST_P(LASWP, __double) { run_tests(); } TEST_P(LASWP, __float_complex) { run_tests(); } TEST_P(LASWP, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, LASWP, Combine(ValuesIn(large_range1), ValuesIn(large_range2))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, LASWP, Combine(ValuesIn(range1), 
ValuesIn(range2))); rocSOLVER-rocm-5.5.1/clients/gtest/lasyf_gtest.cpp000066400000000000000000000056701436600607200220310ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_lasyf.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> lasyf_tuple; // each matrix_size_range is a {n, lda, singular} // if singular = 1, then the used matrix for the tests is singular // each op_range is a {nb, ul} // if ul = 0, then uplo = 'L' // if ul = 1, then uplo = 'U' // case when n = 0 and nb = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 0}, // invalid {-1, 1, 0}, {20, 5, 0}, {20, 20, 0}, // normal (valid) samples {35, 50, 0}, {70, 100, 1}, {130, 130, 0}, {150, 150, 1}}; const vector> op_range = { // quick return {0, 0}, // invalid {-1, 0}, {180, 0}, // normal (valid) samples {10, 0}, {25, 1}, {30, 0}, {30, 1}}; // for daily_lapack tests const vector> large_matrix_size_range = {{152, 152, 1}, {640, 640, 0}, {1000, 1024, 1}}; const vector> large_op_range = {{64, 0}, {98, 1}, {130, 0}, {150, 1}}; Arguments lasyf_setup_arguments(lasyf_tuple tup) { vector matrix_size = std::get<0>(tup); vector op_size = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("nb", op_size[0]); arg.set("uplo", op_size[1] ? 
'U' : 'L'); arg.timing = 0; arg.singular = matrix_size[2]; return arg; } class LASYF : public ::TestWithParam { protected: LASYF() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = lasyf_setup_arguments(GetParam()); if(arg.peek("nb") == 0 && arg.peek("n") == 0) testing_lasyf_bad_arg(); if(arg.singular == 1) testing_lasyf(arg); arg.singular = 0; testing_lasyf(arg); } }; // non-batch tests TEST_P(LASYF, __float) { run_tests(); } TEST_P(LASYF, __double) { run_tests(); } TEST_P(LASYF, __float_complex) { run_tests(); } TEST_P(LASYF, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, LASYF, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, LASYF, Combine(ValuesIn(matrix_size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/latrd_gtest.cpp000066400000000000000000000054601436600607200220160ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_latrd.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> latrd_tuple; // each matrix_size_range is a {n, lda, ldw} // each op_range is a {k, ul} // if ul = 0, then uplo = 'L' // if ul = 1, then uplo = 'U' // case when n = 0 and k = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 1}, // invalid {-1, 1, 1}, {20, 5, 20}, {20, 20, 5}, // normal (valid) samples {35, 50, 50}, {70, 100, 70}, {130, 130, 150}, {150, 150, 150}}; const vector> op_range = { // quick return {0, 0}, // invalid {-1, 0}, {180, 0}, // normal (valid) samples {10, 0}, {25, 1}, {30, 0}, {30, 1}}; // for daily_lapack tests const vector> large_matrix_size_range = {{152, 152, 152}, {640, 640, 656}, {1000, 1024, 1000}}; const vector> large_op_range = {{64, 0}, {98, 1}, {130, 0}, {150, 1}}; Arguments latrd_setup_arguments(latrd_tuple tup) { vector matrix_size = std::get<0>(tup); vector op_size = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("ldw", matrix_size[2]); arg.set("k", op_size[0]); arg.set("uplo", op_size[1] ? 
'U' : 'L'); arg.timing = 0; return arg; } class LATRD : public ::TestWithParam { protected: LATRD() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = latrd_setup_arguments(GetParam()); if(arg.peek("k") == 0 && arg.peek("n") == 0) testing_latrd_bad_arg(); testing_latrd(arg); } }; // non-batch tests TEST_P(LATRD, __float) { run_tests(); } TEST_P(LATRD, __double) { run_tests(); } TEST_P(LATRD, __float_complex) { run_tests(); } TEST_P(LATRD, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, LATRD, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, LATRD, Combine(ValuesIn(matrix_size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/lauum_gtest.cpp000066400000000000000000000040771436600607200220360ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_lauum.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple> lauum_tuple; // each range is a {n,lda} // for checkin_lapack tests const vector> size_range = { // quick return {0, 1}, // invalid {-1, 1}, {10, 0}, // normal (valid) samples {3, 3}, {10, 10}, {20, 20}, {30, 30}, {10, 20}, {20, 30}}; const vector> large_size_range = {{100, 100}, {200, 200}}; const vector uplo_range = {'L', 'U'}; Arguments lauum_setup_arguments(lauum_tuple tup) { char uplo = std::get<0>(tup); vector size = std::get<1>(tup); Arguments arg; arg.set("n", size[0]); arg.set("lda", size[1]); arg.set("uplo", uplo); arg.timing = 0; return arg; } class LAUUM : public ::TestWithParam { protected: LAUUM() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = lauum_setup_arguments(GetParam()); if((arg.peek("n") == 0) && (arg.peek("uplo") == 'L')) testing_lauum_bad_arg(); testing_lauum(arg); } }; // non-batch tests TEST_P(LAUUM, __float) { run_tests(); } TEST_P(LAUUM, __double) { run_tests(); } /* TEST_P(LAUUM, __float_complex) { run_tests(); } TEST_P(LAUUM, __double_complex) { run_tests(); } */ INSTANTIATE_TEST_SUITE_P(daily_lapack, LAUUM, Combine(ValuesIn(uplo_range), ValuesIn(large_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, LAUUM, Combine(ValuesIn(uplo_range), ValuesIn(size_range))); rocSOLVER-rocm-5.5.1/clients/gtest/logging_gtest.cpp000066400000000000000000000402561436600607200223400ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include #if __has_include() #include namespace fs = std::filesystem; #else #include namespace fs = std::experimental::filesystem; #endif #include #include #include #include #include #include #include #include "client_environment_helpers.hpp" #include "clientcommon.hpp" using ::testing::Matcher; using ::testing::MatchesRegex; using ::testing::UnitTest; class checkin_misc_LOGGING : public ::testing::Test { protected: void SetUp() override { fs::path temp_dir = fs::temp_directory_path(); std::string test_name = UnitTest::GetInstance()->current_test_info()->name(); log_filepath = temp_dir / fmt::format("{}.{}.log", test_name, nondeterministic_value()); fs::path nonexistent_dirpath = temp_dir / fmt::format("nonexistent_{}", test_name); ASSERT_FALSE(fs::exists(nonexistent_dirpath)); invalid_log_filepath = nonexistent_dirpath / "invalid.log"; ASSERT_EQ(hipMalloc(&dA, sizeof(double) * stA * bc), hipSuccess); ASSERT_EQ(hipMalloc(&dP, sizeof(rocblas_int) * stP * bc), hipSuccess); ASSERT_EQ(hipMalloc(&dinfo, sizeof(rocblas_int) * bc), hipSuccess); } void TearDown() override { if(fs::exists(log_filepath)) { if(HasFailure() && std::getenv("ROCSOLVER_TEST_DEBUG")) fmt::print(stderr, "ROCSOLVER_TEST_DEBUG is set so {} was not removed.\n", log_filepath); else EXPECT_TRUE(fs::remove(log_filepath)); } EXPECT_EQ(hipFree(dA), hipSuccess); EXPECT_EQ(hipFree(dP), hipSuccess); EXPECT_EQ(hipFree(dinfo), hipSuccess); } unsigned int nondeterministic_value() { return rd(); } std::random_device rd; fs::path log_filepath; fs::path invalid_log_filepath; double* dA; rocblas_int *dP, *dinfo; const rocblas_int m = 25; const rocblas_int n = 25; const rocblas_int lda = m; const rocblas_stride stA = lda * n; const rocblas_stride stP = n; const rocblas_int bc = 3; }; static void verify_file(const fs::path& filepath, const std::vector& expected_lines) { std::ifstream logfile(filepath); ASSERT_TRUE(logfile.good()) << "which 
implies a failure to open " << filepath; std::string line; size_t line_number = 1; for(; std::getline(logfile, line); ++line_number) { ASSERT_LE(line_number, expected_lines.size()) << "extra line containing '" << line << "' in " << filepath; const std::string& expected_pattern = expected_lines[line_number - 1]; EXPECT_TRUE(Matcher(MatchesRegex(expected_pattern)).Matches(line)) << "Mismatch at line " << line_number << ":\n Actual line: " << line << "\nExpected pattern: " << expected_pattern; } ASSERT_EQ(line_number - 1, expected_lines.size()) << "missing lines in " << filepath; } TEST_F(checkin_misc_LOGGING, rocsolver_log_path) { rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_PATH", log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_trace), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_max_levels(1), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); ASSERT_TRUE(fs::exists(log_filepath)); } TEST_F(checkin_misc_LOGGING, rocblas_layer_mode_log_trace) { rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_TRACE_PATH", log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_trace), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_max_levels(1), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); std::vector expected_lines = { "ROCSOLVER LOG FILE", "rocSOLVER Version: .*", "rocBLAS Version: .*", ".*ENTER rocsolver_dgetrf_strided_batched trace tree.*", ".*getrf.*m: 25, n: 25, shiftA: 0, lda: 25, shiftP: 0, bc: 3.*", 
".*EXIT rocsolver_dgetrf_strided_batched trace tree.*", "\\s*", }; verify_file(log_filepath, expected_lines); } TEST_F(checkin_misc_LOGGING, rocblas_layer_mode_log_bench) { rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_BENCH_PATH", log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_bench), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_max_levels(1), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); std::vector expected_lines = { "ROCSOLVER LOG FILE", "rocSOLVER Version: .*", "rocBLAS Version: .*", ".*rocsolver-bench -f getrf_strided_batched -r d -m 25 -n 25 --lda 25 --strideA 625 " "--strideP 25 --batch_count 3", }; verify_file(log_filepath, expected_lines); } TEST_F(checkin_misc_LOGGING, rocblas_layer_mode_log_profile) { rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_PROFILE_PATH", log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_max_levels(1), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); std::vector expected_lines = { "ROCSOLVER LOG FILE", "rocSOLVER Version: .*", "rocBLAS Version: .*", ".*PROFILE.*", ".*getrf.*Calls: 1, Total Time: .+ .+ .in nested functions: .+ .+.", "\\s*", }; verify_file(log_filepath, expected_lines); } TEST_F(checkin_misc_LOGGING, rocsolver_log_write_profile) { rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_PROFILE_PATH", log_filepath.generic_string().c_str()); 
ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_max_levels(1), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); EXPECT_EQ(rocsolver_log_write_profile(), rocblas_status_success); EXPECT_EQ(rocsolver_log_write_profile(), rocblas_status_success); std::vector expected_lines = { "ROCSOLVER LOG FILE", "rocSOLVER Version: .*", "rocBLAS Version: .*", ".*PROFILE.*", ".*getrf.*Calls: 1, Total Time: .+ .+ .in nested functions: .+ .+.", "\\s*", ".*PROFILE.*", ".*getrf.*Calls: 1, Total Time: .+ .+ .in nested functions: .+ .+.", "\\s*", }; verify_file(log_filepath, expected_lines); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); // reset global state for other tests } TEST_F(checkin_misc_LOGGING, rocsolver_log_flush_profile) { rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_PROFILE_PATH", log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_max_levels(1), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); EXPECT_EQ(rocsolver_log_flush_profile(), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); EXPECT_EQ(rocsolver_log_flush_profile(), rocblas_status_success); EXPECT_EQ(rocsolver_log_flush_profile(), rocblas_status_success); std::vector expected_lines = { "ROCSOLVER LOG FILE", "rocSOLVER Version: .*", "rocBLAS Version: .*", ".*PROFILE.*", ".*getrf.*Calls: 1, 
Total Time: .+ .+ .in nested functions: .+ .+.", "\\s*", ".*PROFILE.*", ".*getrf.*Calls: 2, Total Time: .+ .+ .in nested functions: .+ .+.", "\\s*", }; verify_file(log_filepath, expected_lines); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); // reset global state for other tests } TEST_F(checkin_misc_LOGGING, rocsolver_log_restore_defaults_resets_layer_mode) { rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_TRACE_PATH", log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_trace), rocblas_status_success); EXPECT_EQ(rocsolver_log_restore_defaults(), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); std::vector expected_lines = { "ROCSOLVER LOG FILE", "rocSOLVER Version: .*", "rocBLAS Version: .*", }; verify_file(log_filepath, expected_lines); } TEST_F(checkin_misc_LOGGING, rocsolver_log_restore_defaults_resets_max_levels) { rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_TRACE_PATH", log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_max_levels(2), rocblas_status_success); EXPECT_EQ(rocsolver_log_restore_defaults(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_trace), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); std::vector expected_lines = { "ROCSOLVER LOG FILE", "rocSOLVER Version: .*", "rocBLAS Version: .*", ".*ENTER rocsolver_dgetrf_strided_batched trace tree.*", ".*getrf.*m: 25, n: 25, shiftA: 0, lda: 25, shiftP: 0, bc: 3.*", ".*EXIT rocsolver_dgetrf_strided_batched trace tree.*", 
"\\s*", }; verify_file(log_filepath, expected_lines); } TEST_F(checkin_misc_LOGGING, rocblas_layer_mode_log_trace_tree) { rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_TRACE_PATH", log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_trace), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_max_levels(2), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); std::vector expected_lines = { "ROCSOLVER LOG FILE", "rocSOLVER Version: .*", "rocBLAS Version: .*", ".*ENTER rocsolver_dgetrf_strided_batched trace tree.*", ".*getrf.*m: 25, n: 25, shiftA: 0, lda: 25, shiftP: 0, bc: 3.*", " .*getf2.*m: 25, n: 25, shiftA: 0, lda: 25, shiftP: 0, bc: 3.*", ".*EXIT rocsolver_dgetrf_strided_batched trace tree.*", "\\s*", }; verify_file(log_filepath, expected_lines); } TEST_F(checkin_misc_LOGGING, rocblas_layer_mode_log_profile_tree) { rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_PROFILE_PATH", log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_max_levels(2), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); std::vector expected_lines = { "ROCSOLVER LOG FILE", "rocSOLVER Version: .*", "rocBLAS Version: .*", ".*PROFILE.*", ".*getrf.*Calls: 1, Total Time: .+ .+ .in nested functions: .+ .+.", ".*getf2.*Calls: 1, Total Time: .+ .+", "\\s*", }; verify_file(log_filepath, expected_lines); } TEST_F(checkin_misc_LOGGING, rocblas_layer_mode_log_bench_tree) { 
rocblas_local_handle handle; scoped_envvar logpath_variable("ROCSOLVER_LOG_BENCH_PATH", log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_layer_mode(rocblas_layer_mode_log_bench), rocblas_status_success); EXPECT_EQ(rocsolver_log_set_max_levels(2), rocblas_status_success); EXPECT_EQ(rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc), rocblas_status_success); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); std::vector expected_lines = { "ROCSOLVER LOG FILE", "rocSOLVER Version: .*", "rocBLAS Version: .*", ".*rocsolver-bench -f getrf_strided_batched -r d -m 25 -n 25 --lda 25 --strideA 625 " "--strideP 25 --batch_count 3", }; verify_file(log_filepath, expected_lines); } TEST_F(checkin_misc_LOGGING, invalid_trace_file_open) { scoped_envvar logpath_variable("ROCSOLVER_LOG_TRACE_PATH", invalid_log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_internal_error); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); } TEST_F(checkin_misc_LOGGING, invalid_profile_file_open) { scoped_envvar logpath_variable("ROCSOLVER_LOG_PROFILE_PATH", invalid_log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_internal_error); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); } TEST_F(checkin_misc_LOGGING, invalid_bench_file_open) { scoped_envvar logpath_variable("ROCSOLVER_LOG_BENCH_PATH", invalid_log_filepath.generic_string().c_str()); ASSERT_EQ(rocsolver_log_begin(), rocblas_status_internal_error); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); } TEST_F(checkin_misc_LOGGING, begin_twice) { ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); EXPECT_EQ(rocsolver_log_begin(), rocblas_status_internal_error); ASSERT_EQ(rocsolver_log_end(), rocblas_status_success); } TEST_F(checkin_misc_LOGGING, end_twice) { ASSERT_EQ(rocsolver_log_begin(), rocblas_status_success); 
EXPECT_EQ(rocsolver_log_end(), rocblas_status_success); ASSERT_EQ(rocsolver_log_end(), rocblas_status_internal_error); } TEST_F(checkin_misc_LOGGING, end_before_begin) { ASSERT_EQ(rocsolver_log_end(), rocblas_status_internal_error); } rocSOLVER-rocm-5.5.1/clients/gtest/managed_malloc_gtest.cpp000066400000000000000000000046451436600607200236370ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_managed_malloc.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> managed_malloc_tuple; // each matrix_size_range is a {m, lda, ldx} // each n_size_range is a {n, ldy, nb} // for checkin_lapack tests const vector> matrix_size_range = { // normal (valid) samples {50, 50, 50}, {70, 100, 70}}; const vector> n_size_range = { // normal (valid) samples {16, 16, 10}, {20, 30, 10}}; // for daily_lapack tests const vector> large_matrix_size_range = { {130, 130, 150}, }; const vector> large_n_size_range = {{64, 64, 60}}; Arguments managed_malloc_setup_arguments(managed_malloc_tuple tup) { vector matrix_size = std::get<0>(tup); vector n_size = std::get<1>(tup); Arguments arg; arg.set("m", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("ldx", matrix_size[2]); arg.set("n", n_size[0]); arg.set("ldy", n_size[1]); arg.set("k", n_size[2]); arg.timing = 0; return arg; } class MANAGED_MALLOC : public ::TestWithParam { protected: MANAGED_MALLOC() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = managed_malloc_setup_arguments(GetParam()); testing_managed_malloc(arg); } }; // non-batch tests TEST_P(MANAGED_MALLOC, __float) { run_tests(); } TEST_P(MANAGED_MALLOC, __double) { run_tests(); } TEST_P(MANAGED_MALLOC, 
__float_complex) { run_tests(); } TEST_P(MANAGED_MALLOC, __double_complex) { run_tests(); } // INSTANTIATE_TEST_SUITE_P(daily_lapack, // MANAGED_MALLOC, // Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(known_bug, MANAGED_MALLOC, Combine(ValuesIn(matrix_size_range), ValuesIn(n_size_range))); rocSOLVER-rocm-5.5.1/clients/gtest/memory_model_gtest.cpp000066400000000000000000000304431436600607200233770ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #include #include #include #include "client_environment_helpers.hpp" class checkin_misc_MEMORY_MODEL : public ::testing::Test { protected: void SetUp() override { if(char* envvar = getenv("ROCBLAS_DEVICE_MEMORY_SIZE")) GTEST_SKIP() << "Cannot execute in dirty environment; ROCBLAS_DEVICE_MEMORY_SIZE=" << envvar; ASSERT_EQ(hipMalloc(&dA, sizeof(double) * stA * bc), hipSuccess); ASSERT_EQ(hipMalloc(&dP, sizeof(rocblas_int) * stP * bc), hipSuccess); ASSERT_EQ(hipMalloc(&dinfo, sizeof(rocblas_int) * bc), hipSuccess); } void TearDown() override { if(getenv("ROCBLAS_DEVICE_MEMORY_SIZE")) unset_environment_variable("ROCBLAS_DEVICE_MEMORY_SIZE"); ASSERT_EQ(hipFree(dA), hipSuccess); ASSERT_EQ(hipFree(dP), hipSuccess); ASSERT_EQ(hipFree(dinfo), hipSuccess); } double* dA; rocblas_int *dP, *dinfo; const rocblas_int m = 1500; const rocblas_int n = 1500; const rocblas_int m_small = 65; const rocblas_int n_small = 65; const rocblas_int lda = m; const rocblas_stride stA = lda * n; const rocblas_stride stP = n; const rocblas_int bc = 8; const rocblas_int bc_small = 5; }; /*************************************/ /***** rocblas_managed (default) *****/ /*************************************/ TEST_F(checkin_misc_MEMORY_MODEL, rocblas_managed) { size_t size, size1; rocblas_status status; 
rocblas_handle handle; // 1. create handle ASSERT_EQ(rocblas_create_handle(&handle), rocblas_status_success); // 2. by default, memory is rocblas managed EXPECT_TRUE(rocblas_is_managing_device_memory(handle)); // 3. by default, 32MB should be reserved rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 32 * 1024 * 1024); // 4. start query rocblas_start_device_memory_size_query(handle); EXPECT_TRUE(rocblas_is_device_memory_size_query(handle)); // 5. getrf baseline will require ~54MB status = rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc); EXPECT_EQ(status, rocblas_status_size_increased); // 6. stop query rocblas_stop_device_memory_size_query(handle, &size1); EXPECT_GT(size1, 32 * 1024 * 1024); // 7. device memory size should not change yet; it should be 32MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 32 * 1024 * 1024); // 8. When executing getrf, rocblas should increase memory automatically // allowing execution to success status = rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc); EXPECT_EQ(status, rocblas_status_success); // 9. device memory size should have changed after execution of getrf to 54MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, size1); // 10. start query rocblas_start_device_memory_size_query(handle); EXPECT_TRUE(rocblas_is_device_memory_size_query(handle)); // 11. getrf small will require ~.5MB status = rocsolver_dgetrf_strided_batched(handle, m_small, n_small, dA, lda, stA, dP, stP, dinfo, bc_small); EXPECT_EQ(status, rocblas_status_size_increased); // 12. stop query rocblas_stop_device_memory_size_query(handle, &size); EXPECT_LT(size, size1); // 13. device memory size should not change; it should be 54MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, size1); // 14. 
When executing getrf, device memory is enough for execution to success status = rocsolver_dgetrf_strided_batched(handle, m_small, n_small, dA, lda, stA, dP, stP, dinfo, bc_small); EXPECT_EQ(status, rocblas_status_success); // 15. device memory size should be the same 54MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, size1); // 16. destroy handle EXPECT_EQ(rocblas_destroy_handle(handle), rocblas_status_success); } TEST_F(checkin_misc_MEMORY_MODEL, user_managed) { size_t size; rocblas_status status; rocblas_handle handle; /*************************************/ /******** user fixed size ***********/ /*************************************/ // set environment variable to 2MB ASSERT_TRUE(set_environment_variable("ROCBLAS_DEVICE_MEMORY_SIZE", "2000000")); // 1. create handle ASSERT_EQ(rocblas_create_handle(&handle), rocblas_status_success); // 2. memory size is now fixed by user via the environment variable EXPECT_FALSE(rocblas_is_managing_device_memory(handle)); // 3. 2MB should be reserved rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 4. start query rocblas_start_device_memory_size_query(handle); EXPECT_TRUE(rocblas_is_device_memory_size_query(handle)); // 5. getrf small will require .5MB status = rocsolver_dgetrf_strided_batched(handle, m_small, n_small, dA, lda, stA, dP, stP, dinfo, bc_small); EXPECT_EQ(status, rocblas_status_size_increased); // 6. stop query; required size at the end of query (.5MB) rocblas_stop_device_memory_size_query(handle, &size); EXPECT_LT(size, 2000000); // 7. device memory size should not change; it should be 2MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 8. When executing getrf. Enough memory allowing execution to success status = rocsolver_dgetrf_strided_batched(handle, m_small, n_small, dA, lda, stA, dP, stP, dinfo, bc_small); EXPECT_EQ(status, rocblas_status_success); // 9. 
device memory size should stay the same (2MB) rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 10. start query rocblas_start_device_memory_size_query(handle); EXPECT_TRUE(rocblas_is_device_memory_size_query(handle)); // 11. getrf baseline will require 54MB status = rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc); EXPECT_EQ(status, rocblas_status_size_increased); // 12. stop query; required size at the end of query (54MB) rocblas_stop_device_memory_size_query(handle, &size); EXPECT_GT(size, 2000000); // 13. device memory size should not change; it should still be 2MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 14. When executing getrf, device memory is not enough for execution to success status = rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc); EXPECT_EQ(status, rocblas_status_memory_error); // 15. device memory size should be the same 2MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 16. set mem size to 0 get rocblas the control back rocblas_set_device_memory_size(handle, 0); EXPECT_TRUE(rocblas_is_managing_device_memory(handle)); /*************************************/ /******** user managed size **********/ /*************************************/ // 1. set mem size to 2MB rocblas_set_device_memory_size(handle, 2000000); rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 2. memory size should now be fixed by the user EXPECT_FALSE(rocblas_is_managing_device_memory(handle)); // 3. start query rocblas_start_device_memory_size_query(handle); EXPECT_TRUE(rocblas_is_device_memory_size_query(handle)); // 4. getrf baseline will require 54MB status = rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc); EXPECT_EQ(status, rocblas_status_size_increased); // 5. 
call getrf small, which will require less than 54MB and so size will remain unchanged status = rocsolver_dgetrf_strided_batched(handle, m_small, n_small, dA, lda, stA, dP, stP, dinfo, bc_small); EXPECT_EQ(status, rocblas_status_size_unchanged); // 6. stop query rocblas_stop_device_memory_size_query(handle, &size); EXPECT_GT(size, 2000000); // 7. device memory size should not change; it should be 2MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 8. When executing getrf baseline, device memory is not enough for success status = rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc); EXPECT_EQ(status, rocblas_status_memory_error); // 9. device memory size should be the same 2MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 10. set mem size to 100MB rocblas_set_device_memory_size(handle, 100000000); rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 100000000); // 11. When executing getrf, device memory should now be enough for execution to success status = rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc); EXPECT_EQ(status, rocblas_status_success); // 12. destroy handle EXPECT_EQ(rocblas_destroy_handle(handle), rocblas_status_success); } /*************************************/ /******** user owned workspace *******/ /*************************************/ TEST_F(checkin_misc_MEMORY_MODEL, user_owned) { size_t size; rocblas_status status; rocblas_handle handle; // 1. create handle ASSERT_EQ(rocblas_create_handle(&handle), rocblas_status_success); // 2. by default, memory is rocblas managed EXPECT_TRUE(rocblas_is_managing_device_memory(handle)); // 3. by default, 32MB should be reserved rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 32 * 1024 * 1024); // 4. pass user owned workspace (2MB) void* W; size_t sw = 2000000; hipMalloc(&W, sw); ASSERT_EQ(rocblas_set_workspace(handle, W, sw), rocblas_status_success); // 5. 
memory should now be user managed EXPECT_FALSE(rocblas_is_managing_device_memory(handle)); // 6. 2MB should be reserved rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 7. start query rocblas_start_device_memory_size_query(handle); EXPECT_TRUE(rocblas_is_device_memory_size_query(handle)); // 8. getrf baseline will require 54MB status = rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc); EXPECT_EQ(status, rocblas_status_size_increased); // 9. getrf small will require less than 54MB, so size should be unchanged status = rocsolver_dgetrf_strided_batched(handle, m_small, n_small, dA, lda, stA, dP, stP, dinfo, bc_small); EXPECT_EQ(status, rocblas_status_size_unchanged); // 10. stop query; required size at the end of query is 54MB rocblas_stop_device_memory_size_query(handle, &size); EXPECT_GT(size, 2000000); // 11. device memory size should not change; it should be 2MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 12. When executing getrf, device memory is not enough for success status = rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc); EXPECT_EQ(status, rocblas_status_memory_error); // 13. device memory size should be the same 2MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 2000000); // 14. pass larger user owned workspace hipFree(W); sw = 100000000; hipMalloc(&W, sw); ASSERT_EQ(rocblas_set_workspace(handle, W, sw), rocblas_status_success); // 15. 100MB should be reserved rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 100000000); // 16. When executing getrf, device memory is now enough for success status = rocsolver_dgetrf_strided_batched(handle, m, n, dA, lda, stA, dP, stP, dinfo, bc); EXPECT_EQ(status, rocblas_status_success); // 17. device memory size should be the same 100MB rocblas_get_device_memory_size(handle, &size); EXPECT_EQ(size, 100000000); // 18. 
destroy handle hipFree(W); EXPECT_EQ(rocblas_destroy_handle(handle), rocblas_status_success); } rocSOLVER-rocm-5.5.1/clients/gtest/orgbr_ungbr_gtest.cpp000066400000000000000000000067611436600607200232250ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_orgbr_ungbr.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> orgbr_tuple; // each size_range is a {M, N, K}; // each store_range vector is a {lda, st} // if lda = -1, then lda < limit (invalid size) // if lda = 0, then lda = limit // if lda = 1, then lda > limit // if st = 0, then storev = 'C' // if st = 1, then storev = 'R' // case when m = 0, n = 0 and storev = 'C' will also execute the bad arguments // test (null handle, null pointers and invalid values) const vector> store_range = { // always invalid {-1, 0}, {-1, 1}, // normal (valid) samples {0, 0}, {0, 1}, {1, 0}, {1, 1}}; // for checkin_lapack tests const vector> size_range = { // always quick return {0, 0, 0}, // quick return for storev = 'R' invalid for 'C' {0, 1, 0}, // quick return for storev = 'C' invalid for 'R' {1, 0, 0}, // always invalid {-1, 1, 1}, {1, -1, 1}, {1, 1, -1}, // invalid for storev = 'C' {10, 30, 5}, // invalid for storev = 'R' {30, 10, 5}, // always invalid {30, 10, 20}, {10, 30, 20}, // normal (valid) samples {30, 30, 0}, {20, 20, 20}, {50, 50, 50}, {100, 100, 50}}; // for daily_lapack tests const vector> large_size_range = {{150, 150, 100}, {270, 270, 270}, {400, 400, 400}, {800, 800, 300}, {1000, 1000, 1000}, {1500, 1500, 800}}; Arguments orgbr_setup_arguments(orgbr_tuple tup) { vector size = std::get<0>(tup); vector store = std::get<1>(tup); Arguments arg; arg.set("m", size[0]); arg.set("n", size[1]); arg.set("k", 
size[2]); arg.set("lda", size[0] + store[0] * 10); arg.set("storev", store[1] == 1 ? 'R' : 'C'); arg.timing = 0; return arg; } class ORGBR_UNGBR : public ::TestWithParam { protected: ORGBR_UNGBR() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = orgbr_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0 && arg.get("storev") == 'C') testing_orgbr_ungbr_bad_arg(); testing_orgbr_ungbr(arg); } }; class ORGBR : public ORGBR_UNGBR { }; class UNGBR : public ORGBR_UNGBR { }; // non-batch tests TEST_P(ORGBR, __float) { run_tests(); } TEST_P(ORGBR, __double) { run_tests(); } TEST_P(UNGBR, __float_complex) { run_tests(); } TEST_P(UNGBR, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, ORGBR, Combine(ValuesIn(large_size_range), ValuesIn(store_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORGBR, Combine(ValuesIn(size_range), ValuesIn(store_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNGBR, Combine(ValuesIn(large_size_range), ValuesIn(store_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNGBR, Combine(ValuesIn(size_range), ValuesIn(store_range))); rocSOLVER-rocm-5.5.1/clients/gtest/orglx_unglx_gtest.cpp000066400000000000000000000100701436600607200232510ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_orglx_unglx.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, int> orglq_tuple; // each m_size_range vector is a {M, lda, K} // case when m = 0 and n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> m_size_range = { // quick return {0, 1, 0}, // always invalid {-1, 1, 1}, {20, 5, 1}, {10, 10, 20}, // invalid for case * {30, 30, 25}, // normal (valid) samples {10, 10, 10}, {20, 50, 20}, }; const vector n_size_range = { // quick return 0, // always invalid -1, // invalid for case * 25, // normal (valid) samples 50, 70, 130}; // for daily_lapack tests const vector> large_m_size_range = {{164, 164, 130}, {198, 640, 198}, {130, 130, 130}, {220, 220, 140}, {400, 400, 200}}; const vector large_n_size_range = {400, 640, 1000, 2000}; Arguments orglq_setup_arguments(orglq_tuple tup) { vector m_size = std::get<0>(tup); int n_size = std::get<1>(tup); Arguments arg; arg.set("m", m_size[0]); arg.set("lda", m_size[1]); arg.set("k", m_size[2]); arg.set("n", n_size); arg.timing = 0; return arg; } template class ORGLX_UNGLX : public ::TestWithParam { protected: ORGLX_UNGLX() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = orglq_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_orglx_unglx_bad_arg(); testing_orglx_unglx(arg); } }; class ORGL2 : public ORGLX_UNGLX { }; class UNGL2 : public ORGLX_UNGLX { }; class ORGLQ : public ORGLX_UNGLX { }; class UNGLQ : public ORGLX_UNGLX { }; // non-batch tests TEST_P(ORGL2, __float) { run_tests(); } TEST_P(ORGL2, __double) { run_tests(); } TEST_P(UNGL2, __float_complex) { run_tests(); } TEST_P(UNGL2, __double_complex) { run_tests(); } TEST_P(ORGLQ, __float) { run_tests(); } 
TEST_P(ORGLQ, __double) { run_tests(); } TEST_P(UNGLQ, __float_complex) { run_tests(); } TEST_P(UNGLQ, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, ORGL2, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORGL2, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNGL2, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNGL2, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, ORGLQ, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORGLQ, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNGLQ, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNGLQ, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); rocSOLVER-rocm-5.5.1/clients/gtest/orgtr_ungtr_gtest.cpp000066400000000000000000000051201436600607200232550ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_orgtr_ungtr.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, printable_char> orgtr_tuple; // each size_range vector is a {n, lda} // case when n = 0 and uplo = 'U' will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector uplo_range = {'L', 'U'}; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1}, // invalid {-1, 1}, {20, 5}, // normal (valid) samples {32, 32}, {50, 50}, {70, 100}, {100, 150}}; // for daily_lapack tests const vector> large_size_range = {{192, 192}, {500, 600}, {640, 640}, {1000, 1024}}; Arguments orgtr_setup_arguments(orgtr_tuple tup) { vector size = std::get<0>(tup); char uplo = std::get<1>(tup); Arguments arg; arg.set("n", size[0]); arg.set("lda", size[1]); arg.set("uplo", uplo); arg.timing = 0; return arg; } class ORGTR_UNGTR : public ::TestWithParam { protected: ORGTR_UNGTR() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = orgtr_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("uplo") == 'U') testing_orgtr_ungtr_bad_arg(); testing_orgtr_ungtr(arg); } }; class ORGTR : public ORGTR_UNGTR { }; class UNGTR : public ORGTR_UNGTR { }; // non-batch tests TEST_P(ORGTR, __float) { run_tests(); } TEST_P(ORGTR, __double) { run_tests(); } TEST_P(UNGTR, __float_complex) { run_tests(); } TEST_P(UNGTR, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, ORGTR, Combine(ValuesIn(large_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORGTR, Combine(ValuesIn(size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNGTR, Combine(ValuesIn(large_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNGTR, Combine(ValuesIn(size_range), 
ValuesIn(uplo_range))); rocSOLVER-rocm-5.5.1/clients/gtest/orgxl_ungxl_gtest.cpp000066400000000000000000000102461436600607200232560ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_orgxl_ungxl.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> orgql_tuple; // each n_size_range vector is a {N, K} // each m_size_range vector is a {M, lda} // case when m = 0 and n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> n_size_range = { // quick return {0, 0}, // always invalid {-1, 1}, {1, -1}, {10, 20}, // invalid for case * {30, 25}, // normal (valid) samples {10, 10}, {20, 20}, }; const vector> m_size_range = { // quick return {0, 1}, // always invalid {-1, 1}, {20, 5}, // invalid for case * {25, 25}, // normal (valid) samples {50, 50}, {70, 100}, {130, 130}}; // for daily_lapack tests const vector> large_n_size_range = {{164, 130}, {198, 198}, {130, 130}, {220, 140}, {400, 200}}; const vector> large_m_size_range = {{400, 640}, {640, 640}, {1000, 1000}, {2000, 2000}}; Arguments orgql_setup_arguments(orgql_tuple tup) { vector m_size = std::get<0>(tup); vector n_size = std::get<1>(tup); Arguments arg; arg.set("m", m_size[0]); arg.set("lda", m_size[1]); arg.set("n", n_size[0]); arg.set("k", n_size[1]); arg.timing = 0; return arg; } template class ORGXL_UNGXL : public ::TestWithParam { protected: ORGXL_UNGXL() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = orgql_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_orgxl_ungxl_bad_arg(); testing_orgxl_ungxl(arg); } }; class ORG2L : public 
ORGXL_UNGXL { }; class UNG2L : public ORGXL_UNGXL { }; class ORGQL : public ORGXL_UNGXL { }; class UNGQL : public ORGXL_UNGXL { }; // non-batch tests TEST_P(ORG2L, __float) { run_tests(); } TEST_P(ORG2L, __double) { run_tests(); } TEST_P(UNG2L, __float_complex) { run_tests(); } TEST_P(UNG2L, __double_complex) { run_tests(); } TEST_P(ORGQL, __float) { run_tests(); } TEST_P(ORGQL, __double) { run_tests(); } TEST_P(UNGQL, __float_complex) { run_tests(); } TEST_P(UNGQL, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, ORG2L, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORG2L, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNG2L, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNG2L, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, ORGQL, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORGQL, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNGQL, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNGQL, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); rocSOLVER-rocm-5.5.1/clients/gtest/orgxr_ungxr_gtest.cpp000066400000000000000000000102431436600607200232670ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_orgxr_ungxr.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> orgqr_tuple; // each m_size_range vector is a {M, lda} // each n_size_range vector is a {N, K} // case when m = 0 and n = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> m_size_range = { // quick return {0, 1}, // always invalid {-1, 1}, {20, 5}, // invalid for case * {50, 50}, // normal (valid) samples {70, 100}, {130, 130}}; const vector> n_size_range = { // quick return {0, 1}, // always invalid {-1, 1}, {1, -1}, {10, 20}, // invalid for case * {55, 55}, // normal (valid) samples {10, 0}, {20, 20}, {35, 25}}; // for daily_lapack tests const vector> large_m_size_range = {{400, 410}, {640, 640}, {1000, 1024}, {2000, 2000}}; const vector> large_n_size_range = {{164, 162}, {198, 140}, {130, 130}, {220, 220}, {400, 200}}; Arguments orgqr_setup_arguments(orgqr_tuple tup) { vector m_size = std::get<0>(tup); vector n_size = std::get<1>(tup); Arguments arg; arg.set("m", m_size[0]); arg.set("lda", m_size[1]); arg.set("n", n_size[0]); arg.set("k", n_size[1]); arg.timing = 0; return arg; } template class ORGXR_UNGXR : public ::TestWithParam { protected: ORGXR_UNGXR() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = orgqr_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 0) testing_orgxr_ungxr_bad_arg(); testing_orgxr_ungxr(arg); } }; class ORG2R : public ORGXR_UNGXR { }; class UNG2R : public ORGXR_UNGXR { }; class ORGQR : public ORGXR_UNGXR { }; class UNGQR : public ORGXR_UNGXR { }; // non-batch tests TEST_P(ORG2R, __float) { run_tests(); } TEST_P(ORG2R, __double) { run_tests(); } TEST_P(UNG2R, __float_complex) { run_tests(); } TEST_P(UNG2R, 
__double_complex) { run_tests(); } TEST_P(ORGQR, __float) { run_tests(); } TEST_P(ORGQR, __double) { run_tests(); } TEST_P(UNGQR, __float_complex) { run_tests(); } TEST_P(UNGQR, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, ORG2R, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORG2R, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNG2R, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNG2R, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, ORGQR, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORGQR, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNGQR, Combine(ValuesIn(large_m_size_range), ValuesIn(large_n_size_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNGQR, Combine(ValuesIn(m_size_range), ValuesIn(n_size_range))); rocSOLVER-rocm-5.5.1/clients/gtest/ormbr_unmbr_gtest.cpp000066400000000000000000000103421436600607200232270ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_ormbr_unmbr.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> ormbr_tuple; // each size_range vector is a {M, N, K} // each store_range vector is a {lda, ldc, s, t, st} // if lda = -1, then lda < limit (invalid size) // if lda = 0, then lda = limit // if lda = 1, then lda > limit // if ldc = -1, then ldc < limit (invalid size) // if ldc = 0, then ldc = limit // if ldc = 1, then ldc > limit // if s = 0, then side = 'L' // if s = 1, then side = 'R' // if t = 0, then trans = 'N' // if t = 1, then trans = 'T' // if t = 2, then trans = 'C' // if st = 0, then storev = 'C' // if st = 1, then storev = 'R' // case when m = 0, n = 1, side = 'L', trans = 'T' and storev = 'C' // will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> store_range = { // invalid {-1, 0, 0, 0, 0}, {0, -1, 0, 0, 0}, // normal (valid) samples {1, 1, 0, 0, 0}, {1, 1, 0, 0, 1}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 1}, {0, 0, 0, 1, 0}, {0, 0, 0, 1, 1}, {0, 0, 0, 2, 0}, {0, 0, 0, 2, 1}, {0, 0, 1, 0, 0}, {0, 0, 1, 0, 1}, {0, 0, 1, 1, 0}, {0, 0, 1, 1, 1}, {0, 0, 1, 2, 0}, {0, 0, 1, 2, 1}, }; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1, 1}, {1, 0, 1}, {1, 1, 0}, // invalid {-1, 1, 1}, {1, -1, 1}, {1, 1, -1}, // normal (valid) samples {10, 30, 5}, {20, 5, 10}, {20, 20, 25}, {50, 50, 30}, {70, 40, 40}, }; // for daily_lapack tests const vector> large_size_range = { {200, 150, 100}, {270, 270, 270}, {400, 400, 405}, {800, 500, 300}, {1500, 1000, 300}, }; Arguments ormbr_setup_arguments(ormbr_tuple tup) { vector size = std::get<0>(tup); vector store = std::get<1>(tup); Arguments arg; rocblas_int m = size[0]; rocblas_int n = size[1]; rocblas_int k = size[2]; arg.set("m", m); arg.set("n", n); arg.set("k", k); rocblas_int nq = 
store[2] == 0 ? m : n; if(store[4] == 0) arg.set("lda", nq + store[0] * 10); else arg.set("lda", min(nq, k) + store[0] * 10); arg.set("ldc", m + store[1] * 10); arg.set("side", store[2] == 0 ? 'L' : 'R'); arg.set("trans", (store[3] == 0 ? 'N' : (store[3] == 1 ? 'T' : 'C'))); arg.set("storev", store[4] == 0 ? 'C' : 'R'); arg.timing = 0; return arg; } class ORMBR_UNMBR : public ::TestWithParam { protected: ORMBR_UNMBR() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = ormbr_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 1 && arg.peek("side") == 'L' && arg.peek("trans") == 'T' && arg.peek("storev") == 'C') testing_ormbr_unmbr_bad_arg(); testing_ormbr_unmbr(arg); } }; class ORMBR : public ORMBR_UNMBR { }; class UNMBR : public ORMBR_UNMBR { }; // non-batch tests TEST_P(ORMBR, __float) { run_tests(); } TEST_P(ORMBR, __double) { run_tests(); } TEST_P(UNMBR, __float_complex) { run_tests(); } TEST_P(UNMBR, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, ORMBR, Combine(ValuesIn(large_size_range), ValuesIn(store_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORMBR, Combine(ValuesIn(size_range), ValuesIn(store_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNMBR, Combine(ValuesIn(large_size_range), ValuesIn(store_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNMBR, Combine(ValuesIn(size_range), ValuesIn(store_range))); rocSOLVER-rocm-5.5.1/clients/gtest/ormlx_unmlx_gtest.cpp000066400000000000000000000106431436600607200232730ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_ormlx_unmlx.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> ormlq_tuple; // each size_range vector is a {M, N, K}; // each op_range is a {lda, ldc, s, t} // if lda = -1, then lda < limit (invalid size) // if lda = 0, then lda = limit // if lda = 1, then lda > limit // if ldc = -1, then ldc < limit (invalid size) // if ldc = 0, then ldc = limit // if ldc = 1, then ldc > limit // if s = 0, then side = 'L' // if s = 1, then side = 'R' // if t = 0, then trans = 'N' // if t = 1, then trans = 'T' // if t = 2, then trans = 'C' // case when m = 0, side = 'L' and trans = 'T' will also execute the bad // arguments test (null handle, null pointers and invalid values) const vector> op_range = { // invalid {-1, 0, 0, 0}, {0, -1, 0, 0}, // normal (valid) samples {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 2}, {0, 0, 1, 0}, {0, 0, 1, 1}, {0, 0, 1, 2}, {1, 1, 0, 0}}; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1, 0}, {1, 0, 0}, {30, 30, 0}, // always invalid {-1, 1, 1}, {1, -1, 1}, {1, 1, -1}, // invalid for side = 'R' {20, 10, 20}, // invalid for side = 'L' {15, 25, 25}, // normal (valid) samples {40, 40, 40}, {45, 40, 30}, {50, 50, 20}}; // for daily_lapack tests const vector> large_size_range = {{100, 100, 100}, {150, 100, 80}, {300, 400, 300}, {1024, 1000, 950}, {1500, 1500, 1000}}; Arguments ormlq_setup_arguments(ormlq_tuple tup) { vector size = std::get<0>(tup); vector op = std::get<1>(tup); Arguments arg; rocblas_int m = size[0]; rocblas_int n = size[1]; rocblas_int k = size[2]; arg.set("m", m); arg.set("n", n); arg.set("k", k); arg.set("lda", k + op[0] * 10); arg.set("ldc", m + op[1] * 10); arg.set("side", op[2] == 0 ? 'L' : 'R'); arg.set("trans", (op[3] == 0 ? 'N' : (op[3] == 1 ? 
'T' : 'C'))); arg.timing = 0; return arg; } template class ORMLX_UNMLX : public ::TestWithParam { protected: ORMLX_UNMLX() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = ormlq_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("side") == 'L' && arg.peek("trans") == 'T') testing_ormlx_unmlx_bad_arg(); testing_ormlx_unmlx(arg); } }; class ORML2 : public ORMLX_UNMLX { }; class UNML2 : public ORMLX_UNMLX { }; class ORMLQ : public ORMLX_UNMLX { }; class UNMLQ : public ORMLX_UNMLX { }; // non-batch tests TEST_P(ORML2, __float) { run_tests(); } TEST_P(ORML2, __double) { run_tests(); } TEST_P(UNML2, __float_complex) { run_tests(); } TEST_P(UNML2, __double_complex) { run_tests(); } TEST_P(ORMLQ, __float) { run_tests(); } TEST_P(ORMLQ, __double) { run_tests(); } TEST_P(UNMLQ, __float_complex) { run_tests(); } TEST_P(UNMLQ, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, ORML2, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORML2, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNML2, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNML2, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, ORMLQ, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORMLQ, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNMLQ, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNMLQ, Combine(ValuesIn(size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/ormtr_unmtr_gtest.cpp000066400000000000000000000077201436600607200233010ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro 
Devices, Inc. * * ************************************************************************ */ #include "testing_ormtr_unmtr.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> ormtr_tuple; // each size_range vector is a {M, N} // each store_range vector is a {lda, ldc, s, t, u} // if lda = -1, then lda < limit (invalid size) // if lda = 0, then lda = limit // if lda = 1, then lda > limit // if ldc = -1, then ldc < limit (invalid size) // if ldc = 0, then ldc = limit // if ldc = 1, then ldc > limit // if s = 0, then side = 'L' // if s = 1, then side = 'R' // if t = 0, then trans = 'N' // if t = 1, then trans = 'T' // if t = 2, then trans = 'C' // if u = 0, then uplo = 'U' // if u = 1, then uplo = 'L' // case when m = 0, n = 1, side = 'L', trans = 'T' and uplo = 'U' // will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> store_range = { // invalid {-1, 0, 0, 0, 0}, {0, -1, 0, 0, 0}, // normal (valid) samples {1, 1, 0, 0, 0}, {1, 1, 0, 0, 1}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 1}, {0, 0, 0, 1, 0}, {0, 0, 0, 1, 1}, {0, 0, 0, 2, 0}, {0, 0, 0, 2, 1}, {0, 0, 1, 0, 0}, {0, 0, 1, 0, 1}, {0, 0, 1, 1, 0}, {0, 0, 1, 1, 1}, {0, 0, 1, 2, 0}, {0, 0, 1, 2, 1}, }; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1}, {1, 0}, // invalid {-1, 1}, {1, -1}, // normal (valid) samples {10, 30}, {20, 5}, {20, 20}, {50, 50}, {70, 40}, }; // for daily_lapack tests const vector> large_size_range = { {200, 150}, {270, 270}, {400, 400}, {800, 500}, {1500, 1000}, }; Arguments ormtr_setup_arguments(ormtr_tuple tup) { vector size = std::get<0>(tup); vector store = std::get<1>(tup); Arguments arg; rocblas_int m = size[0]; rocblas_int n = size[1]; arg.set("m", m); arg.set("n", n); int nq = store[2] == 0 ? m : n; arg.set("lda", nq + store[0] * 10); arg.set("ldc", m + store[1] * 10); arg.set("side", store[2] == 0 ? 
'L' : 'R'); arg.set("trans", (store[3] == 0 ? 'N' : (store[3] == 1 ? 'T' : 'C'))); arg.set("uplo", store[4] == 0 ? 'U' : 'L'); arg.timing = 0; return arg; } class ORMTR_UNMTR : public ::TestWithParam { protected: ORMTR_UNMTR() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = ormtr_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("n") == 1 && arg.peek("side") == 'L' && arg.peek("trans") == 'T' && arg.peek("uplo") == 'U') testing_ormtr_unmtr_bad_arg(); testing_ormtr_unmtr(arg); } }; class ORMTR : public ORMTR_UNMTR { }; class UNMTR : public ORMTR_UNMTR { }; // non-batch tests TEST_P(ORMTR, __float) { run_tests(); } TEST_P(ORMTR, __double) { run_tests(); } TEST_P(UNMTR, __float_complex) { run_tests(); } TEST_P(UNMTR, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, ORMTR, Combine(ValuesIn(large_size_range), ValuesIn(store_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORMTR, Combine(ValuesIn(size_range), ValuesIn(store_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNMTR, Combine(ValuesIn(large_size_range), ValuesIn(store_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNMTR, Combine(ValuesIn(size_range), ValuesIn(store_range))); rocSOLVER-rocm-5.5.1/clients/gtest/ormxl_unmxl_gtest.cpp000066400000000000000000000107701436600607200232740ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_ormxl_unmxl.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> ormql_tuple; // each size_range vector is a {M, N, K}; // each op_range is a {lda, ldc, s, t} // if lda = -1, then lda < limit (invalid size) // if lda = 0, then lda = limit // if lda = 1, then lda > limit // if ldc = -1, then ldc < limit (invalid size) // if ldc = 0, then ldc = limit // if ldc = 1, then ldc > limit // if s = 0, then side = 'L' // if s = 1, then side = 'R' // if t = 0, then trans = 'N' // if t = 1, then trans = 'T' // if t = 2, then trans = 'C' // case when m = 0, side = 'L' and trans = 'T' will also execute the bad // arguments test (null handle, null pointers and invalid values) const vector> op_range = { // invalid {-1, 0, 0, 0}, {0, -1, 0, 0}, // normal (valid) samples {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 2}, {0, 0, 1, 0}, {0, 0, 1, 1}, {0, 0, 1, 2}, {1, 1, 0, 0}}; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1, 0}, {1, 0, 0}, {30, 30, 0}, // always invalid {-1, 1, 1}, {1, -1, 1}, {1, 1, -1}, // invalid for side = 'R' {20, 10, 20}, // invalid for side = 'L' {15, 25, 25}, // normal (valid) samples {40, 40, 40}, {45, 40, 30}, {50, 50, 20}}; // for daily_lapack tests const vector> large_size_range = {{100, 100, 100}, {150, 100, 80}, {300, 400, 300}, {1024, 1000, 950}, {1500, 1500, 1000}}; Arguments ormql_setup_arguments(ormql_tuple tup) { vector size = std::get<0>(tup); vector op = std::get<1>(tup); Arguments arg; rocblas_int m = size[0]; rocblas_int n = size[1]; rocblas_int k = size[2]; arg.set("m", m); arg.set("n", n); arg.set("k", k); if(op[2] == 0) arg.set("lda", m + op[0] * 10); else arg.set("lda", n + op[0] * 10); arg.set("ldc", m + op[1] * 10); arg.set("side", op[2] == 0 ? 'L' : 'R'); arg.set("trans", (op[3] == 0 ? 'N' : (op[3] == 1 ? 
'T' : 'C'))); arg.timing = 0; return arg; } template class ORMXL_UNMXL : public ::TestWithParam { protected: ORMXL_UNMXL() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = ormql_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("side") == 'L' && arg.peek("trans") == 'T') testing_ormxl_unmxl_bad_arg(); testing_ormxl_unmxl(arg); } }; class ORM2L : public ORMXL_UNMXL { }; class UNM2L : public ORMXL_UNMXL { }; class ORMQL : public ORMXL_UNMXL { }; class UNMQL : public ORMXL_UNMXL { }; // non-batch tests TEST_P(ORM2L, __float) { run_tests(); } TEST_P(ORM2L, __double) { run_tests(); } TEST_P(UNM2L, __float_complex) { run_tests(); } TEST_P(UNM2L, __double_complex) { run_tests(); } TEST_P(ORMQL, __float) { run_tests(); } TEST_P(ORMQL, __double) { run_tests(); } TEST_P(UNMQL, __float_complex) { run_tests(); } TEST_P(UNMQL, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, ORM2L, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORM2L, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNM2L, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNM2L, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, ORMQL, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORMQL, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNMQL, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNMQL, Combine(ValuesIn(size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/ormxr_unmxr_gtest.cpp000066400000000000000000000107721436600607200233120ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro 
Devices, Inc. * * ************************************************************************ */ #include "testing_ormxr_unmxr.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> ormqr_tuple; // each size_range vector is a {M, N, K} // each op_range vector is a {lda, ldc, s, t} // if lda = -1, then lda < limit (invalid size) // if lda = 0, then lda = limit // if lda = 1, then lda > limit // if ldc = -1, then ldc < limit (invalid size) // if ldc = 0, then ldc = limit // if ldc = 1, then ldc > limit // if s = 0, then side = 'L' // if s = 1, then side = 'R' // if t = 0, then trans = 'N' // if t = 1, then trans = 'T' // if t = 2, then trans = 'C' // case when m = 0, side = L and trans = T will also execute the bad arguments // test (null handle, null pointers and invalid values) const vector> op_range = { // invalid {-1, 0, 0, 0}, {0, -1, 0, 0}, // normal (valid) samples {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 2}, {0, 0, 1, 0}, {0, 0, 1, 1}, {0, 0, 1, 2}, {1, 1, 0, 0}}; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1, 0}, {1, 0, 0}, {30, 30, 0}, // always invalid {-1, 1, 1}, {1, -1, 1}, {1, 1, -1}, // invalid for side = 'R' {20, 10, 20}, // invalid for side = 'L' {15, 25, 25}, // normal (valid) samples {40, 40, 40}, {45, 40, 30}, {50, 50, 20}}; // for daily_lapack tests const vector> large_size_range = {{100, 100, 100}, {150, 100, 80}, {300, 400, 300}, {1024, 1000, 950}, {1500, 1500, 1000}}; Arguments ormqr_setup_arguments(ormqr_tuple tup) { vector size = std::get<0>(tup); vector op = std::get<1>(tup); Arguments arg; rocblas_int m = size[0]; rocblas_int n = size[1]; rocblas_int k = size[2]; arg.set("m", m); arg.set("n", n); arg.set("k", k); if(op[2] == 0) arg.set("lda", m + op[0] * 10); else arg.set("lda", n + op[0] * 10); arg.set("ldc", m + op[1] * 10); arg.set("side", op[2] == 0 ? 'L' : 'R'); arg.set("trans", (op[3] == 0 ? 
'N' : (op[3] == 1 ? 'T' : 'C'))); arg.timing = 0; return arg; } template class ORMXR_UNMXR : public ::TestWithParam { protected: ORMXR_UNMXR() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = ormqr_setup_arguments(GetParam()); if(arg.peek("m") == 0 && arg.peek("side") == 'L' && arg.peek("trans") == 'T') testing_ormxr_unmxr_bad_arg(); testing_ormxr_unmxr(arg); } }; class ORM2R : public ORMXR_UNMXR { }; class UNM2R : public ORMXR_UNMXR { }; class ORMQR : public ORMXR_UNMXR { }; class UNMQR : public ORMXR_UNMXR { }; // non-batch tests TEST_P(ORM2R, __float) { run_tests(); } TEST_P(ORM2R, __double) { run_tests(); } TEST_P(UNM2R, __float_complex) { run_tests(); } TEST_P(UNM2R, __double_complex) { run_tests(); } TEST_P(ORMQR, __float) { run_tests(); } TEST_P(ORMQR, __double) { run_tests(); } TEST_P(UNMQR, __float_complex) { run_tests(); } TEST_P(UNMQR, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, ORM2R, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORM2R, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNM2R, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNM2R, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, ORMQR, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, ORMQR, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, UNMQR, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, UNMQR, Combine(ValuesIn(size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/posv_gtest.cpp000066400000000000000000000110651436600607200216750ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021 Advanced Micro 
Devices, Inc. * * ************************************************************************ */ #include "testing_posv.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> posv_tuple; // each A_range vector is a {N, lda, ldb, singular}; // if singular = 1, then the used matrix for the tests is not positive definite // each B_range vector is a {nrhs, uplo}; // if uplo = 0 then upper // if uplo = 1 then lower // case when N = nrhs = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_sizeA_range = { // quick return {0, 1, 1, 0}, // invalid {-1, 1, 1, 0}, {10, 2, 10, 0}, {10, 10, 2, 0}, /// normal (valid) samples {20, 20, 20, 0}, {30, 50, 30, 1}, {30, 30, 50, 0}, {50, 60, 60, 1}}; const vector> matrix_sizeB_range = { // quick return {0, 0}, // invalid {-1, 0}, // normal (valid) samples {10, 0}, {20, 1}, {30, 1}, }; // for daily_lapack tests const vector> large_matrix_sizeA_range = {{70, 70, 100, 0}, {192, 192, 192, 1}, {600, 700, 645, 0}, {1000, 1000, 1000, 1}, {1000, 2000, 2000, 0}}; const vector> large_matrix_sizeB_range = { {100, 0}, {150, 0}, {200, 1}, {524, 1}, {1000, 0}, }; Arguments posv_setup_arguments(posv_tuple tup) { vector matrix_sizeA = std::get<0>(tup); vector matrix_sizeB = std::get<1>(tup); Arguments arg; arg.set("n", matrix_sizeA[0]); arg.set("nrhs", matrix_sizeB[0]); arg.set("lda", matrix_sizeA[1]); arg.set("ldb", matrix_sizeA[2]); if(matrix_sizeB[1] == 0) arg.set("uplo", 'U'); else arg.set("uplo", 'L'); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_sizeA[3]; return arg; } class POSV : public ::TestWithParam { protected: POSV() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = posv_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("nrhs") == 0) 
testing_posv_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_posv(arg); arg.singular = 0; testing_posv(arg); } }; // non-batch tests TEST_P(POSV, __float) { run_tests(); } TEST_P(POSV, __double) { run_tests(); } TEST_P(POSV, __float_complex) { run_tests(); } TEST_P(POSV, __double_complex) { run_tests(); } // batched tests TEST_P(POSV, batched__float) { run_tests(); } TEST_P(POSV, batched__double) { run_tests(); } TEST_P(POSV, batched__float_complex) { run_tests(); } TEST_P(POSV, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(POSV, strided_batched__float) { run_tests(); } TEST_P(POSV, strided_batched__double) { run_tests(); } TEST_P(POSV, strided_batched__float_complex) { run_tests(); } TEST_P(POSV, strided_batched__double_complex) { run_tests(); } // daily_lapack tests normal execution with medium to large sizes INSTANTIATE_TEST_SUITE_P(daily_lapack, POSV, Combine(ValuesIn(large_matrix_sizeA_range), ValuesIn(large_matrix_sizeB_range))); // checkin_lapack tests normal execution with small sizes, invalid sizes, // quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, POSV, Combine(ValuesIn(matrix_sizeA_range), ValuesIn(matrix_sizeB_range))); rocSOLVER-rocm-5.5.1/clients/gtest/potf2_potrf_gtest.cpp000066400000000000000000000121261436600607200231510ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_potf2_potrf.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, printable_char> potrf_tuple; // each size_range vector is a {N, lda, singular} // if singular = 1, then the used matrix for the tests is not positive definite // each uplo_range is a {uplo} // case when n = 0 and uplo = L will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector uplo_range = {'L', 'U'}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 0}, // invalid {-1, 1, 0}, {10, 2, 0}, // normal (valid) samples {10, 10, 1}, {20, 30, 0}, {50, 50, 1}, {70, 80, 0}}; // for daily_lapack tests const vector> large_matrix_size_range = { {192, 192, 0}, {640, 960, 1}, {1000, 1000, 0}, {1024, 1024, 1}, {2000, 2000, 0}, }; Arguments potrf_setup_arguments(potrf_tuple tup) { vector matrix_size = std::get<0>(tup); char uplo = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("uplo", uplo); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_size[2]; return arg; } template class POTF2_POTRF : public ::TestWithParam { protected: POTF2_POTRF() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = potrf_setup_arguments(GetParam()); if(arg.peek("uplo") == 'L' && arg.peek("n") == 0) testing_potf2_potrf_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); if(arg.singular == 1) testing_potf2_potrf(arg); arg.singular = 0; testing_potf2_potrf(arg); } }; class POTF2 : public POTF2_POTRF { }; class POTRF : public POTF2_POTRF { }; // non-batch tests TEST_P(POTF2, __float) { run_tests(); } TEST_P(POTF2, __double) { run_tests(); } TEST_P(POTF2, __float_complex) { run_tests(); } TEST_P(POTF2, __double_complex) { run_tests(); } TEST_P(POTRF, __float) { run_tests(); } TEST_P(POTRF, __double) { run_tests(); } TEST_P(POTRF, __float_complex) { run_tests(); } TEST_P(POTRF, __double_complex) { run_tests(); } // batched tests TEST_P(POTF2, batched__float) { run_tests(); } TEST_P(POTF2, batched__double) { run_tests(); } TEST_P(POTF2, batched__float_complex) { run_tests(); } TEST_P(POTF2, batched__double_complex) { run_tests(); } TEST_P(POTRF, batched__float) { run_tests(); } TEST_P(POTRF, batched__double) { run_tests(); } TEST_P(POTRF, batched__float_complex) { run_tests(); } TEST_P(POTRF, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(POTF2, strided_batched__float) { run_tests(); } TEST_P(POTF2, strided_batched__double) { run_tests(); } TEST_P(POTF2, strided_batched__float_complex) { run_tests(); } TEST_P(POTF2, strided_batched__double_complex) { run_tests(); } TEST_P(POTRF, strided_batched__float) { run_tests(); } TEST_P(POTRF, strided_batched__double) { run_tests(); } TEST_P(POTRF, strided_batched__float_complex) { run_tests(); } TEST_P(POTRF, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, POTF2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, POTF2, Combine(ValuesIn(matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, POTRF, Combine(ValuesIn(large_matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, POTRF, Combine(ValuesIn(matrix_size_range), ValuesIn(uplo_range))); 
rocSOLVER-rocm-5.5.1/clients/gtest/potri_gtest.cpp000066400000000000000000000071301436600607200220410ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_potri.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, printable_char> potri_tuple; // each matrix_size_range vector is a {n, lda, singular} // if singular = 1, then the used matrix for the tests is singular // each uplo_range is a {uplo} // case when n = 0 and uplo = L will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector uplo_range = {'L', 'U'}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 0}, // invalid {-1, 1, 0}, {20, 5, 0}, // normal (valid) samples {32, 32, 0}, {50, 50, 1}, {70, 100, 0}, {100, 150, 1}}; // for daily_lapack tests const vector> large_matrix_size_range = {{192, 192, 1}, {500, 600, 1}, {640, 640, 0}, {1000, 1024, 0}, {1200, 1230, 0}}; Arguments potri_setup_arguments(potri_tuple tup) { vector matrix_size = std::get<0>(tup); char uplo = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("uplo", uplo); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_size[2]; return arg; } class POTRI : public ::TestWithParam { protected: POTRI() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = potri_setup_arguments(GetParam()); if(arg.peek("uplo") == 'L' && arg.peek("n") == 0) testing_potri_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); if(arg.singular == 1) testing_potri(arg); arg.singular = 0; testing_potri(arg); } }; // non-batch tests TEST_P(POTRI, __float) { run_tests(); } TEST_P(POTRI, __double) { run_tests(); } TEST_P(POTRI, __float_complex) { run_tests(); } TEST_P(POTRI, __double_complex) { run_tests(); } // batched tests TEST_P(POTRI, batched__float) { run_tests(); } TEST_P(POTRI, batched__double) { run_tests(); } TEST_P(POTRI, batched__float_complex) { run_tests(); } TEST_P(POTRI, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(POTRI, strided_batched__float) { run_tests(); } TEST_P(POTRI, strided_batched__double) { run_tests(); } TEST_P(POTRI, strided_batched__float_complex) { run_tests(); } TEST_P(POTRI, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, POTRI, Combine(ValuesIn(large_matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, POTRI, Combine(ValuesIn(matrix_size_range), ValuesIn(uplo_range))); rocSOLVER-rocm-5.5.1/clients/gtest/potrs_gtest.cpp000066400000000000000000000101471436600607200220550ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_potrs.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> potrs_tuple; // each A_range vector is a {N, lda, ldb}; // each B_range vector is a {nrhs, uplo}; // if uplo = 0 then upper // if uplo = 1 then lower // case when N = nrhs = 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_sizeA_range = { // quick return {0, 1, 1}, // invalid {-1, 1, 1}, {10, 2, 10}, {10, 10, 2}, /// normal (valid) samples {20, 20, 20}, {30, 50, 30}, {30, 30, 50}, {50, 60, 60}}; const vector> matrix_sizeB_range = { // quick return {0, 0}, // invalid {-1, 0}, // normal (valid) samples {10, 0}, {20, 1}, {30, 1}, }; // for daily_lapack tests const vector> large_matrix_sizeA_range = {{70, 70, 100}, {192, 192, 192}, {600, 700, 645}, {1000, 1000, 1000}, {1000, 2000, 2000}}; const vector> large_matrix_sizeB_range = { {100, 0}, {150, 0}, {200, 1}, {524, 1}, {1000, 0}, }; Arguments potrs_setup_arguments(potrs_tuple tup) { vector matrix_sizeA = std::get<0>(tup); vector matrix_sizeB = std::get<1>(tup); Arguments arg; arg.set("n", matrix_sizeA[0]); arg.set("nrhs", matrix_sizeB[0]); arg.set("lda", matrix_sizeA[1]); arg.set("ldb", matrix_sizeA[2]); if(matrix_sizeB[1] == 0) arg.set("uplo", 'U'); else arg.set("uplo", 'L'); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class POTRS : public ::TestWithParam { protected: POTRS() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = potrs_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("nrhs") == 0) testing_potrs_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_potrs(arg); } }; // non-batch tests TEST_P(POTRS, __float) { run_tests(); } TEST_P(POTRS, __double) { run_tests(); } TEST_P(POTRS, __float_complex) { run_tests(); } TEST_P(POTRS, __double_complex) { run_tests(); } // batched tests TEST_P(POTRS, batched__float) { run_tests(); } TEST_P(POTRS, batched__double) { run_tests(); } TEST_P(POTRS, batched__float_complex) { run_tests(); } TEST_P(POTRS, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(POTRS, strided_batched__float) { run_tests(); } TEST_P(POTRS, strided_batched__double) { run_tests(); } TEST_P(POTRS, strided_batched__float_complex) { run_tests(); } TEST_P(POTRS, strided_batched__double_complex) { run_tests(); } // daily_lapack tests normal execution with medium to large sizes INSTANTIATE_TEST_SUITE_P(daily_lapack, POTRS, Combine(ValuesIn(large_matrix_sizeA_range), ValuesIn(large_matrix_sizeB_range))); // checkin_lapack tests normal execution with small sizes, invalid sizes, // quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, POTRS, Combine(ValuesIn(matrix_sizeA_range), ValuesIn(matrix_sizeB_range))); rocSOLVER-rocm-5.5.1/clients/gtest/rocsolver_gtest_main.cpp000066400000000000000000000027161436600607200237330ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include #include #include #include #include #include #include "clientcommon.hpp" static std::string rocblas_version() { size_t size; rocblas_get_version_string_size(&size); std::string str(size - 1, '\0'); rocblas_get_version_string(str.data(), size); return str; } static std::string rocsolver_version() { size_t size; rocsolver_get_version_string_size(&size); std::string str(size - 1, '\0'); rocsolver_get_version_string(str.data(), size); return str; } static void print_version_info() { fmt::print("rocSOLVER version {} (with rocBLAS {})\n", rocsolver_version(), rocblas_version()); std::fflush(stdout); } int main(int argc, char** argv) { print_version_info(); // print device info int device_count = query_device_property(); if(device_count <= 0) { fmt::print(stderr, "Error: No devices found\n"); return -1; } set_device(0); // use first device // Initialize gtest and rocBLAS ::testing::InitGoogleTest(&argc, argv); rocblas_initialize(); int status = RUN_ALL_TESTS(); print_version_info(); // redundant, but convenient when tests fail return status; } rocSOLVER-rocm-5.5.1/clients/gtest/stebz_gtest.cpp000066400000000000000000000067401436600607200220410ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include "testing_stebz.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> stebz_tuple; // each size_range vector is a {n, ord} // if ord = 1, then order eigenvalues by blocks // if ord = 0, then order eigenvalues of the entire matrix // each ops_range vector is a {rng, vl, vu, il, iu, tol} // if rng = 0, then find all eigenvalues // if rng = 1, then find eigenavlues in (vl, vu] // if rng = 2, then find the il-th to the iu-th eigenvalue // Note: all tests are prepared with diagonally dominant matrices that have random diagonal // elements in [-20, -11] U [11, 20], and off-diagonal elements in [-0.4, 0.5]. // Thus, all the eigenvalues are guaranteed to be in [-20, 20] // case when n == 0, ord == 0, and rng == 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> size_range = { // quick return {0, 0}, // invalid {-1, 0}, // normal (valid) samples {1, 1}, {15, 0}, {20, 1}, {64, 0}}; const vector> ops_range = { // always invalid {1, 2, 1, 0, 0, 0}, {2, 0, 0, 0, -1, 0}, {2, 0, 0, 1, 80, 0}, // valid only when n=0 {2, 0, 0, 1, 0, 0}, // valid only when n>0 {2, 0, 0, 1, 5, 0}, {2, 0, 0, 1, 15, 0}, {2, 0, 0, 7, 12, 0}, // always valid samples {0, 0, 0, 0, 0, 0}, {1, -15, -5, 0, 0, -1}, {1, -15, 15, 0, 0, 1}, {1, -5, 5, 0, 0, 0}, {1, 5, 15, 0, 0, -1}, {1, 35, 55, 0, 0, 0}}; // for daily_lapack tests const vector> large_size_range = {{120, 1}, {256, 0}, {350, 1}, {512, 0}, {1024, 1}}; const vector> large_ops_range = {{0, 0, 0, 0, 0, -1}, {1, -15, 15, 0, 0, 0}, {1, -25, 0, 0, 0, 1}, {1, 0, 15, 0, 0, 0}, {2, 0, 0, 50, 75, -1}, {2, 0, 0, 1, 25, 0}}; Arguments stebz_setup_arguments(stebz_tuple tup) { Arguments arg; vector size = std::get<0>(tup); vector op = std::get<1>(tup); arg.set("n", size[0]); 
arg.set("eorder", (size[1] == 0 ? 'E' : 'B')); arg.set("erange", (op[0] == 0 ? 'A' : (op[0] == 1 ? 'V' : 'I'))); arg.set("vl", op[1]); arg.set("vu", op[2]); arg.set("il", op[3]); arg.set("iu", op[4]); arg.set("abstol", op[5]); arg.timing = 0; return arg; } class STEBZ : public ::TestWithParam { protected: STEBZ() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = stebz_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("eorder") == 'E' && arg.peek("erange") == 'A') testing_stebz_bad_arg(); testing_stebz(arg); } }; // non-batch tests TEST_P(STEBZ, __float) { run_tests(); } TEST_P(STEBZ, __double) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, STEBZ, Combine(ValuesIn(large_size_range), ValuesIn(large_ops_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, STEBZ, Combine(ValuesIn(size_range), ValuesIn(ops_range))); rocSOLVER-rocm-5.5.1/clients/gtest/stedc_gtest.cpp000066400000000000000000000045351436600607200220140ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_stedc.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, printable_char> stedc_tuple; // each size_range vector is a {N, ldc} // each op_range vector is a {e} // case when N == 0 and evect == N will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector op_range = {'N', 'I', 'V'}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1}, // invalid {-1, 1}, // invalid for case evect != N {2, 1}, // normal (valid) samples {12, 12}, {20, 30}, {35, 40}}; // for daily_lapack tests const vector> large_matrix_size_range = {{192, 192}, {250, 250}, {256, 270}, {300, 300}}; Arguments stedc_setup_arguments(stedc_tuple tup) { vector size = std::get<0>(tup); char op = std::get<1>(tup); Arguments arg; arg.set("n", size[0]); arg.set("ldc", size[1]); arg.set("evect", op); arg.timing = 0; return arg; } class STEDC : public ::TestWithParam { protected: STEDC() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = stedc_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("evect") == 'N') testing_stedc_bad_arg(); testing_stedc(arg); } }; // non-batch tests TEST_P(STEDC, __float) { run_tests(); } TEST_P(STEDC, __double) { run_tests(); } TEST_P(STEDC, __float_complex) { run_tests(); } TEST_P(STEDC, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, STEDC, Combine(ValuesIn(large_matrix_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, STEDC, Combine(ValuesIn(matrix_size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/stein_gtest.cpp000066400000000000000000000047071436600607200220350ustar00rootroot00000000000000/* ************************************************************************ * 
Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_stein.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, int> stein_tuple; // each size_range vector is a {N, ldz} // each vec_range is a {nev} // Indicates the number of vectors to compute // (vectors are always associated with the last nev eigenvalues) // case when N == 0 and nev == 5 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1}, // invalid {-1, 1}, {2, 1}, // normal (valid) samples {15, 15}, {20, 30}, {35, 40}}; const vector vec_range = {5, 10, 15}; // for daily_lapack tests const vector> large_matrix_size_range = {{192, 192}, {256, 270}, {300, 300}}; const vector large_vec_range = {25, 40, 65}; Arguments stein_setup_arguments(stein_tuple tup) { Arguments arg; vector size = std::get<0>(tup); rocblas_int nev = std::get<1>(tup); arg.set("n", size[0]); arg.set("ldz", size[1]); arg.set("nev", nev); arg.timing = 0; return arg; } class STEIN : public ::TestWithParam { protected: STEIN() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = stein_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("nev") == 5) testing_stein_bad_arg(); testing_stein(arg); } }; // non-batch tests TEST_P(STEIN, __float) { run_tests(); } TEST_P(STEIN, __double) { run_tests(); } TEST_P(STEIN, __float_complex) { run_tests(); } TEST_P(STEIN, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, STEIN, Combine(ValuesIn(large_matrix_size_range), ValuesIn(large_vec_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, STEIN, Combine(ValuesIn(matrix_size_range), ValuesIn(vec_range))); 
rocSOLVER-rocm-5.5.1/clients/gtest/steqr_gtest.cpp000066400000000000000000000045211436600607200220430ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_steqr.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, printable_char> steqr_tuple; // each size_range vector is a {N, ldc} // each op_range vector is a {e} // case when N == 0 and evect == N will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector op_range = {'N', 'I', 'V'}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1}, // invalid {-1, 1}, // invalid for case evect != N {2, 1}, // normal (valid) samples {12, 12}, {20, 30}, {35, 40}}; // for daily_lapack tests const vector> large_matrix_size_range = {{192, 192}, {256, 270}, {300, 300}}; Arguments steqr_setup_arguments(steqr_tuple tup) { vector size = std::get<0>(tup); char op = std::get<1>(tup); Arguments arg; arg.set("n", size[0]); arg.set("ldc", size[1]); arg.set("evect", op); arg.timing = 0; return arg; } class STEQR : public ::TestWithParam { protected: STEQR() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = steqr_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("evect") == 'N') testing_steqr_bad_arg(); testing_steqr(arg); } }; // non-batch tests TEST_P(STEQR, __float) { run_tests(); } TEST_P(STEQR, __double) { run_tests(); } TEST_P(STEQR, __float_complex) { run_tests(); } TEST_P(STEQR, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, STEQR, Combine(ValuesIn(large_matrix_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, STEQR, 
Combine(ValuesIn(matrix_size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/sterf_gtest.cpp000066400000000000000000000032051436600607200220260ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_sterf.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef vector sterf_tuple; // each size_range vector is a {N} // case when N == 0 will also execute the bad arguments test // (null handle, null pointers and invalid values) // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0}, // invalid {-1}, // normal (valid) samples {12}, {20}, {35}}; // for daily_lapack tests const vector> large_matrix_size_range = {{192}, {256}, {300}}; Arguments sterf_setup_arguments(sterf_tuple tup) { Arguments arg; arg.set("n", tup[0]); arg.timing = 0; return arg; } class STERF : public ::TestWithParam { protected: STERF() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = sterf_setup_arguments(GetParam()); if(arg.peek("n") == 0) testing_sterf_bad_arg(); testing_sterf(arg); } }; // non-batch tests TEST_P(STERF, __float) { run_tests(); } TEST_P(STERF, __double) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, STERF, ValuesIn(large_matrix_size_range)); INSTANTIATE_TEST_SUITE_P(checkin_lapack, STERF, ValuesIn(matrix_size_range)); rocSOLVER-rocm-5.5.1/clients/gtest/syev_heev_gtest.cpp000066400000000000000000000074531436600607200227110ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_syev_heev.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> syev_heev_tuple; // each size_range vector is a {n, lda} // each op_range vector is a {evect, uplo} // case when n == 0, evect == N, and uplo = L will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> op_range = {{'N', 'L'}, {'N', 'U'}, {'V', 'L'}, {'V', 'U'}}; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1}, // invalid {-1, 1}, {10, 5}, // normal (valid) samples {1, 1}, {12, 12}, {20, 30}, {35, 35}, {50, 60}}; // for daily_lapack tests const vector> large_size_range = {{192, 192}, {256, 270}, {300, 300}}; Arguments syev_heev_setup_arguments(syev_heev_tuple tup) { vector size = std::get<0>(tup); vector op = std::get<1>(tup); Arguments arg; arg.set("n", size[0]); arg.set("lda", size[1]); arg.set("evect", op[0]); arg.set("uplo", op[1]); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class SYEV_HEEV : public ::TestWithParam { protected: SYEV_HEEV() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = syev_heev_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("evect") == 'N' && arg.peek("uplo") == 'L') testing_syev_heev_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_syev_heev(arg); } }; class SYEV : public SYEV_HEEV { }; class HEEV : public SYEV_HEEV { }; // non-batch tests TEST_P(SYEV, __float) { run_tests(); } TEST_P(SYEV, __double) { run_tests(); } TEST_P(HEEV, __float_complex) { run_tests(); } TEST_P(HEEV, __double_complex) { run_tests(); } // batched tests TEST_P(SYEV, batched__float) { run_tests(); } TEST_P(SYEV, batched__double) { run_tests(); } TEST_P(HEEV, batched__float_complex) { run_tests(); } TEST_P(HEEV, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(SYEV, strided_batched__float) { run_tests(); } TEST_P(SYEV, strided_batched__double) { run_tests(); } TEST_P(HEEV, strided_batched__float_complex) { run_tests(); } TEST_P(HEEV, strided_batched__double_complex) { run_tests(); } // daily_lapack tests normal execution with medium to large sizes INSTANTIATE_TEST_SUITE_P(daily_lapack, SYEV, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEEV, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); // checkin_lapack tests normal execution with small sizes, invalid sizes, // quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYEV, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEEV, Combine(ValuesIn(size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/syevd_heevd_gtest.cpp000066400000000000000000000075251436600607200232210ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_syevd_heevd.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> syevd_heevd_tuple; // each size_range vector is a {n, lda} // each op_range vector is a {evect, uplo} // case when n == 0, evect == N, and uplo = L will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> op_range = {{'N', 'L'}, {'N', 'U'}, {'V', 'L'}, {'V', 'U'}}; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1}, // invalid {-1, 1}, {10, 5}, // normal (valid) samples {1, 1}, {12, 12}, {20, 30}, {36, 36}, {50, 60}}; // for daily_lapack tests const vector> large_size_range = {{192, 192}, {256, 270}, {300, 300}}; Arguments syevd_heevd_setup_arguments(syevd_heevd_tuple tup) { vector size = std::get<0>(tup); vector op = std::get<1>(tup); Arguments arg; arg.set("n", size[0]); arg.set("lda", size[1]); arg.set("evect", op[0]); arg.set("uplo", op[1]); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class SYEVD_HEEVD : public ::TestWithParam { protected: SYEVD_HEEVD() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = syevd_heevd_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("evect") == 'N' && arg.peek("uplo") == 'L') testing_syevd_heevd_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_syevd_heevd(arg); } }; class SYEVD : public SYEVD_HEEVD { }; class HEEVD : public SYEVD_HEEVD { }; // non-batch tests TEST_P(SYEVD, __float) { run_tests(); } TEST_P(SYEVD, __double) { run_tests(); } TEST_P(HEEVD, __float_complex) { run_tests(); } TEST_P(HEEVD, __double_complex) { run_tests(); } // batched tests TEST_P(SYEVD, batched__float) { run_tests(); } TEST_P(SYEVD, batched__double) { run_tests(); } TEST_P(HEEVD, batched__float_complex) { run_tests(); } TEST_P(HEEVD, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(SYEVD, strided_batched__float) { run_tests(); } TEST_P(SYEVD, strided_batched__double) { run_tests(); } TEST_P(HEEVD, strided_batched__float_complex) { run_tests(); } TEST_P(HEEVD, strided_batched__double_complex) { run_tests(); } // daily_lapack tests normal execution with medium to large sizes INSTANTIATE_TEST_SUITE_P(daily_lapack, SYEVD, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEEVD, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); // checkin_lapack tests normal execution with small sizes, invalid sizes, // quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYEVD, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEEVD, Combine(ValuesIn(size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/syevdx_heevdx_gtest.cpp000066400000000000000000000102761436600607200235760ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_syevdx_heevdx_inplace.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> syevdx_heevdx_tuple; // each size_range vector is a {n, lda, ldz, vl, vu, il, iu} // each op_range vector is a {evect, erange, uplo} // case when n == 0, evect == N, erange == V and uplo = L will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> op_range = {{'N', 'V', 'L'}, {'V', 'A', 'U'}, {'V', 'V', 'L'}, {'V', 'I', 'U'}}; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1, 1, 0, 10, 1, 0}, // invalid {-1, 1, 1, 0, 10, 1, 1}, {10, 5, 10, 0, 10, 1, 1}, {10, 10, 5, 0, 10, 1, 1}, // valid only when erange=A {10, 10, 10, 10, 0, 10, 1}, // normal (valid) samples {1, 1, 1, 0, 10, 1, 1}, {12, 12, 15, -20, 20, 10, 12}, {20, 30, 30, 5, 15, 1, 20}, {35, 35, 35, -10, 10, 1, 15}, {50, 60, 50, -15, -5, 20, 30}}; // for daily_lapack tests const vector> large_size_range = {{192, 192, 192, 5, 15, 100, 170}, {256, 270, 256, -10, 10, 1, 256}, {300, 300, 330, -15, -5, 200, 300}}; template Arguments syevdx_heevdx_setup_arguments(syevdx_heevdx_tuple tup, bool inplace) { using S = decltype(std::real(T{})); vector size = std::get<0>(tup); vector op = std::get<1>(tup); Arguments arg; arg.set("n", size[0]); arg.set("lda", size[1]); if(!inplace) arg.set("ldz", size[2]); arg.set("vl", size[3]); arg.set("vu", size[4]); arg.set("il", size[5]); arg.set("iu", size[6]); arg.set("evect", op[0]); arg.set("erange", op[1]); arg.set("uplo", op[2]); arg.set("abstol", 0); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class SYEVDX_HEEVDX_INPLACE : public ::TestWithParam { protected: SYEVDX_HEEVDX_INPLACE() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { using S = 
decltype(std::real(T{})); Arguments arg = syevdx_heevdx_setup_arguments(GetParam(), true); if(arg.peek("n") == 0 && arg.peek("evect") == 'N' && arg.peek("erange") == 'V' && arg.peek("uplo") == 'L') testing_syevdx_heevdx_inplace_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); testing_syevdx_heevdx_inplace(arg); } }; class SYEVDX_INPLACE : public SYEVDX_HEEVDX_INPLACE { }; class HEEVDX_INPLACE : public SYEVDX_HEEVDX_INPLACE { }; // non-batch tests TEST_P(SYEVDX_INPLACE, __float) { run_tests(); } TEST_P(SYEVDX_INPLACE, __double) { run_tests(); } TEST_P(HEEVDX_INPLACE, __float_complex) { run_tests(); } TEST_P(HEEVDX_INPLACE, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, SYEVDX_INPLACE, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYEVDX_INPLACE, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEEVDX_INPLACE, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEEVDX_INPLACE, Combine(ValuesIn(size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/syevj_heevj_gtest.cpp000066400000000000000000000100001436600607200232130ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_syevj_heevj.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> syevj_heevj_tuple; // each size_range vector is a {n, lda} // each op_range vector is a {evect, uplo} // case when n == 0, evect == N, and uplo = L will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> op_range = {{'N', 'L'}, {'N', 'U'}, {'V', 'L'}, {'V', 'U'}}; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1}, // invalid {-1, 1}, {10, 5}, // normal (valid) samples {1, 1}, {12, 12}, {20, 30}, {40, 45}, {60, 70}, {70, 70}, }; // for daily_lapack tests const vector> large_size_range = {{192, 192}, {256, 270}, {300, 300}}; Arguments syevj_heevj_setup_arguments(syevj_heevj_tuple tup) { vector size = std::get<0>(tup); vector op = std::get<1>(tup); Arguments arg; arg.set("n", size[0]); arg.set("lda", size[1]); arg.set("evect", op[0]); arg.set("uplo", op[1]); // only need to test the sorted case arg.set("esort", 'A'); arg.set("abstol", 0); arg.set("max_sweeps", 100); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class SYEVJ_HEEVJ : public ::TestWithParam { protected: SYEVJ_HEEVJ() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = syevj_heevj_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("evect") == 'N' && arg.peek("uplo") == 'L') testing_syevj_heevj_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_syevj_heevj(arg); } }; class SYEVJ : public SYEVJ_HEEVJ { }; class HEEVJ : public SYEVJ_HEEVJ { }; // non-batch tests TEST_P(SYEVJ, __float) { run_tests(); } TEST_P(SYEVJ, __double) { run_tests(); } TEST_P(HEEVJ, __float_complex) { run_tests(); } TEST_P(HEEVJ, __double_complex) { run_tests(); } // batched tests TEST_P(SYEVJ, batched__float) { run_tests(); } TEST_P(SYEVJ, batched__double) { run_tests(); } TEST_P(HEEVJ, batched__float_complex) { run_tests(); } TEST_P(HEEVJ, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(SYEVJ, strided_batched__float) { run_tests(); } TEST_P(SYEVJ, strided_batched__double) { run_tests(); } TEST_P(HEEVJ, strided_batched__float_complex) { run_tests(); } TEST_P(HEEVJ, strided_batched__double_complex) { run_tests(); } // daily_lapack tests normal execution with medium to large sizes INSTANTIATE_TEST_SUITE_P(daily_lapack, SYEVJ, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEEVJ, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); // checkin_lapack tests normal execution with small sizes, invalid sizes, // quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYEVJ, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEEVJ, Combine(ValuesIn(size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/syevx_heevx_gtest.cpp000066400000000000000000000110231436600607200232550ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_syevx_heevx.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> syevx_heevx_tuple; // each size_range vector is a {n, lda, ldz, vl, vu, il, iu} // each op_range vector is a {evect, erange, uplo} // case when n == 0, evect == N, erange == V and uplo = L will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> op_range = {{'N', 'V', 'L'}, {'V', 'A', 'U'}, {'V', 'V', 'L'}, {'V', 'I', 'U'}}; // for checkin_lapack tests const vector> size_range = { // quick return {0, 1, 1, 0, 10, 1, 0}, // invalid {-1, 1, 1, 0, 10, 1, 1}, {10, 5, 10, 0, 10, 1, 1}, {10, 10, 5, 0, 10, 1, 1}, // valid only when erange=A {10, 10, 10, 10, 0, 10, 1}, // normal (valid) samples {1, 1, 1, 0, 10, 1, 1}, {12, 12, 15, -20, 20, 10, 12}, {20, 30, 30, 5, 15, 1, 20}, {35, 35, 35, -10, 10, 1, 15}, {50, 60, 50, -15, -5, 20, 30}}; // for daily_lapack tests const vector> large_size_range = {{192, 192, 192, 5, 15, 100, 170}, {256, 270, 256, -10, 10, 1, 256}, {300, 300, 330, -15, -5, 200, 300}}; template Arguments syevx_heevx_setup_arguments(syevx_heevx_tuple tup) { using S = decltype(std::real(T{})); vector size = std::get<0>(tup); vector op = std::get<1>(tup); Arguments arg; arg.set("n", size[0]); arg.set("lda", size[1]); arg.set("ldz", size[2]); arg.set("vl", size[3]); arg.set("vu", size[4]); arg.set("il", size[5]); arg.set("iu", size[6]); arg.set("evect", op[0]); arg.set("erange", op[1]); arg.set("uplo", op[2]); arg.set("abstol", 0); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class SYEVX_HEEVX : public ::TestWithParam { protected: SYEVX_HEEVX() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { using S = decltype(std::real(T{})); Arguments arg = 
syevx_heevx_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("evect") == 'N' && arg.peek("erange") == 'V' && arg.peek("uplo") == 'L') testing_syevx_heevx_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); testing_syevx_heevx(arg); } }; class SYEVX : public SYEVX_HEEVX { }; class HEEVX : public SYEVX_HEEVX { }; // non-batch tests TEST_P(SYEVX, __float) { run_tests(); } TEST_P(SYEVX, __double) { run_tests(); } TEST_P(HEEVX, __float_complex) { run_tests(); } TEST_P(HEEVX, __double_complex) { run_tests(); } // batched tests TEST_P(SYEVX, batched__float) { run_tests(); } TEST_P(SYEVX, batched__double) { run_tests(); } TEST_P(HEEVX, batched__float_complex) { run_tests(); } TEST_P(HEEVX, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(SYEVX, strided_batched__float) { run_tests(); } TEST_P(SYEVX, strided_batched__double) { run_tests(); } TEST_P(HEEVX, strided_batched__float_complex) { run_tests(); } TEST_P(HEEVX, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, SYEVX, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYEVX, Combine(ValuesIn(size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEEVX, Combine(ValuesIn(large_size_range), ValuesIn(op_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEEVX, Combine(ValuesIn(size_range), ValuesIn(op_range))); rocSOLVER-rocm-5.5.1/clients/gtest/sygsx_hegsx_gtest.cpp000066400000000000000000000134471436600607200232670ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_sygsx_hegsx.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> sygst_tuple; // each matrix_size_range is a {n, lda, ldb} // each type_range is a {itype, uplo} // case when n = 0, itype = 1, and uplo = U will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> type_range = {{'1', 'U'}, {'1', 'L'}, {'2', 'U'}, {'2', 'L'}, {'3', 'U'}, {'3', 'L'}}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 1}, // invalid {-1, 1, 1}, {20, 5, 5}, // normal (valid) samples {50, 50, 50}, {70, 100, 110}, {130, 130, 130}, }; // for daily_lapack tests const vector> large_matrix_size_range = { {152, 152, 152}, {640, 640, 640}, {1000, 1024, 1024}, }; Arguments sygst_setup_arguments(sygst_tuple tup) { vector matrix_size = std::get<0>(tup); vector type = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("ldb", matrix_size[2]); arg.set("itype", type[0]); arg.set("uplo", type[1]); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } template class SYGSX_HEGSX : public ::TestWithParam { protected: SYGSX_HEGSX() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = sygst_setup_arguments(GetParam()); if(arg.peek("itype") == '1' && arg.peek("uplo") == 'U' && arg.peek("n") == 0) testing_sygsx_hegsx_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_sygsx_hegsx(arg); } }; class SYGS2 : public SYGSX_HEGSX { }; class HEGS2 : public SYGSX_HEGSX { }; class SYGST : public SYGSX_HEGSX { }; class HEGST : public SYGSX_HEGSX { }; // non-batch tests TEST_P(SYGS2, __float) { run_tests(); } TEST_P(SYGS2, __double) { run_tests(); } TEST_P(HEGS2, __float_complex) { run_tests(); } TEST_P(HEGS2, __double_complex) { run_tests(); } TEST_P(SYGST, __float) { run_tests(); } TEST_P(SYGST, __double) { run_tests(); } TEST_P(HEGST, __float_complex) { run_tests(); } TEST_P(HEGST, __double_complex) { run_tests(); } // batched tests TEST_P(SYGS2, batched__float) { run_tests(); } TEST_P(SYGS2, batched__double) { run_tests(); } TEST_P(HEGS2, batched__float_complex) { run_tests(); } TEST_P(HEGS2, batched__double_complex) { run_tests(); } TEST_P(SYGST, batched__float) { run_tests(); } TEST_P(SYGST, batched__double) { run_tests(); } TEST_P(HEGST, batched__float_complex) { run_tests(); } TEST_P(HEGST, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(SYGS2, strided_batched__float) { run_tests(); } TEST_P(SYGS2, strided_batched__double) { run_tests(); } TEST_P(HEGS2, strided_batched__float_complex) { run_tests(); } TEST_P(HEGS2, strided_batched__double_complex) { run_tests(); } TEST_P(SYGST, strided_batched__float) { run_tests(); } TEST_P(SYGST, strided_batched__double) { run_tests(); } TEST_P(HEGST, strided_batched__float_complex) { run_tests(); } TEST_P(HEGST, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, SYGS2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYGS2, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEGS2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEGS2, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, SYGST, 
Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYGST, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEGST, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEGST, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); rocSOLVER-rocm-5.5.1/clients/gtest/sygv_hegv_gtest.cpp000066400000000000000000000105501436600607200227050ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_sygv_hegv.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> sygv_tuple; // each matrix_size_range is a {n, lda, ldb, singular} // if singular = 1, then the used matrix for the tests is not positive definite // each type_range is a {itype, evect, uplo} // case when n = 0, itype = 1, evect = 'N', and uplo = U will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> type_range = {{'1', 'N', 'U'}, {'2', 'N', 'L'}, {'3', 'N', 'U'}, {'1', 'V', 'L'}, {'2', 'V', 'U'}, {'3', 'V', 'L'}}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 1, 0}, // invalid {-1, 1, 1, 0}, {20, 5, 5, 0}, // normal (valid) samples {20, 30, 20, 1}, {35, 35, 35, 0}, {50, 50, 60, 1}}; // for daily_lapack tests const vector> large_matrix_size_range = { {192, 192, 192, 0}, {256, 270, 256, 0}, {300, 300, 310, 0}, }; Arguments sygv_setup_arguments(sygv_tuple tup) { vector matrix_size = std::get<0>(tup); vector type = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("ldb", matrix_size[2]); arg.set("itype", 
type[0]); arg.set("evect", type[1]); arg.set("uplo", type[2]); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_size[3]; return arg; } class SYGV_HEGV : public ::TestWithParam { protected: SYGV_HEGV() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = sygv_setup_arguments(GetParam()); if(arg.peek("itype") == '1' && arg.peek("evect") == 'N' && arg.peek("uplo") == 'U' && arg.peek("n") == 0) testing_sygv_hegv_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_sygv_hegv(arg); arg.singular = 0; testing_sygv_hegv(arg); } }; class SYGV : public SYGV_HEGV { }; class HEGV : public SYGV_HEGV { }; // non-batch tests TEST_P(SYGV, __float) { run_tests(); } TEST_P(SYGV, __double) { run_tests(); } TEST_P(HEGV, __float_complex) { run_tests(); } TEST_P(HEGV, __double_complex) { run_tests(); } // batched tests TEST_P(SYGV, batched__float) { run_tests(); } TEST_P(SYGV, batched__double) { run_tests(); } TEST_P(HEGV, batched__float_complex) { run_tests(); } TEST_P(HEGV, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(SYGV, strided_batched__float) { run_tests(); } TEST_P(SYGV, strided_batched__double) { run_tests(); } TEST_P(HEGV, strided_batched__float_complex) { run_tests(); } TEST_P(HEGV, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, SYGV, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYGV, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEGV, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEGV, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); rocSOLVER-rocm-5.5.1/clients/gtest/sygvd_hegvd_gtest.cpp000066400000000000000000000106441436600607200232210ustar00rootroot00000000000000/* 
************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_sygvd_hegvd.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> sygvd_tuple; // each matrix_size_range is a {n, lda, ldb, singular} // if singular = 1, then the used matrix for the tests is not positive definite // each type_range is a {itype, evect, uplo} // case when n = 0, itype = 1, evect = 'N', and uplo = U will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> type_range = {{'1', 'N', 'U'}, {'2', 'N', 'L'}, {'3', 'N', 'U'}, {'1', 'V', 'L'}, {'2', 'V', 'U'}, {'3', 'V', 'L'}}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 1, 0}, // invalid {-1, 1, 1, 0}, {20, 5, 5, 0}, // normal (valid) samples {20, 30, 20, 1}, {35, 35, 35, 0}, {52, 52, 52, 1}, {50, 50, 60, 1}}; // for daily_lapack tests const vector> large_matrix_size_range = { {192, 192, 192, 0}, {256, 270, 256, 0}, {300, 300, 310, 0}, }; Arguments sygvd_setup_arguments(sygvd_tuple tup) { vector matrix_size = std::get<0>(tup); vector type = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("ldb", matrix_size[2]); arg.set("itype", type[0]); arg.set("evect", type[1]); arg.set("uplo", type[2]); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_size[3]; return arg; } class SYGVD_HEGVD : public ::TestWithParam { protected: SYGVD_HEGVD() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = sygvd_setup_arguments(GetParam()); if(arg.peek("itype") == '1' && arg.peek("evect") == 'N' && arg.peek("uplo") == 'U' && arg.peek("n") == 0) 
testing_sygvd_hegvd_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_sygvd_hegvd(arg); arg.singular = 0; testing_sygvd_hegvd(arg); } }; class SYGVD : public SYGVD_HEGVD { }; class HEGVD : public SYGVD_HEGVD { }; // non-batch tests TEST_P(SYGVD, __float) { run_tests(); } TEST_P(SYGVD, __double) { run_tests(); } TEST_P(HEGVD, __float_complex) { run_tests(); } TEST_P(HEGVD, __double_complex) { run_tests(); } // batched tests TEST_P(SYGVD, batched__float) { run_tests(); } TEST_P(SYGVD, batched__double) { run_tests(); } TEST_P(HEGVD, batched__float_complex) { run_tests(); } TEST_P(HEGVD, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(SYGVD, strided_batched__float) { run_tests(); } TEST_P(SYGVD, strided_batched__double) { run_tests(); } TEST_P(HEGVD, strided_batched__float_complex) { run_tests(); } TEST_P(HEGVD, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, SYGVD, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYGVD, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEGVD, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEGVD, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); rocSOLVER-rocm-5.5.1/clients/gtest/sygvdx_hegvdx_gtest.cpp000066400000000000000000000111741436600607200236000ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_sygvdx_hegvdx_inplace.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> sygvdx_tuple; // each matrix_size_range is a {n, lda, ldb, ldz, vl, vu, il, iu, singular} // if singular = 1, then the used matrix for the tests is not positive definite // each type_range is a {itype, evect, erange, uplo} // case when n = 0, itype = 1, evect = 'N', erange = 'A', and uplo = U will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> type_range = {{'1', 'N', 'A', 'U'}, {'2', 'N', 'V', 'L'}, {'3', 'N', 'I', 'U'}, {'1', 'V', 'V', 'L'}, {'2', 'V', 'I', 'U'}, {'3', 'V', 'A', 'L'}}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 1, 1, 0, 10, 1, 0, 0}, // invalid {-1, 1, 1, 1, 0, 10, 1, 1, 0}, {20, 5, 5, 20, 0, 10, 1, 1, 0}, {20, 20, 20, 5, 0, 10, 1, 1, 0}, // valid only when erange=A {20, 20, 20, 20, 10, 0, 10, 1, 0}, // normal (valid) samples {20, 30, 20, 20, 5, 15, 1, 10, 1}, {35, 35, 35, 35, -10, 10, 1, 35, 0}, {50, 50, 60, 70, -15, -5, 25, 50, 1}}; // for daily_lapack tests const vector> large_matrix_size_range = { {192, 192, 192, 192, 5, 15, 100, 150, 0}, {256, 270, 256, 260, -10, 10, 1, 100, 0}, {300, 300, 310, 320, -15, -5, 200, 300, 0}, }; template Arguments sygvdx_setup_arguments(sygvdx_tuple tup, bool inplace) { using S = decltype(std::real(T{})); vector matrix_size = std::get<0>(tup); vector type = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("ldb", matrix_size[2]); if(!inplace) arg.set("ldz", matrix_size[3]); arg.set("vl", matrix_size[4]); arg.set("vu", matrix_size[5]); arg.set("il", matrix_size[6]); arg.set("iu", matrix_size[7]); arg.set("itype", type[0]); arg.set("evect", type[1]); arg.set("erange", type[2]); 
arg.set("uplo", type[3]); arg.set("abstol", 0); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_size[8]; return arg; } class SYGVDX_HEGVDX_INPLACE : public ::TestWithParam { protected: SYGVDX_HEGVDX_INPLACE() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = sygvdx_setup_arguments(GetParam(), true); if(arg.peek("itype") == '1' && arg.peek("evect") == 'N' && arg.peek("erange") == 'A' && arg.peek("uplo") == 'U' && arg.peek("n") == 0) testing_sygvdx_hegvdx_inplace_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_sygvdx_hegvdx_inplace(arg); arg.singular = 0; testing_sygvdx_hegvdx_inplace(arg); } }; class SYGVDX_INPLACE : public SYGVDX_HEGVDX_INPLACE { }; class HEGVDX_INPLACE : public SYGVDX_HEGVDX_INPLACE { }; // non-batch tests TEST_P(SYGVDX_INPLACE, __float) { run_tests(); } TEST_P(SYGVDX_INPLACE, __double) { run_tests(); } TEST_P(HEGVDX_INPLACE, __float_complex) { run_tests(); } TEST_P(HEGVDX_INPLACE, __double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, SYGVDX_INPLACE, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYGVDX_INPLACE, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEGVDX_INPLACE, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEGVDX_INPLACE, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); rocSOLVER-rocm-5.5.1/clients/gtest/sygvj_hegvj_gtest.cpp000066400000000000000000000107371436600607200232400ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_sygvj_hegvj.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> sygvj_tuple; // each matrix_size_range is a {n, lda, ldb, singular} // if singular = 1, then the used matrix for the tests is not positive definite // each type_range is a {itype, evect, uplo} // case when n = 0, itype = 1, evect = 'N', and uplo = U will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> type_range = {{'1', 'N', 'U'}, {'2', 'N', 'L'}, {'3', 'N', 'U'}, {'1', 'V', 'L'}, {'2', 'V', 'U'}, {'3', 'V', 'L'}}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 1, 0}, // invalid {-1, 1, 1, 0}, {20, 5, 5, 0}, // normal (valid) samples {20, 30, 20, 1}, {35, 35, 35, 0}, {50, 50, 60, 1}}; // for daily_lapack tests const vector> large_matrix_size_range = { {192, 192, 192, 0}, {256, 270, 256, 0}, {300, 300, 310, 0}, }; Arguments sygvj_setup_arguments(sygvj_tuple tup) { vector matrix_size = std::get<0>(tup); vector type = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("ldb", matrix_size[2]); arg.set("itype", type[0]); arg.set("evect", type[1]); arg.set("uplo", type[2]); arg.set("abstol", 0); arg.set("max_sweeps", 100); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_size[3]; return arg; } class SYGVJ_HEGVJ : public ::TestWithParam { protected: SYGVJ_HEGVJ() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = sygvj_setup_arguments(GetParam()); if(arg.peek("itype") == '1' && arg.peek("evect") == 'N' && arg.peek("uplo") == 'U' && arg.peek("n") == 0) testing_sygvj_hegvj_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); if(arg.singular == 1) testing_sygvj_hegvj(arg); arg.singular = 0; testing_sygvj_hegvj(arg); } }; class SYGVJ : public SYGVJ_HEGVJ { }; class HEGVJ : public SYGVJ_HEGVJ { }; // non-batch tests TEST_P(SYGVJ, __float) { run_tests(); } TEST_P(SYGVJ, __double) { run_tests(); } TEST_P(HEGVJ, __float_complex) { run_tests(); } TEST_P(HEGVJ, __double_complex) { run_tests(); } // batched tests TEST_P(SYGVJ, batched__float) { run_tests(); } TEST_P(SYGVJ, batched__double) { run_tests(); } TEST_P(HEGVJ, batched__float_complex) { run_tests(); } TEST_P(HEGVJ, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(SYGVJ, strided_batched__float) { run_tests(); } TEST_P(SYGVJ, strided_batched__double) { run_tests(); } TEST_P(HEGVJ, strided_batched__float_complex) { run_tests(); } TEST_P(HEGVJ, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, SYGVJ, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYGVJ, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEGVJ, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEGVJ, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); rocSOLVER-rocm-5.5.1/clients/gtest/sygvx_hegvx_gtest.cpp000066400000000000000000000122241436600607200232650ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ #include "testing_sygvx_hegvx.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, vector> sygvx_tuple; // each matrix_size_range is a {n, lda, ldb, ldz, vl, vu, il, iu, singular} // if singular = 1, then the used matrix for the tests is not positive definite // each type_range is a {itype, evect, erange, uplo} // case when n = 0, itype = 1, evect = 'N', erange = 'A', and uplo = U will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector> type_range = {{'1', 'N', 'A', 'U'}, {'2', 'N', 'V', 'L'}, {'3', 'N', 'I', 'U'}, {'1', 'V', 'V', 'L'}, {'2', 'V', 'I', 'U'}, {'3', 'V', 'A', 'L'}}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 1, 1, 0, 10, 1, 0, 0}, // invalid {-1, 1, 1, 1, 0, 10, 1, 1, 0}, {20, 5, 5, 20, 0, 10, 1, 1, 0}, {20, 20, 20, 5, 0, 10, 1, 1, 0}, // valid only when erange=A {20, 20, 20, 20, 10, 0, 10, 1, 0}, // normal (valid) samples {20, 30, 20, 20, 5, 15, 1, 10, 1}, {35, 35, 35, 35, -10, 10, 1, 35, 0}, {50, 50, 60, 70, -15, -5, 25, 50, 1}}; // for daily_lapack tests const vector> large_matrix_size_range = { {192, 192, 192, 192, 5, 15, 100, 150, 0}, {256, 270, 256, 260, -10, 10, 1, 100, 0}, {300, 300, 310, 320, -15, -5, 200, 300, 0}, }; template Arguments sygvx_setup_arguments(sygvx_tuple tup) { using S = decltype(std::real(T{})); vector matrix_size = std::get<0>(tup); vector type = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("ldb", matrix_size[2]); arg.set("ldz", matrix_size[3]); arg.set("vl", matrix_size[4]); arg.set("vu", matrix_size[5]); arg.set("il", matrix_size[6]); arg.set("iu", matrix_size[7]); arg.set("itype", type[0]); arg.set("evect", type[1]); arg.set("erange", type[2]); arg.set("uplo", type[3]); arg.set("abstol", 0); 
// only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_size[8]; return arg; } class SYGVX_HEGVX : public ::TestWithParam { protected: SYGVX_HEGVX() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = sygvx_setup_arguments(GetParam()); if(arg.peek("itype") == '1' && arg.peek("evect") == 'N' && arg.peek("erange") == 'A' && arg.peek("uplo") == 'U' && arg.peek("n") == 0) testing_sygvx_hegvx_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_sygvx_hegvx(arg); arg.singular = 0; testing_sygvx_hegvx(arg); } }; class SYGVX : public SYGVX_HEGVX { }; class HEGVX : public SYGVX_HEGVX { }; // non-batch tests TEST_P(SYGVX, __float) { run_tests(); } TEST_P(SYGVX, __double) { run_tests(); } TEST_P(HEGVX, __float_complex) { run_tests(); } TEST_P(HEGVX, __double_complex) { run_tests(); } // batched tests TEST_P(SYGVX, batched__float) { run_tests(); } TEST_P(SYGVX, batched__double) { run_tests(); } TEST_P(HEGVX, batched__float_complex) { run_tests(); } TEST_P(HEGVX, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(SYGVX, strided_batched__float) { run_tests(); } TEST_P(SYGVX, strided_batched__double) { run_tests(); } TEST_P(HEGVX, strided_batched__float_complex) { run_tests(); } TEST_P(HEGVX, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, SYGVX, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYGVX, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HEGVX, Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEGVX, Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); rocSOLVER-rocm-5.5.1/clients/gtest/sytf2_sytrf_gtest.cpp000066400000000000000000000120461436600607200232040ustar00rootroot00000000000000/* 
************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_sytf2_sytrf.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, printable_char> sytrf_tuple; // each matrix_size_range vector is a {n, lda, singular} // if singular = 1, then the used matrix for the tests is singular // each uplo_range is a {uplo} // case when n = 0 and uplo = L will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector uplo_range = {'L', 'U'}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 0}, // invalid {-1, 1, 0}, {20, 5, 0}, // normal (valid) samples {32, 32, 1}, {50, 50, 0}, {70, 100, 1}}; // for daily_lapack tests const vector> large_matrix_size_range = { {192, 192, 1}, {640, 640, 0}, {1000, 1024, 1}, }; Arguments sytrf_setup_arguments(sytrf_tuple tup) { vector matrix_size = std::get<0>(tup); char uplo = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("uplo", uplo); // only testing standard use case/defaults for strides arg.timing = 0; arg.singular = matrix_size[2]; return arg; } template class SYTF2_SYTRF : public ::TestWithParam { protected: SYTF2_SYTRF() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = sytrf_setup_arguments(GetParam()); if(arg.peek("uplo") == 'L' && arg.peek("n") == 0) testing_sytf2_sytrf_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); if(arg.singular == 1) testing_sytf2_sytrf(arg); arg.singular = 0; testing_sytf2_sytrf(arg); } }; class SYTF2 : public SYTF2_SYTRF { }; class SYTRF : public SYTF2_SYTRF { }; // non-batch tests TEST_P(SYTF2, __float) { run_tests(); } TEST_P(SYTF2, __double) { run_tests(); } TEST_P(SYTF2, __float_complex) { run_tests(); } TEST_P(SYTF2, __double_complex) { run_tests(); } TEST_P(SYTRF, __float) { run_tests(); } TEST_P(SYTRF, __double) { run_tests(); } TEST_P(SYTRF, __float_complex) { run_tests(); } TEST_P(SYTRF, __double_complex) { run_tests(); } // batched tests TEST_P(SYTF2, batched__float) { run_tests(); } TEST_P(SYTF2, batched__double) { run_tests(); } TEST_P(SYTF2, batched__float_complex) { run_tests(); } TEST_P(SYTF2, batched__double_complex) { run_tests(); } TEST_P(SYTRF, batched__float) { run_tests(); } TEST_P(SYTRF, batched__double) { run_tests(); } TEST_P(SYTRF, batched__float_complex) { run_tests(); } TEST_P(SYTRF, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(SYTF2, strided_batched__float) { run_tests(); } TEST_P(SYTF2, strided_batched__double) { run_tests(); } TEST_P(SYTF2, strided_batched__float_complex) { run_tests(); } TEST_P(SYTF2, strided_batched__double_complex) { run_tests(); } TEST_P(SYTRF, strided_batched__float) { run_tests(); } TEST_P(SYTRF, strided_batched__double) { run_tests(); } TEST_P(SYTRF, strided_batched__float_complex) { run_tests(); } TEST_P(SYTRF, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, SYTF2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYTF2, Combine(ValuesIn(matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, SYTRF, Combine(ValuesIn(large_matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYTRF, Combine(ValuesIn(matrix_size_range), ValuesIn(uplo_range))); 
rocSOLVER-rocm-5.5.1/clients/gtest/sytxx_hetxx_gtest.cpp000066400000000000000000000127541436600607200233330ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_sytxx_hetxx.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, printable_char> sytrd_tuple; // each matrix_size_range is a {n, lda} // case when n = 0 and uplo = U will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector uplo_range = {'L', 'U'}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1}, // invalid {-1, 1}, {20, 5}, // normal (valid) samples {50, 50}, {70, 100}, {130, 130}, {150, 200}}; // for daily_lapack tests const vector> large_matrix_size_range = { {152, 152}, {640, 640}, {1000, 1024}, }; Arguments sytrd_setup_arguments(sytrd_tuple tup) { vector matrix_size = std::get<0>(tup); char uplo = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", matrix_size[1]); arg.set("uplo", uplo); // only testing standard use case/defaults for strides arg.timing = 0; return arg; } template class SYTXX_HETXX : public ::TestWithParam { protected: SYTXX_HETXX() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = sytrd_setup_arguments(GetParam()); if(arg.peek("uplo") == 'U' && arg.peek("n") == 0) testing_sytxx_hetxx_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 
3 : 1); testing_sytxx_hetxx(arg); } }; class SYTD2 : public SYTXX_HETXX { }; class HETD2 : public SYTXX_HETXX { }; class SYTRD : public SYTXX_HETXX { }; class HETRD : public SYTXX_HETXX { }; // non-batch tests TEST_P(SYTD2, __float) { run_tests(); } TEST_P(SYTD2, __double) { run_tests(); } TEST_P(HETD2, __float_complex) { run_tests(); } TEST_P(HETD2, __double_complex) { run_tests(); } TEST_P(SYTRD, __float) { run_tests(); } TEST_P(SYTRD, __double) { run_tests(); } TEST_P(HETRD, __float_complex) { run_tests(); } TEST_P(HETRD, __double_complex) { run_tests(); } // batched tests TEST_P(SYTD2, batched__float) { run_tests(); } TEST_P(SYTD2, batched__double) { run_tests(); } TEST_P(HETD2, batched__float_complex) { run_tests(); } TEST_P(HETD2, batched__double_complex) { run_tests(); } TEST_P(SYTRD, batched__float) { run_tests(); } TEST_P(SYTRD, batched__double) { run_tests(); } TEST_P(HETRD, batched__float_complex) { run_tests(); } TEST_P(HETRD, batched__double_complex) { run_tests(); } // strided_batched cases TEST_P(SYTD2, strided_batched__float) { run_tests(); } TEST_P(SYTD2, strided_batched__double) { run_tests(); } TEST_P(HETD2, strided_batched__float_complex) { run_tests(); } TEST_P(HETD2, strided_batched__double_complex) { run_tests(); } TEST_P(SYTRD, strided_batched__float) { run_tests(); } TEST_P(SYTRD, strided_batched__double) { run_tests(); } TEST_P(HETRD, strided_batched__float_complex) { run_tests(); } TEST_P(HETRD, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, SYTD2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYTD2, Combine(ValuesIn(matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HETD2, Combine(ValuesIn(large_matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HETD2, Combine(ValuesIn(matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, SYTRD, 
Combine(ValuesIn(large_matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYTRD, Combine(ValuesIn(matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(daily_lapack, HETRD, Combine(ValuesIn(large_matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, HETRD, Combine(ValuesIn(matrix_size_range), ValuesIn(uplo_range))); rocSOLVER-rocm-5.5.1/clients/gtest/trtri_gtest.cpp000066400000000000000000000077121436600607200220560ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include "testing_trtri.hpp" using ::testing::Combine; using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using namespace std; typedef std::tuple, printable_char> trtri_tuple; // each matrix_size_range vector is a {n, lda, singular/diag} // if singular = 0, then the used matrix for the tests is triangular unit // if singular = 1, then the used matrix for the tests is triangular non-unit and singular // otherwise, the used matrix is triangular non-unit and not singular // each uplo_range is {uplo} // case when n = 0 and uplo = L will also execute the bad arguments test // (null handle, null pointers and invalid values) const vector uplo_range = {'L', 'U'}; // for checkin_lapack tests const vector> matrix_size_range = { // quick return {0, 1, 0}, // invalid {-1, 1, 0}, {20, 5, 0}, // normal (valid) samples {20, 32, 0}, {30, 30, 1}, {40, 60, 2}, {80, 80, 2}, {90, 100, 1}, {100, 150, 0}}; // for daily_lapack tests const vector> large_matrix_size_range = {{192, 192, 1}, {500, 600, 2}, {640, 640, 0}, {1000, 1024, 1}, {1200, 1230, 2}}; Arguments trtri_setup_arguments(trtri_tuple tup) { vector matrix_size = std::get<0>(tup); char uplo = std::get<1>(tup); Arguments arg; arg.set("n", matrix_size[0]); arg.set("lda", 
matrix_size[1]); arg.set("uplo", uplo); if(matrix_size[2] == 0) arg.set("diag", 'U'); else arg.set("diag", 'N'); if(matrix_size[2] == 1) arg.singular = 1; else arg.singular = 0; // only testing standard use case/defaults for strides arg.timing = 0; return arg; } class TRTRI : public ::TestWithParam { protected: TRTRI() {} virtual void SetUp() {} virtual void TearDown() {} template void run_tests() { Arguments arg = trtri_setup_arguments(GetParam()); if(arg.peek("n") == 0 && arg.peek("uplo") == 'L') testing_trtri_bad_arg(); arg.batch_count = (BATCHED || STRIDED ? 3 : 1); if(arg.singular == 1) testing_trtri(arg); arg.singular = 0; testing_trtri(arg); } }; // non-batch tests TEST_P(TRTRI, __float) { run_tests(); } TEST_P(TRTRI, __double) { run_tests(); } TEST_P(TRTRI, __float_complex) { run_tests(); } TEST_P(TRTRI, __double_complex) { run_tests(); } // batched tests TEST_P(TRTRI, batched__float) { run_tests(); } TEST_P(TRTRI, batched__double) { run_tests(); } TEST_P(TRTRI, batched__float_complex) { run_tests(); } TEST_P(TRTRI, batched__double_complex) { run_tests(); } // strided_batched tests TEST_P(TRTRI, strided_batched__float) { run_tests(); } TEST_P(TRTRI, strided_batched__double) { run_tests(); } TEST_P(TRTRI, strided_batched__float_complex) { run_tests(); } TEST_P(TRTRI, strided_batched__double_complex) { run_tests(); } INSTANTIATE_TEST_SUITE_P(daily_lapack, TRTRI, Combine(ValuesIn(large_matrix_size_range), ValuesIn(uplo_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, TRTRI, Combine(ValuesIn(matrix_size_range), ValuesIn(uplo_range))); rocSOLVER-rocm-5.5.1/clients/include/000077500000000000000000000000001436600607200172665ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/clients/include/client_environment_helpers.hpp000066400000000000000000000017131436600607200254250ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include #include #include bool set_environment_variable(const char* name, const char* value); bool unset_environment_variable(const char* name); class environment_error : public std::runtime_error { public: explicit environment_error(const std::string& what_arg) : std::runtime_error(what_arg) { } }; class scoped_envvar { std::string m_name; std::optional m_old_value; public: scoped_envvar(const char* name, const char* value); scoped_envvar(const scoped_envvar&) = delete; scoped_envvar(scoped_envvar&&) = delete; ~scoped_envvar(); scoped_envvar& operator=(const scoped_envvar&) = delete; scoped_envvar& operator=(scoped_envvar&&) = delete; }; rocSOLVER-rocm-5.5.1/clients/include/client_util.hpp000066400000000000000000000072461436600607200223230ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include /* Utility macros for explicit template instantiations. These macros together provide a system for consisely instantiating a template with all possible combinations of template parameters. The first argument to every function is a STAMP() macro function that will be used to emit the text of the declaration. Every other macro function used in this technique just makes function calls to other macros. The stamp is the only component that directly creates text. The stamp function is always taken as the first argument because it must be forwarded through the entire chain of calls down to APPLY_STAMP(). */ /* The final function in the pipeline is always APPLY_STAMP(), which is the function that actually calls/expands the stamp macro. It takes the stamp function as the first argument and it just calls the stamp function. */ #define APPLY_STAMP(STAMP, ...) 
STAMP(__VA_ARGS__) /* The FOREACH functions are best understood by examining their arguments. The first argument is the STAMP function, which is just forwarded to the next function. The second argument, F, is the next function in the pipeline. All remaining arguments, __VA_ARGS__, are the arguments that should be forwarded to the next function. Each function just calls the next function in the pipeline, appending whatever arguments it wants to add. By calling the next function twice, it can double the number of times the stamp is instantiated. By appending different values in each call to the next function, it can instantiate the stamp with multiple different values. */ #define FOREACH_BLOCKED_VARIANT(STAMP, F, ...) \ F(STAMP, ##__VA_ARGS__, false) \ F(STAMP, ##__VA_ARGS__, true) #define FOREACH_REAL_TYPE(STAMP, F, ...) \ F(STAMP, ##__VA_ARGS__, float) \ F(STAMP, ##__VA_ARGS__, double) #define FOREACH_COMPLEX_TYPE(STAMP, F, ...) \ F(STAMP, ##__VA_ARGS__, rocblas_float_complex) \ F(STAMP, ##__VA_ARGS__, rocblas_double_complex) #define FOREACH_SCALAR_TYPE(STAMP, F, ...) \ F(STAMP, ##__VA_ARGS__, float) \ F(STAMP, ##__VA_ARGS__, double) \ F(STAMP, ##__VA_ARGS__, rocblas_float_complex) \ F(STAMP, ##__VA_ARGS__, rocblas_double_complex) #define FOREACH_MATRIX_DATA_LAYOUT(STAMP, F, ...) \ F(STAMP, ##__VA_ARGS__, false, false) // single \ F(STAMP, ##__VA_ARGS__, true, true) // batched \ F(STAMP, ##__VA_ARGS__, false, true) // strided_batched /* This macro is not strictly necessary. It's does the same thing as any of the FOREACH functions, but it doesn't append any values. It exists as the top-level function of the macro pipeline only so that all the FOREACH functions can appear as arguments, rather than having the first FOREACH appear different from the others. */ #define INSTANTIATE(STAMP, F, ...) F(STAMP, __VA_ARGS__) /* To describe what's happening within one of these macro pipelines in in another way, the list of arguments starts off as a list of functions. 
Each function pops the next function from the front of the list, appends the values it's adding to the end of the list, and then calls the next function with the list as its arguments. This continues with the number of functions at the head shrinking and the number of values at the tail growing. That pattern ends with the call to APPLY_STAMP(). At that point, there should be no functions remaining and all arguments are values for the STAMP. */ rocSOLVER-rocm-5.5.1/clients/include/clientcommon.hpp000066400000000000000000000013751436600607200224740ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2019-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once // common #include "common_host_helpers.hpp" #include "rocsolver_datatype2string.hpp" // rocblas common #include "rocblascommon/clients_utility.hpp" #include "rocblascommon/rocblas_test.hpp" #include "rocblascommon/rocblas_vector.hpp" //#include "rocblascommon/device_vector.hpp" //#include "rocblascommon/device_batch_vector.hpp" //#include "rocblascommon/device_strided_batch_vector.hpp" //#include "rocblascommon/host_vector.hpp" //#include "rocblascommon/host_batch_vector.hpp" //#include "rocblascommon/host_strided_batch_vector.hpp" rocSOLVER-rocm-5.5.1/clients/include/lapack_host_reference.hpp000066400000000000000000000605751436600607200243220ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************/ #pragma once #include "clientcommon.hpp" #include "rocsolver_datatype2string.hpp" #include /*!\file * \brief provide template functions interfaces to BLAS and LAPACK interfaces, * it is only used for testing not part of the GPU library */ /*template void cpu_iamax(rocblas_int n, const T *x, rocblas_int incx, rocblas_int *result); template void cpu_iamin(rocblas_int n, const T *x, rocblas_int incx, rocblas_int *result); template void cpu_asum(rocblas_int n, const T1 *x, rocblas_int incx, T2 *result); template void cpu_axpy(rocblas_int n, const T alpha, T *x, rocblas_int incx, T *y, rocblas_int incy); template void cpu_copy(rocblas_int n, T *x, rocblas_int incx, T *y, rocblas_int incy); template void cpu_dot(rocblas_int n, const T *x, rocblas_int incx, const T *y, rocblas_int incy, T *result); template void cpu_nrm2(rocblas_int n, const T1 *x, rocblas_int incx, T2 *result); template void cpu_scal(rocblas_int n, const T alpha, T *x, rocblas_int incx); template void cpu_swap(rocblas_int n, T *x, rocblas_int incx, T *y, rocblas_int incy); template void cpu_ger(rocblas_int m, rocblas_int n, T alpha, T *x, rocblas_int incx, T *y, rocblas_int incy, T *A, rocblas_int lda); template void cpu_syr(rocblas_fill uplo, rocblas_int n, T alpha, T *x, rocblas_int incx, T *A, rocblas_int lda); */ template void cpu_gemv(rocblas_operation transA, rocblas_int m, rocblas_int n, T alpha, T* A, rocblas_int lda, T* x, rocblas_int incx, T beta, T* y, rocblas_int incy); template void cpu_gemm(rocblas_operation transA, rocblas_operation transB, rocblas_int m, rocblas_int n, rocblas_int k, T alpha, T* A, rocblas_int lda, T* B, rocblas_int ldb, T beta, T* C, rocblas_int ldc); template void cpu_symv_hemv(rocblas_fill uplo, rocblas_int n, T alpha, T* A, rocblas_int lda, T* x, rocblas_int incx, T beta, T* y, rocblas_int incy); template void cpu_symm_hemm(rocblas_side side, rocblas_fill uplo, rocblas_int m, rocblas_int n, T 
alpha, T* A, rocblas_int lda, T* B, rocblas_int ldb, T beta, T* C, rocblas_int ldc); template void cpu_trsm(rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, T alpha, T* A, rocblas_int lda, T* B, rocblas_int ldb); /* template void cpu_trsv(rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int n, T* A, rocblas_int lda, T* x, rocblas_int incx); */ template void cpu_trmm(rocblas_side side, rocblas_fill uplo, rocblas_operation transA, rocblas_diagonal diag, rocblas_int m, rocblas_int n, T alpha, T* A, rocblas_int lda, T* B, rocblas_int ldb); template void cpu_potf2(rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, rocblas_int* info); template void cpu_potrf(rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, rocblas_int* info); template void cpu_potrs(rocblas_fill uplo, rocblas_int n, rocblas_int nrhs, T* A, rocblas_int lda, T* B, rocblas_int ldb); template void cpu_posv(rocblas_fill uplo, rocblas_int n, rocblas_int nrhs, T* A, rocblas_int lda, T* B, rocblas_int ldb, rocblas_int* info); template void cpu_potri(rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, rocblas_int* info); template void cpu_getf2(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info); template void cpu_getrf(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info); template void cpu_getrs(rocblas_operation trans, rocblas_int n, rocblas_int nrhs, T* A, rocblas_int lda, rocblas_int* ipiv, T* B, rocblas_int ldb); template void cpu_gesv(rocblas_int n, rocblas_int nrhs, T* A, rocblas_int lda, rocblas_int* ipiv, T* B, rocblas_int ldb, rocblas_int* info); template void cpu_gels(rocblas_operation transR, rocblas_int m, rocblas_int n, rocblas_int nrhs, T* A, rocblas_int lda, T* B, rocblas_int ldb, T* work, rocblas_int lwork, rocblas_int* info); template void cpu_getri(rocblas_int n, T* A, rocblas_int lda, rocblas_int* ipiv, T* 
work, rocblas_int lwork, rocblas_int* info); template void cpu_trtri(rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, T* A, rocblas_int lda, rocblas_int* info); template void cpu_larfg(rocblas_int n, T* alpha, T* x, rocblas_int incx, T* tau); template void cpu_larf(rocblas_side side, rocblas_int m, rocblas_int n, T* x, rocblas_int incx, T* alpha, T* A, rocblas_int lda, T* work); template void cpu_larft(rocblas_direct direct, rocblas_storev storev, rocblas_int n, rocblas_int k, T* V, rocblas_int ldv, T* tau, T* F, rocblas_int ldt); template void cpu_larfb(rocblas_side side, rocblas_operation trans, rocblas_direct direct, rocblas_storev storev, rocblas_int m, rocblas_int n, rocblas_int k, T* V, rocblas_int ldv, T* F, rocblas_int ldt, T* A, rocblas_int lda, T* W, rocblas_int ldw); template void cpu_latrd(rocblas_fill uplo, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, S* E, T* tau, T* W, rocblas_int ldw); template void cpu_labrd(rocblas_int m, rocblas_int n, rocblas_int nb, T* A, rocblas_int lda, S* D, S* E, T* tauq, T* taup, T* X, rocblas_int ldx, T* Y, rocblas_int ldy); template void cpu_bdsqr(rocblas_fill uplo, rocblas_int n, rocblas_int nv, rocblas_int nu, rocblas_int nc, W* D, W* E, T* V, rocblas_int ldv, T* U, rocblas_int ldu, T* C, rocblas_int ldc, W* work, rocblas_int* info); template void cpu_geqr2(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, T* ipiv, T* work); template void cpu_geqrf(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, T* ipiv, T* work, rocblas_int sizeW); template void cpu_gerq2(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, T* ipiv, T* work); template void cpu_gerqf(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, T* ipiv, T* work, rocblas_int sizeW); template void cpu_geql2(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, T* ipiv, T* work); template void cpu_geqlf(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, T* ipiv, T* work, rocblas_int sizeW); template void cpu_gelq2(rocblas_int m, 
rocblas_int n, T* A, rocblas_int lda, T* ipiv, T* work); template void cpu_gelqf(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, T* ipiv, T* work, rocblas_int sizeW); template void cpu_lacgv(rocblas_int n, T* x, rocblas_int incx); template void cpu_laswp(rocblas_int n, T* A, rocblas_int lda, rocblas_int k1, rocblas_int k2, rocblas_int* ipiv, rocblas_int inc); template void cpu_org2r_ung2r(rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* work); template void cpu_orgqr_ungqr(rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* work, rocblas_int sizeW); template void cpu_orgl2_ungl2(rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* work); template void cpu_orglq_unglq(rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* work, rocblas_int sizeW); template void cpu_org2l_ung2l(rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* work); template void cpu_orgql_ungql(rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* work, rocblas_int sizeW); template void cpu_orgbr_ungbr(rocblas_storev storev, rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* work, rocblas_int size_w); template void cpu_orgtr_ungtr(rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, T* Ipiv, T* work, rocblas_int size_w); template void cpu_orm2r_unm2r(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* C, rocblas_int ldc, T* work); template void cpu_ormqr_unmqr(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* C, rocblas_int ldc, T* work, rocblas_int sizeW); template void cpu_orml2_unml2(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* C, rocblas_int ldc, T* work); template void 
cpu_ormlq_unmlq(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* C, rocblas_int ldc, T* work, rocblas_int sizeW); template void cpu_lauum(rocblas_fill uploR, rocblas_int n, T* A, rocblas_int lda); template void cpu_orm2l_unm2l(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* C, rocblas_int ldc, T* work); template void cpu_ormql_unmql(rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* C, rocblas_int ldc, T* work, rocblas_int sizeW); template void cpu_ormbr_unmbr(rocblas_storev storev, rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, T* A, rocblas_int lda, T* Ipiv, T* C, rocblas_int ldc, T* work, rocblas_int sizeW); template void cpu_ormtr_unmtr(rocblas_side side, rocblas_fill uplo, rocblas_operation trans, rocblas_int m, rocblas_int n, T* A, rocblas_int lda, T* Ipiv, T* C, rocblas_int ldc, T* work, rocblas_int sizeW); template void cpu_gebd2(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, S* D, S* E, T* tauq, T* taup, T* work); template void cpu_gebrd(rocblas_int m, rocblas_int n, T* A, rocblas_int lda, S* D, S* E, T* tauq, T* taup, T* work, rocblas_int size_w); template void cpu_sytrd_hetrd(rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, S* D, S* E, T* tau, T* work, rocblas_int size_w); template void cpu_sytd2_hetd2(rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, S* D, S* E, T* tau); template void cpu_gesvd(rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, T* A, rocblas_int lda, W* S, T* U, rocblas_int ldu, T* V, rocblas_int ldv, T* work, rocblas_int lwork, W* rwork, rocblas_int* info); template void cpu_sterf(rocblas_int n, T* D, T* E); template void cpu_steqr(rocblas_evect evect, rocblas_int n, S* D, S* E, T* C, rocblas_int ldc, S* work, rocblas_int* info); 
template void cpu_stedc(rocblas_evect evect, rocblas_int n, S* D, S* E, T* C, rocblas_int ldc, T* work, rocblas_int lwork, S* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info); template void cpu_stebz(rocblas_erange erange, rocblas_eorder eorder, rocblas_int n, T vl, T vu, rocblas_int il, rocblas_int iu, T abstol, T* D, T* E, rocblas_int* nev, rocblas_int* nsplit, T* W, rocblas_int* iblock, rocblas_int* isplit, T* work, rocblas_int* iwork, rocblas_int* info); template void cpu_stein(rocblas_int n, S* D, S* E, rocblas_int* nev, S* W, rocblas_int* iblock, rocblas_int* isplit, T* Z, rocblas_int ldz, S* work, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info); template void cpu_sygs2_hegs2(rocblas_eform itype, rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, T* B, rocblas_int ldb); template void cpu_sygst_hegst(rocblas_eform itype, rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, T* B, rocblas_int ldb); template void cpu_syev_heev(rocblas_evect evect, rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, S* W, T* work, rocblas_int lwork, S* rwork, rocblas_int lrwork, rocblas_int* info); template void cpu_syevd_heevd(rocblas_evect evect, rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, S* W, T* work, rocblas_int lwork, S* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info); template void cpu_syevx_heevx(rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, S vl, S vu, rocblas_int il, rocblas_int iu, S abstol, rocblas_int* nev, S* W, T* Z, rocblas_int ldz, T* work, rocblas_int lwork, S* rwork, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info); template void cpu_sygv_hegv(rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, T* B, rocblas_int ldb, S* W, T* work, rocblas_int lwork, S* rwork, rocblas_int* info); template void cpu_sygvd_hegvd(rocblas_eform itype, 
rocblas_evect evect, rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, T* B, rocblas_int ldb, S* W, T* work, rocblas_int lwork, S* rwork, rocblas_int lrwork, rocblas_int* iwork, rocblas_int liwork, rocblas_int* info); template void cpu_sygvx_hegvx(rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, T* B, rocblas_int ldb, S vl, S vu, rocblas_int il, rocblas_int iu, S abstol, rocblas_int* m, S* W, T* Z, rocblas_int ldz, T* work, rocblas_int lwork, S* rwork, rocblas_int* iwork, rocblas_int* ifail, rocblas_int* info); template void cpu_lasyf(rocblas_fill uplo, rocblas_int n, rocblas_int nb, rocblas_int* kb, T* A, rocblas_int lda, rocblas_int* ipiv, T* W, rocblas_int ldw, rocblas_int* info); template void cpu_sytf2(rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, rocblas_int* ipiv, rocblas_int* info); template void cpu_sytrf(rocblas_fill uplo, rocblas_int n, T* A, rocblas_int lda, rocblas_int* ipiv, T* work, rocblas_int lwork, rocblas_int* info); template void cpu_bdsvdx(rocblas_fill uplo, rocblas_svect svect, rocblas_srange srange, rocblas_int n, T* D, T* E, T vl, T vu, rocblas_int il, rocblas_int iu, rocblas_int* nsv, T* S, T* Z, rocblas_int ldz, T* work, rocblas_int* iwork, rocblas_int* info); template void cpu_gesvdx(rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, T* A, rocblas_int lda, W vl, W vu, rocblas_int il, rocblas_int iu, rocblas_int* nsv, W* S, T* U, rocblas_int ldu, T* V, rocblas_int ldv, T* work, rocblas_int lwork, W* rwork, rocblas_int* iwork, rocblas_int* info); rocSOLVER-rocm-5.5.1/clients/include/norm.hpp000066400000000000000000000135241436600607200207570ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include #include #include "clientcommon.hpp" /* LAPACK fortran library functionality */ extern "C" { float slange_(char* norm_type, int* m, int* n, float* A, int* lda, float* work); double dlange_(char* norm_type, int* m, int* n, double* A, int* lda, double* work); float clange_(char* norm_type, int* m, int* n, rocblas_float_complex* A, int* lda, float* work); double zlange_(char* norm_type, int* m, int* n, rocblas_double_complex* A, int* lda, double* work); void daxpy_(int* n, double* alpha, double* x, int* incx, double* y, int* incy); void zaxpy_(int* n, rocblas_double_complex* alpha, rocblas_double_complex* x, int* incx, rocblas_double_complex* y, int* incy); } inline float xlange(char* norm_type, int* m, int* n, float* A, int* lda, float* work) { return slange_(norm_type, m, n, A, lda, work); } inline double xlange(char* norm_type, int* m, int* n, double* A, int* lda, double* work) { return dlange_(norm_type, m, n, A, lda, work); } inline float xlange(char* norm_type, int* m, int* n, rocblas_float_complex* A, int* lda, float* work) { return clange_(norm_type, m, n, A, lda, work); } inline double xlange(char* norm_type, int* m, int* n, rocblas_double_complex* A, int* lda, double* work) { return zlange_(norm_type, m, n, A, lda, work); } inline void xaxpy(int* n, double* alpha, double* x, int* incx, double* y, int* incy) { return daxpy_(n, alpha, x, incx, y, incy); } inline void xaxpy(int* n, rocblas_double_complex* alpha, rocblas_double_complex* x, int* incx, rocblas_double_complex* y, int* incy) { return zaxpy_(n, alpha, x, incx, y, incy); } /* Norm of error functions */ template , int> = 0> double norm_error(char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda_gold, T* gold, T* comp, rocblas_int lda_comp = 0) { // norm type can be 'O', 'I', 'F', 'o', 'i', 'f' for one, infinity or // Frobenius norm one norm is max column sum infinity norm is max row sum // 
Frobenius is l2 norm of matrix entries rocblas_int lda = M; lda_comp = lda_comp > 0 ? lda_comp : lda_gold; host_vector gold_double(N * lda); host_vector comp_double(N * lda); for(rocblas_int i = 0; i < M; i++) { for(rocblas_int j = 0; j < N; j++) { gold_double[i + j * lda] = double(gold[i + j * lda_gold]); comp_double[i + j * lda] = double(comp[i + j * lda_comp]); } } std::vector work(M); rocblas_int incx = 1; double alpha = -1.0; rocblas_int size = lda * N; double gold_norm = xlange(&norm_type, &M, &N, gold_double.data(), &lda, work.data()); xaxpy(&size, &alpha, gold_double.data(), &incx, comp_double.data(), &incx); double error = xlange(&norm_type, &M, &N, comp_double.data(), &lda, work.data()); if(gold_norm > 0) error /= gold_norm; return error; } template , int> = 0> double norm_error(char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda_gold, T* gold, T* comp, rocblas_int lda_comp = 0) { // norm type can be 'O', 'I', 'F', 'o', 'i', 'f' for one, infinity or // Frobenius norm one norm is max column sum infinity norm is max row sum // Frobenius is l2 norm of matrix entries rocblas_int lda = M; lda_comp = lda_comp > 0 ? 
lda_comp : lda_gold; host_vector gold_double(N * lda); host_vector comp_double(N * lda); for(rocblas_int i = 0; i < M; i++) { for(rocblas_int j = 0; j < N; j++) { gold_double[i + j * lda] = rocblas_double_complex(gold[i + j * lda_gold]); comp_double[i + j * lda] = rocblas_double_complex(comp[i + j * lda_comp]); } } std::vector work(M); rocblas_int incx = 1; rocblas_double_complex alpha = -1.0; rocblas_int size = lda * N; double gold_norm = xlange(&norm_type, &M, &N, gold_double.data(), &lda, work.data()); xaxpy(&size, &alpha, gold_double.data(), &incx, comp_double.data(), &incx); double error = xlange(&norm_type, &M, &N, comp_double.data(), &lda, work.data()); if(gold_norm > 0) error /= gold_norm; return error; } template double norm_error_upperTr(char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, T* gold, T* comp) { for(rocblas_int i = 0; i < M; ++i) { for(rocblas_int j = 0; j < N; ++j) { if(i > j) { gold[i + j * lda] = T(0); comp[i + j * lda] = T(0); } } } return norm_error(norm_type, M, N, lda, gold, comp); } template double norm_error_lowerTr(char norm_type, rocblas_int M, rocblas_int N, rocblas_int lda, T* gold, T* comp) { for(rocblas_int i = 0; i < M; ++i) { for(rocblas_int j = 0; j < N; ++j) { if(i < j) { gold[i + j * lda] = T(0); comp[i + j * lda] = T(0); } } } return norm_error(norm_type, M, N, lda, gold, comp); } template S snorm(char norm_type, rocblas_int m, rocblas_int n, T* A, rocblas_int lda) { return xlange(&norm_type, &m, &n, A, &lda, (S*)nullptr); } rocSOLVER-rocm-5.5.1/clients/include/rocsolver.hpp000066400000000000000000017027301436600607200220270ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include #include "clientcommon.hpp" // Most functions within this file exist to provide a consistent interface for our templated tests. 
// Function overloading is used to select between the float, double, rocblas_float_complex // and rocblas_double_complex variants, and to distinguish the batched case (T**) from the normal // and strided_batched cases (T*). // // The normal and strided_batched cases are distinguished from each other by passing a boolean // parameter, STRIDED. Variants such as the blocked and unblocked versions of algorithms, may be // provided in similar ways. /***** Functions not included in the public API that must be declared *****/ #ifdef __cplusplus extern "C" { #endif rocblas_status rocsolver_sgeqrf_ptr_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float* const A[], const rocblas_int lda, float* const ipiv[], const rocblas_int batch_count); rocblas_status rocsolver_dgeqrf_ptr_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double* const A[], const rocblas_int lda, double* const ipiv[], const rocblas_int batch_count); rocblas_status rocsolver_cgeqrf_ptr_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex* const A[], const rocblas_int lda, rocblas_float_complex* const ipiv[], const rocblas_int batch_count); rocblas_status rocsolver_zgeqrf_ptr_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex* const A[], const rocblas_int lda, rocblas_double_complex* const ipiv[], const rocblas_int batch_count); rocblas_status rocsolver_sgesv_outofplace(rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, float* A, const rocblas_int lda, rocblas_int* ipiv, float* B, const rocblas_int ldb, float* X, const rocblas_int ldx, rocblas_int* info); rocblas_status rocsolver_dgesv_outofplace(rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, double* A, const rocblas_int lda, rocblas_int* ipiv, double* B, const rocblas_int ldb, double* X, const rocblas_int ldx, rocblas_int* info); rocblas_status rocsolver_cgesv_outofplace(rocblas_handle 
handle, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex* A, const rocblas_int lda, rocblas_int* ipiv, rocblas_float_complex* B, const rocblas_int ldb, rocblas_float_complex* X, const rocblas_int ldx, rocblas_int* info); rocblas_status rocsolver_zgesv_outofplace(rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex* A, const rocblas_int lda, rocblas_int* ipiv, rocblas_double_complex* B, const rocblas_int ldb, rocblas_double_complex* X, const rocblas_int ldx, rocblas_int* info); rocblas_status rocsolver_sgels_outofplace(rocblas_handle handle, rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, float* A, const rocblas_int lda, float* B, const rocblas_int ldb, float* X, const rocblas_int ldx, rocblas_int* info); rocblas_status rocsolver_dgels_outofplace(rocblas_handle handle, rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, double* A, const rocblas_int lda, double* B, const rocblas_int ldb, double* X, const rocblas_int ldx, rocblas_int* info); rocblas_status rocsolver_cgels_outofplace(rocblas_handle handle, rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex* A, const rocblas_int lda, rocblas_float_complex* B, const rocblas_int ldb, rocblas_float_complex* X, const rocblas_int ldx, rocblas_int* info); rocblas_status rocsolver_zgels_outofplace(rocblas_handle handle, rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex* A, const rocblas_int lda, rocblas_double_complex* B, const rocblas_int ldb, rocblas_double_complex* X, const rocblas_int ldx, rocblas_int* info); rocblas_status rocsolver_ssyevdx_inplace(rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, float* A, const rocblas_int lda, const float vl, const float vu, const rocblas_int il, const 
rocblas_int iu, const float abstol, rocblas_int* nev, float* W, rocblas_int* info); rocblas_status rocsolver_dsyevdx_inplace(rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, double* A, const rocblas_int lda, const double vl, const double vu, const rocblas_int il, const rocblas_int iu, const double abstol, rocblas_int* nev, double* W, rocblas_int* info); rocblas_status rocsolver_cheevdx_inplace(rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, rocblas_float_complex* A, const rocblas_int lda, const float vl, const float vu, const rocblas_int il, const rocblas_int iu, const float abstol, rocblas_int* nev, float* W, rocblas_int* info); rocblas_status rocsolver_zheevdx_inplace(rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, rocblas_double_complex* A, const rocblas_int lda, const double vl, const double vu, const rocblas_int il, const rocblas_int iu, const double abstol, rocblas_int* nev, double* W, rocblas_int* info); rocblas_status rocsolver_ssygvdx_inplace(rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, float* A, const rocblas_int lda, float* B, const rocblas_int ldb, const float vl, const float vu, const rocblas_int il, const rocblas_int iu, const float abstol, rocblas_int* h_nev, float* W, rocblas_int* info); rocblas_status rocsolver_dsygvdx_inplace(rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, double* A, const rocblas_int lda, double* B, const rocblas_int ldb, const double vl, const double vu, const rocblas_int il, const rocblas_int iu, const double abstol, rocblas_int* h_nev, double* W, rocblas_int* info); rocblas_status 
rocsolver_chegvdx_inplace(rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, rocblas_float_complex* A, const rocblas_int lda, rocblas_float_complex* B, const rocblas_int ldb, const float vl, const float vu, const rocblas_int il, const rocblas_int iu, const float abstol, rocblas_int* h_nev, float* W, rocblas_int* info); rocblas_status rocsolver_zhegvdx_inplace(rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, rocblas_double_complex* A, const rocblas_int lda, rocblas_double_complex* B, const rocblas_int ldb, const double vl, const double vu, const rocblas_int il, const rocblas_int iu, const double abstol, rocblas_int* h_nev, double* W, rocblas_int* info); rocblas_status rocsolver_sgesvdj_notransv(rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, float* A, const rocblas_int lda, const float abstol, float* residual, const rocblas_int max_sweeps, rocblas_int* n_sweeps, float* S, float* U, const rocblas_int ldu, float* V, const rocblas_int ldv, rocblas_int* info); rocblas_status rocsolver_dgesvdj_notransv(rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, double* A, const rocblas_int lda, const double abstol, double* residual, const rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, double* U, const rocblas_int ldu, double* V, const rocblas_int ldv, rocblas_int* info); rocblas_status rocsolver_cgesvdj_notransv(rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, rocblas_float_complex* A, const rocblas_int lda, const float abstol, float* residual, const rocblas_int max_sweeps, rocblas_int* n_sweeps, float* S, rocblas_float_complex* U, 
const rocblas_int ldu, rocblas_float_complex* V, const rocblas_int ldv, rocblas_int* info); rocblas_status rocsolver_zgesvdj_notransv(rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, rocblas_double_complex* A, const rocblas_int lda, const double abstol, double* residual, const rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, rocblas_double_complex* U, const rocblas_int ldu, rocblas_double_complex* V, const rocblas_int ldv, rocblas_int* info); rocblas_status rocsolver_sgesvdj_notransv_strided_batched(rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, float* A, const rocblas_int lda, const rocblas_stride strideA, const float abstol, float* residual, const rocblas_int max_sweeps, rocblas_int* n_sweeps, float* S, const rocblas_stride strideS, float* U, const rocblas_int ldu, const rocblas_stride strideU, float* V, const rocblas_int ldv, const rocblas_stride strideV, rocblas_int* info, const rocblas_int batch_count); rocblas_status rocsolver_dgesvdj_notransv_strided_batched(rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, double* A, const rocblas_int lda, const rocblas_stride strideA, const double abstol, double* residual, const rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, const rocblas_stride strideS, double* U, const rocblas_int ldu, const rocblas_stride strideU, double* V, const rocblas_int ldv, const rocblas_stride strideV, rocblas_int* info, const rocblas_int batch_count); rocblas_status rocsolver_cgesvdj_notransv_strided_batched(rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, rocblas_float_complex* A, const rocblas_int lda, const rocblas_stride strideA, const float abstol, float* residual, const rocblas_int max_sweeps, 
rocblas_int* n_sweeps, float* S, const rocblas_stride strideS, rocblas_float_complex* U,
    const rocblas_int ldu, const rocblas_stride strideU, rocblas_float_complex* V,
    const rocblas_int ldv, const rocblas_stride strideV, rocblas_int* info,
    const rocblas_int batch_count);

rocblas_status rocsolver_zgesvdj_notransv_strided_batched(rocblas_handle handle,
    const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m,
    const rocblas_int n, rocblas_double_complex* A, const rocblas_int lda,
    const rocblas_stride strideA, const double abstol, double* residual,
    const rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, const rocblas_stride strideS,
    rocblas_double_complex* U, const rocblas_int ldu, const rocblas_stride strideU,
    rocblas_double_complex* V, const rocblas_int ldv, const rocblas_stride strideV,
    rocblas_int* info, const rocblas_int batch_count);

#ifdef __cplusplus
}
#endif
/***************************************************/

/******************** LACGV ********************/
// Overloads selected by the element type of x. Each one forwards its arguments unchanged to
// rocsolver_clacgv / rocsolver_zlacgv and returns that call's status. Only the two complex
// types have an overload in this group.
inline rocblas_status rocsolver_lacgv(rocblas_handle handle, rocblas_int n,
    rocblas_float_complex* x, rocblas_int incx)
{
    return rocsolver_clacgv(handle, n, x, incx);
}

inline rocblas_status rocsolver_lacgv(rocblas_handle handle, rocblas_int n,
    rocblas_double_complex* x, rocblas_int incx)
{
    return rocsolver_zlacgv(handle, n, x, incx);
}
/*****************************************************/

/******************** LASWP ********************/
// Overloads selected by the element type of A. Each one forwards its arguments unchanged to
// the matching rocsolver_{s,d,c,z}laswp function and returns that call's status.
inline rocblas_status rocsolver_laswp(rocblas_handle handle, rocblas_int n, float* A,
    rocblas_int lda, rocblas_int k1, rocblas_int k2, rocblas_int* ipiv, rocblas_int inc)
{
    return rocsolver_slaswp(handle, n, A, lda, k1, k2, ipiv, inc);
}

inline rocblas_status rocsolver_laswp(rocblas_handle handle, rocblas_int n, double* A,
    rocblas_int lda, rocblas_int k1, rocblas_int k2, rocblas_int* ipiv, rocblas_int inc)
{
    return rocsolver_dlaswp(handle, n, A, lda, k1, k2, ipiv, inc);
}

inline rocblas_status rocsolver_laswp(rocblas_handle handle, rocblas_int n,
    rocblas_float_complex* A, rocblas_int lda, rocblas_int k1, rocblas_int k2, rocblas_int* ipiv,
    rocblas_int inc)
{
    return rocsolver_claswp(handle, n, A, lda, k1, k2, ipiv, inc);
}

inline rocblas_status rocsolver_laswp(rocblas_handle handle, rocblas_int n,
    rocblas_double_complex* A, rocblas_int lda, rocblas_int k1, rocblas_int k2, rocblas_int* ipiv,
    rocblas_int inc)
{
    return rocsolver_zlaswp(handle, n, A, lda, k1, k2, ipiv, inc);
}
/*****************************************************/

/******************** LARFG ********************/
// Overloads selected by the element type of alpha, x and tau. Each one forwards its arguments
// unchanged to the matching rocsolver_{s,d,c,z}larfg function and returns that call's status.
inline rocblas_status rocsolver_larfg(rocblas_handle handle, rocblas_int n, float* alpha,
    float* x, rocblas_int incx, float* tau)
{
    return rocsolver_slarfg(handle, n, alpha, x, incx, tau);
}

inline rocblas_status rocsolver_larfg(rocblas_handle handle, rocblas_int n, double* alpha,
    double* x, rocblas_int incx, double* tau)
{
    return rocsolver_dlarfg(handle, n, alpha, x, incx, tau);
}

inline rocblas_status rocsolver_larfg(rocblas_handle handle, rocblas_int n,
    rocblas_float_complex* alpha, rocblas_float_complex* x, rocblas_int incx,
    rocblas_float_complex* tau)
{
    return rocsolver_clarfg(handle, n, alpha, x, incx, tau);
}

inline rocblas_status rocsolver_larfg(rocblas_handle handle, rocblas_int n,
    rocblas_double_complex* alpha, rocblas_double_complex* x, rocblas_int incx,
    rocblas_double_complex* tau)
{
    return rocsolver_zlarfg(handle, n, alpha, x, incx, tau);
}
/*****************************************************/

/******************** LARF ********************/
// Overloads selected by the element type of x, alpha and A. Each one forwards its arguments
// unchanged to the matching rocsolver_{s,d,c,z}larf function and returns that call's status.
inline rocblas_status rocsolver_larf(rocblas_handle handle, rocblas_side side, rocblas_int m,
    rocblas_int n, float* x, rocblas_int incx, float* alpha, float* A, rocblas_int lda)
{
    return rocsolver_slarf(handle, side, m, n, x, incx, alpha, A, lda);
}

inline rocblas_status rocsolver_larf(rocblas_handle handle, rocblas_side side, rocblas_int m,
    rocblas_int n, double* x, rocblas_int incx, double* alpha, double* A, rocblas_int lda)
{
    return rocsolver_dlarf(handle, side, m, n, x, incx, alpha, A, lda);
}

inline rocblas_status rocsolver_larf(rocblas_handle handle, rocblas_side side, rocblas_int m,
    rocblas_int n, rocblas_float_complex* x, rocblas_int incx, rocblas_float_complex* alpha,
    rocblas_float_complex* A, rocblas_int lda)
{
    return rocsolver_clarf(handle, side, m, n, x, incx, alpha, A, lda);
}

inline rocblas_status rocsolver_larf(rocblas_handle handle, rocblas_side side, rocblas_int m,
    rocblas_int n, rocblas_double_complex* x, rocblas_int incx, rocblas_double_complex* alpha,
    rocblas_double_complex* A, rocblas_int lda)
{
    return rocsolver_zlarf(handle, side, m, n, x, incx, alpha, A, lda);
}
/*****************************************************/

/******************** LARFT ********************/
// Overloads selected by the element type of V, tau and F. Each one forwards its arguments
// unchanged to the matching rocsolver_{s,d,c,z}larft function and returns that call's status.
inline rocblas_status rocsolver_larft(rocblas_handle handle, rocblas_direct direct,
    rocblas_storev storev, rocblas_int n, rocblas_int k, float* V, rocblas_int ldv, float* tau,
    float* F, rocblas_int ldt)
{
    return rocsolver_slarft(handle, direct, storev, n, k, V, ldv, tau, F, ldt);
}

inline rocblas_status rocsolver_larft(rocblas_handle handle, rocblas_direct direct,
    rocblas_storev storev, rocblas_int n, rocblas_int k, double* V, rocblas_int ldv, double* tau,
    double* F, rocblas_int ldt)
{
    return rocsolver_dlarft(handle, direct, storev, n, k, V, ldv, tau, F, ldt);
}

inline rocblas_status rocsolver_larft(rocblas_handle handle, rocblas_direct direct,
    rocblas_storev storev, rocblas_int n, rocblas_int k, rocblas_float_complex* V,
    rocblas_int ldv, rocblas_float_complex* tau, rocblas_float_complex* F, rocblas_int ldt)
{
    return rocsolver_clarft(handle, direct, storev, n, k, V, ldv, tau, F, ldt);
}

inline rocblas_status rocsolver_larft(rocblas_handle handle, rocblas_direct direct,
    rocblas_storev storev, rocblas_int n, rocblas_int k, rocblas_double_complex* V,
    rocblas_int ldv, rocblas_double_complex* tau, rocblas_double_complex* F, rocblas_int ldt)
{
    return rocsolver_zlarft(handle, direct, storev, n, k, V, ldv, tau, F, ldt);
}
/*****************************************************/

/******************** LARFB ********************/
// One overload per element type of V, F and A; the call is passed straight through to the
// rocsolver_{s,d,c,z}larfb function of the same precision.
inline rocblas_status rocsolver_larfb(rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_direct direct, rocblas_storev storev, rocblas_int m,
    rocblas_int n, rocblas_int k, float* V, rocblas_int ldv, float* F, rocblas_int ldt, float* A,
    rocblas_int lda)
{
    return rocsolver_slarfb(handle, side, trans, direct, storev, m, n, k, V, ldv, F, ldt, A, lda);
}

inline rocblas_status rocsolver_larfb(rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_direct direct, rocblas_storev storev, rocblas_int m,
    rocblas_int n, rocblas_int k, double* V, rocblas_int ldv, double* F, rocblas_int ldt,
    double* A, rocblas_int lda)
{
    return rocsolver_dlarfb(handle, side, trans, direct, storev, m, n, k, V, ldv, F, ldt, A, lda);
}

inline rocblas_status rocsolver_larfb(rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_direct direct, rocblas_storev storev, rocblas_int m,
    rocblas_int n, rocblas_int k, rocblas_float_complex* V, rocblas_int ldv,
    rocblas_float_complex* F, rocblas_int ldt, rocblas_float_complex* A, rocblas_int lda)
{
    return rocsolver_clarfb(handle, side, trans, direct, storev, m, n, k, V, ldv, F, ldt, A, lda);
}

inline rocblas_status rocsolver_larfb(rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_direct direct, rocblas_storev storev, rocblas_int m,
    rocblas_int n, rocblas_int k, rocblas_double_complex* V, rocblas_int ldv,
    rocblas_double_complex* F, rocblas_int ldt, rocblas_double_complex* A, rocblas_int lda)
{
    return rocsolver_zlarfb(handle, side, trans, direct, storev, m, n, k, V, ldv, F, ldt, A, lda);
}
/***************************************************************/

/******************** LAUUM ********************/
// One overload per element type of A (float and double only in this group); the call is
// passed straight through to rocsolver_slauum / rocsolver_dlauum.
inline rocblas_status rocsolver_lauum(rocblas_handle handle, const rocblas_fill uplo,
    const rocblas_int n, float* A, const rocblas_int lda)
{
    return rocsolver_slauum(handle, uplo, n, A, lda);
}

inline rocblas_status rocsolver_lauum(rocblas_handle handle, const rocblas_fill uplo,
    const rocblas_int n, double* A, const rocblas_int lda)
{
    return rocsolver_dlauum(handle, uplo, n, A, lda);
}
/***************************************************************/

/******************** BDSQR ********************/
// One overload per element type of V, U and C (D and E stay real in the complex overloads);
// the call is passed straight through to the rocsolver_{s,d,c,z}bdsqr function.
inline rocblas_status rocsolver_bdsqr(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int nv, rocblas_int nu, rocblas_int nc, float* D, float* E, float* V, rocblas_int ldv,
    float* U, rocblas_int ldu, float* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_sbdsqr(handle, uplo, n, nv, nu, nc, D, E, V, ldv, U, ldu, C, ldc, info);
}

inline rocblas_status rocsolver_bdsqr(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int nv, rocblas_int nu, rocblas_int nc, double* D, double* E, double* V,
    rocblas_int ldv, double* U, rocblas_int ldu, double* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_dbdsqr(handle, uplo, n, nv, nu, nc, D, E, V, ldv, U, ldu, C, ldc, info);
}

inline rocblas_status rocsolver_bdsqr(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int nv, rocblas_int nu, rocblas_int nc, float* D, float* E, rocblas_float_complex* V,
    rocblas_int ldv, rocblas_float_complex* U, rocblas_int ldu, rocblas_float_complex* C,
    rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_cbdsqr(handle, uplo, n, nv, nu, nc, D, E, V, ldv, U, ldu, C, ldc, info);
}

inline rocblas_status rocsolver_bdsqr(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int nv, rocblas_int nu, rocblas_int nc, double* D, double* E,
    rocblas_double_complex* V, rocblas_int ldv, rocblas_double_complex* U, rocblas_int ldu,
    rocblas_double_complex* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_zbdsqr(handle, uplo, n, nv, nu, nc, D, E, V, ldv, U, ldu, C, ldc, info);
}
/***************************************************************/

/******************** LATRD ********************/
// One overload per element type of A, tau and W (E stays real in the complex overloads);
// the call is passed straight through to the rocsolver_{s,d,c,z}latrd function.
inline rocblas_status rocsolver_latrd(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int k, float* A, rocblas_int lda, float* E, float* tau, float* W, rocblas_int ldw)
{
    return rocsolver_slatrd(handle, uplo, n, k, A, lda, E, tau, W, ldw);
}

inline rocblas_status rocsolver_latrd(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int k, double* A, rocblas_int lda, double* E, double* tau, double* W, rocblas_int ldw)
{
    return rocsolver_dlatrd(handle, uplo, n, k, A, lda, E, tau, W, ldw);
}

inline rocblas_status rocsolver_latrd(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int k, rocblas_float_complex* A, rocblas_int lda, float* E,
    rocblas_float_complex* tau, rocblas_float_complex* W, rocblas_int ldw)
{
    return rocsolver_clatrd(handle, uplo, n, k, A, lda, E, tau, W, ldw);
}

inline rocblas_status rocsolver_latrd(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int k, rocblas_double_complex* A, rocblas_int lda, double* E,
    rocblas_double_complex* tau, rocblas_double_complex* W, rocblas_int ldw)
{
    return rocsolver_zlatrd(handle, uplo, n, k, A, lda, E, tau, W, ldw);
}
/***************************************************************/

/******************** LABRD ********************/
// One overload per element type of A, tauq, taup, X and Y (D and E stay real in the complex
// overloads); the call is passed straight through to the rocsolver_{s,d,c,z}labrd function.
inline rocblas_status rocsolver_labrd(rocblas_handle handle, rocblas_int m, rocblas_int n,
    rocblas_int nb, float* A, rocblas_int lda, float* D, float* E, float* tauq, float* taup,
    float* X, rocblas_int ldx, float* Y, rocblas_int ldy)
{
    return rocsolver_slabrd(handle, m, n, nb, A, lda, D, E, tauq, taup, X, ldx, Y, ldy);
}

inline rocblas_status rocsolver_labrd(rocblas_handle handle, rocblas_int m, rocblas_int n,
    rocblas_int nb, double* A, rocblas_int lda, double* D, double* E, double* tauq, double* taup,
    double* X, rocblas_int ldx, double* Y, rocblas_int ldy)
{
    return rocsolver_dlabrd(handle, m, n, nb, A, lda, D, E, tauq, taup, X, ldx, Y, ldy);
}

inline rocblas_status rocsolver_labrd(rocblas_handle handle, rocblas_int m, rocblas_int n,
    rocblas_int nb, rocblas_float_complex* A, rocblas_int lda, float* D, float* E,
    rocblas_float_complex* tauq, rocblas_float_complex* taup, rocblas_float_complex* X,
    rocblas_int ldx, rocblas_float_complex* Y, rocblas_int ldy)
{
    return rocsolver_clabrd(handle, m, n, nb, A, lda, D, E, tauq, taup, X, ldx, Y, ldy);
}

inline rocblas_status rocsolver_labrd(rocblas_handle handle, rocblas_int m, rocblas_int n,
    rocblas_int nb, rocblas_double_complex* A, rocblas_int lda, double* D, double* E,
    rocblas_double_complex* tauq, rocblas_double_complex* taup, rocblas_double_complex* X,
    rocblas_int ldx, rocblas_double_complex* Y, rocblas_int ldy)
{
    return rocsolver_zlabrd(handle, m, n, nb, A, lda, D, E, tauq, taup, X, ldx, Y, ldy);
}
/***************************************************************/

/******************** ORGxR_UNGxR ********************/
// GQR picks the routine: true calls rocsolver_{s,d}orgqr / rocsolver_{c,z}ungqr, false calls
// rocsolver_{s,d}org2r / rocsolver_{c,z}ung2r. The element type of A and Ipiv picks the
// precision. The remaining arguments are forwarded unchanged.
inline rocblas_status rocsolver_orgxr_ungxr(bool GQR, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* Ipiv)
{
    if(GQR)
        return rocsolver_sorgqr(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_sorg2r(handle, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgxr_ungxr(bool GQR, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* Ipiv)
{
    if(GQR)
        return rocsolver_dorgqr(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_dorg2r(handle, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgxr_ungxr(bool GQR, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda,
    rocblas_float_complex* Ipiv)
{
    if(GQR)
        return rocsolver_cungqr(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_cung2r(handle, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgxr_ungxr(bool GQR, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda,
    rocblas_double_complex* Ipiv)
{
    if(GQR)
        return rocsolver_zungqr(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_zung2r(handle, m, n, k, A, lda, Ipiv);
}
/***************************************************************/

/******************** ORGLx_UNGLx ********************/
// GLQ picks the routine: true calls rocsolver_{s,d}orglq / rocsolver_{c,z}unglq, false calls
// rocsolver_{s,d}orgl2 / rocsolver_{c,z}ungl2. The element type of A and Ipiv picks the
// precision. The remaining arguments are forwarded unchanged.
inline rocblas_status rocsolver_orglx_unglx(bool GLQ, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* Ipiv)
{
    if(GLQ)
        return rocsolver_sorglq(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_sorgl2(handle, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orglx_unglx(bool GLQ, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* Ipiv)
{
    if(GLQ)
        return rocsolver_dorglq(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_dorgl2(handle, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orglx_unglx(bool GLQ, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda,
    rocblas_float_complex* Ipiv)
{
    if(GLQ)
        return rocsolver_cunglq(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_cungl2(handle, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orglx_unglx(bool GLQ, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda,
    rocblas_double_complex* Ipiv)
{
    if(GLQ)
        return rocsolver_zunglq(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_zungl2(handle, m, n, k, A, lda, Ipiv);
}
/***************************************************************/

/******************** ORGxL_UNGxL ********************/
// GQL picks the routine: true calls rocsolver_{s,d}orgql / rocsolver_{c,z}ungql, false calls
// rocsolver_{s,d}org2l / rocsolver_{c,z}ung2l. The element type of A and Ipiv picks the
// precision. The remaining arguments are forwarded unchanged.
inline rocblas_status rocsolver_orgxl_ungxl(bool GQL, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* Ipiv)
{
    if(GQL)
        return rocsolver_sorgql(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_sorg2l(handle, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgxl_ungxl(bool GQL, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* Ipiv)
{
    if(GQL)
        return rocsolver_dorgql(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_dorg2l(handle, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgxl_ungxl(bool GQL, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda,
    rocblas_float_complex* Ipiv)
{
    if(GQL)
        return rocsolver_cungql(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_cung2l(handle, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgxl_ungxl(bool GQL, rocblas_handle handle, rocblas_int m,
    rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda,
    rocblas_double_complex* Ipiv)
{
    if(GQL)
        return rocsolver_zungql(handle, m, n, k, A, lda, Ipiv);
    return rocsolver_zung2l(handle, m, n, k, A, lda, Ipiv);
}
/***************************************************************/

/******************** ORGBR_UNGBR ********************/
// One overload per element type of A and Ipiv; the call is passed straight through to
// rocsolver_{s,d}orgbr / rocsolver_{c,z}ungbr.
inline rocblas_status rocsolver_orgbr_ungbr(rocblas_handle handle, rocblas_storev storev,
    rocblas_int m, rocblas_int n, rocblas_int k, float* A, rocblas_int lda, float* Ipiv)
{
    return rocsolver_sorgbr(handle, storev, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgbr_ungbr(rocblas_handle handle, rocblas_storev storev,
    rocblas_int m, rocblas_int n, rocblas_int k, double* A, rocblas_int lda, double* Ipiv)
{
    return rocsolver_dorgbr(handle, storev, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgbr_ungbr(rocblas_handle handle, rocblas_storev storev,
    rocblas_int m, rocblas_int n, rocblas_int k, rocblas_float_complex* A, rocblas_int lda,
    rocblas_float_complex* Ipiv)
{
    return rocsolver_cungbr(handle, storev, m, n, k, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgbr_ungbr(rocblas_handle handle, rocblas_storev storev,
    rocblas_int m, rocblas_int n, rocblas_int k, rocblas_double_complex* A, rocblas_int lda,
    rocblas_double_complex* Ipiv)
{
    return rocsolver_zungbr(handle, storev, m, n, k, A, lda, Ipiv);
}
/***************************************************************/

/******************** ORGTR_UNGTR 
********************/
// Real element types forward to rocsolver_{s,d}orgtr, complex ones to rocsolver_{c,z}ungtr.
inline rocblas_status rocsolver_orgtr_ungtr(rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, float* A, rocblas_int lda, float* Ipiv)
{
    return rocsolver_sorgtr(handle, uplo, n, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgtr_ungtr(rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, double* A, rocblas_int lda, double* Ipiv)
{
    return rocsolver_dorgtr(handle, uplo, n, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgtr_ungtr(rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* Ipiv)
{
    return rocsolver_cungtr(handle, uplo, n, A, lda, Ipiv);
}

inline rocblas_status rocsolver_orgtr_ungtr(rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* Ipiv)
{
    return rocsolver_zungtr(handle, uplo, n, A, lda, Ipiv);
}
/***************************************************************/

/******************** ORMxR_UNMxR ********************/
// MQR == true  -> ORMQR (real) / UNMQR (complex)
// MQR == false -> ORM2R (real) / UNM2R (complex)
inline rocblas_status rocsolver_ormxr_unmxr(bool MQR, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, float* A,
    rocblas_int lda, float* Ipiv, float* C, rocblas_int ldc)
{
    if(MQR)
        return rocsolver_sormqr(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_sorm2r(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormxr_unmxr(bool MQR, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, double* A,
    rocblas_int lda, double* Ipiv, double* C, rocblas_int ldc)
{
    if(MQR)
        return rocsolver_dormqr(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_dorm2r(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormxr_unmxr(bool MQR, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k,
    rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* Ipiv,
    rocblas_float_complex* C, rocblas_int ldc)
{
    if(MQR)
        return rocsolver_cunmqr(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_cunm2r(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormxr_unmxr(bool MQR, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k,
    rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* Ipiv,
    rocblas_double_complex* C, rocblas_int ldc)
{
    if(MQR)
        return rocsolver_zunmqr(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_zunm2r(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}
/***************************************************************/

/******************** ORMLx_UNMLx ********************/
// MLQ == true  -> ORMLQ (real) / UNMLQ (complex)
// MLQ == false -> ORML2 (real) / UNML2 (complex)
inline rocblas_status rocsolver_ormlx_unmlx(bool MLQ, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, float* A,
    rocblas_int lda, float* Ipiv, float* C, rocblas_int ldc)
{
    if(MLQ)
        return rocsolver_sormlq(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_sorml2(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormlx_unmlx(bool MLQ, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, double* A,
    rocblas_int lda, double* Ipiv, double* C, rocblas_int ldc)
{
    if(MLQ)
        return rocsolver_dormlq(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_dorml2(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormlx_unmlx(bool MLQ, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k,
    rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* Ipiv,
    rocblas_float_complex* C, rocblas_int ldc)
{
    if(MLQ)
        return rocsolver_cunmlq(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_cunml2(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormlx_unmlx(bool MLQ, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k,
    rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* Ipiv,
    rocblas_double_complex* C, rocblas_int ldc)
{
    if(MLQ)
        return rocsolver_zunmlq(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_zunml2(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}
/***************************************************************/

/******************** ORMxL_UNMxL ********************/
// MQL == true  -> ORMQL (real) / UNMQL (complex)
// MQL == false -> ORM2L (real) / UNM2L (complex)
inline rocblas_status rocsolver_ormxl_unmxl(bool MQL, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, float* A,
    rocblas_int lda, float* Ipiv, float* C, rocblas_int ldc)
{
    if(MQL)
        return rocsolver_sormql(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_sorm2l(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormxl_unmxl(bool MQL, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k, double* A,
    rocblas_int lda, double* Ipiv, double* C, rocblas_int ldc)
{
    if(MQL)
        return rocsolver_dormql(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_dorm2l(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormxl_unmxl(bool MQL, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k,
    rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* Ipiv,
    rocblas_float_complex* C, rocblas_int ldc)
{
    if(MQL)
        return rocsolver_cunmql(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_cunm2l(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormxl_unmxl(bool MQL, rocblas_handle handle, rocblas_side side,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k,
    rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* Ipiv,
    rocblas_double_complex* C, rocblas_int ldc)
{
    if(MQL)
        return rocsolver_zunmql(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
    return rocsolver_zunm2l(handle, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}
/***************************************************************/

/******************** ORMBR_UNMBR ********************/
// Real element types forward to rocsolver_{s,d}ormbr, complex ones to rocsolver_{c,z}unmbr.
inline rocblas_status rocsolver_ormbr_unmbr(rocblas_handle handle, rocblas_storev storev,
    rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k,
    float* A, rocblas_int lda, float* Ipiv, float* C, rocblas_int ldc)
{
    return rocsolver_sormbr(handle, storev, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormbr_unmbr(rocblas_handle handle, rocblas_storev storev,
    rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k,
    double* A, rocblas_int lda, double* Ipiv, double* C, rocblas_int ldc)
{
    return rocsolver_dormbr(handle, storev, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormbr_unmbr(rocblas_handle handle, rocblas_storev storev,
    rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k,
    rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* Ipiv,
    rocblas_float_complex* C, rocblas_int ldc)
{
    return rocsolver_cunmbr(handle, storev, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormbr_unmbr(rocblas_handle handle, rocblas_storev storev,
    rocblas_side side, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int k,
    rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* Ipiv,
    rocblas_double_complex* C, rocblas_int ldc)
{
    return rocsolver_zunmbr(handle, storev, side, trans, m, n, k, A, lda, Ipiv, C, ldc);
}
/***************************************************************/

/******************** ORMTR_UNMTR ********************/
// Real element types forward to rocsolver_{s,d}ormtr, complex ones to rocsolver_{c,z}unmtr.
inline rocblas_status rocsolver_ormtr_unmtr(rocblas_handle handle, rocblas_side side,
    rocblas_fill uplo, rocblas_operation trans, rocblas_int m, rocblas_int n, float* A,
    rocblas_int lda, float* Ipiv, float* C, rocblas_int ldc)
{
    return rocsolver_sormtr(handle, side, uplo, trans, m, n, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormtr_unmtr(rocblas_handle handle, rocblas_side side,
    rocblas_fill uplo, rocblas_operation trans, rocblas_int m, rocblas_int n, double* A,
    rocblas_int lda, double* Ipiv, double* C, rocblas_int ldc)
{
    return rocsolver_dormtr(handle, side, uplo, trans, m, n, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormtr_unmtr(rocblas_handle handle, rocblas_side side,
    rocblas_fill uplo, rocblas_operation trans, rocblas_int m, rocblas_int n,
    rocblas_float_complex* A, rocblas_int lda, rocblas_float_complex* Ipiv,
    rocblas_float_complex* C, rocblas_int ldc)
{
    return rocsolver_cunmtr(handle, side, uplo, trans, m, n, A, lda, Ipiv, C, ldc);
}

inline rocblas_status rocsolver_ormtr_unmtr(rocblas_handle handle, rocblas_side side,
    rocblas_fill uplo, rocblas_operation trans, rocblas_int m, rocblas_int n,
    rocblas_double_complex* A, rocblas_int lda, rocblas_double_complex* Ipiv,
    rocblas_double_complex* C, rocblas_int ldc)
{
    return rocsolver_zunmtr(handle, side, uplo, trans, m, n, A, lda, Ipiv, C, ldc);
}
/***************************************************************/

/******************** STERF ********************/
// Only real overloads exist here: float -> rocsolver_ssterf, double -> rocsolver_dsterf.
inline rocblas_status
    rocsolver_sterf(rocblas_handle handle, rocblas_int n, float* D, float* E, rocblas_int* info)
{
    return rocsolver_ssterf(handle, n, D, E, info);
}

inline rocblas_status
    rocsolver_sterf(rocblas_handle handle, rocblas_int n, double* D, double* E, rocblas_int* info)
{
    return rocsolver_dsterf(handle, n, D, E, info);
}
/********************************************************/

/******************** STEBZ ********************/
// Only real overloads exist here: float -> rocsolver_sstebz, double -> rocsolver_dstebz.
inline rocblas_status rocsolver_stebz(rocblas_handle handle, rocblas_erange erange,
    rocblas_eorder eorder, rocblas_int n, float vl, float vu, rocblas_int il, rocblas_int iu,
    float abstol, float* D, float* E, rocblas_int* nev, rocblas_int* nsplit, float* W,
    rocblas_int* iblock, rocblas_int* isplit, rocblas_int* info)
{
    return rocsolver_sstebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, D, E, nev, nsplit,
                            W, iblock, isplit, info);
}

inline rocblas_status rocsolver_stebz(rocblas_handle handle, rocblas_erange erange,
    rocblas_eorder eorder, rocblas_int n, double vl, double vu, rocblas_int il, rocblas_int iu,
    double abstol, double* D, double* E, rocblas_int* nev, rocblas_int* nsplit, double* W,
    rocblas_int* iblock, rocblas_int* isplit, rocblas_int* info)
{
    return rocsolver_dstebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, D, E, nev, nsplit,
                            W, iblock, isplit, info);
}
/********************************************************/

/******************** STEQR ********************/
// The type of C selects rocsolver_{s,d,c,z}steqr; D and E are real in every overload.
inline rocblas_status rocsolver_steqr(rocblas_handle handle, rocblas_evect evect, rocblas_int n,
    float* D, float* E, float* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_ssteqr(handle, evect, n, D, E, C, ldc, info);
}

inline rocblas_status rocsolver_steqr(rocblas_handle handle, rocblas_evect evect, rocblas_int n,
    double* D, double* E, double* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_dsteqr(handle, evect, n, D, E, C, ldc, info);
}

inline rocblas_status rocsolver_steqr(rocblas_handle handle, rocblas_evect evect, rocblas_int n,
    float* D, float* E, rocblas_float_complex* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_csteqr(handle, evect, n, D, E, C, ldc, info);
}

inline rocblas_status rocsolver_steqr(rocblas_handle handle, rocblas_evect evect, rocblas_int n,
    double* D, double* E, rocblas_double_complex* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_zsteqr(handle, evect, n, D, E, C, ldc, info);
}
/********************************************************/

/******************** STEDC ********************/
// The type of C selects rocsolver_{s,d,c,z}stedc; D and E are real in every overload.
inline rocblas_status rocsolver_stedc(rocblas_handle handle, rocblas_evect evect, rocblas_int n,
    float* D, float* E, float* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_sstedc(handle, evect, n, D, E, C, ldc, info);
}

inline rocblas_status rocsolver_stedc(rocblas_handle handle, rocblas_evect evect, rocblas_int n,
    double* D, double* E, double* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_dstedc(handle, evect, n, D, E, C, ldc, info);
}

inline rocblas_status rocsolver_stedc(rocblas_handle handle, rocblas_evect evect, rocblas_int n,
    float* D, float* E, rocblas_float_complex* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_cstedc(handle, evect, n, D, E, C, ldc, info);
}

inline rocblas_status rocsolver_stedc(rocblas_handle handle, rocblas_evect evect, rocblas_int n,
    double* D, double* E, rocblas_double_complex* C, rocblas_int ldc, rocblas_int* info)
{
    return rocsolver_zstedc(handle, evect, n, D, E, C, ldc, info);
}
/********************************************************/

/******************** STEIN ********************/
// The type of Z selects rocsolver_{s,d,c,z}stein; D, E and W are real in every overload.
inline rocblas_status rocsolver_stein(rocblas_handle handle, rocblas_int n, float* D, float* E,
    rocblas_int* nev, float* W, rocblas_int* iblock, rocblas_int* isplit, float* Z,
    rocblas_int ldz, rocblas_int* ifail, rocblas_int* info)
{
    return rocsolver_sstein(handle, n, D, E, nev, W, iblock, isplit, Z, ldz, ifail, info);
}

inline rocblas_status rocsolver_stein(rocblas_handle handle, rocblas_int n, double* D, double* E,
    rocblas_int* nev, double* W, rocblas_int* iblock, rocblas_int* isplit, double* Z,
    rocblas_int ldz, rocblas_int* ifail, rocblas_int* info)
{
    return rocsolver_dstein(handle, n, D, E, nev, W, iblock, isplit, Z, ldz, ifail, info);
}

inline rocblas_status rocsolver_stein(rocblas_handle handle, rocblas_int n, float* D, float* E,
    rocblas_int* nev, float* W, rocblas_int* iblock, rocblas_int* isplit,
    rocblas_float_complex* Z, rocblas_int ldz, rocblas_int* ifail, rocblas_int* info)
{
    return rocsolver_cstein(handle, n, D, E, nev, W, iblock, isplit, Z, ldz, ifail, info);
}

inline rocblas_status rocsolver_stein(rocblas_handle handle, rocblas_int n, double* D, double* E,
    rocblas_int* nev, double* W, rocblas_int* iblock, rocblas_int* isplit,
    rocblas_double_complex* Z, rocblas_int ldz, rocblas_int* ifail, rocblas_int* info)
{
    return rocsolver_zstein(handle, n, D, E, nev, W, iblock, isplit, Z, ldz, ifail, info);
}
/********************************************************/

/******************** LASYF ********************/
// The element type of A selects rocsolver_{s,d,c,z}lasyf.
inline rocblas_status rocsolver_lasyf(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int nb, rocblas_int* kb, float* A, rocblas_int lda, rocblas_int* ipiv,
    rocblas_int* info)
{
    return rocsolver_slasyf(handle, uplo, n, nb, kb, A, lda, ipiv, info);
}

inline rocblas_status rocsolver_lasyf(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int nb, rocblas_int* kb, double* A, rocblas_int lda, rocblas_int* ipiv,
    rocblas_int* info)
{
    return rocsolver_dlasyf(handle, uplo, n, nb, kb, A, lda, ipiv, info);
}

inline rocblas_status rocsolver_lasyf(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int nb, rocblas_int* kb, rocblas_float_complex* A, rocblas_int lda,
    rocblas_int* ipiv, rocblas_int* info)
{
    return rocsolver_clasyf(handle, uplo, n, nb, kb, A, lda, ipiv, info);
}

inline rocblas_status rocsolver_lasyf(rocblas_handle handle, rocblas_fill uplo, rocblas_int n,
    rocblas_int nb, rocblas_int* kb, rocblas_double_complex* A, rocblas_int lda,
    rocblas_int* ipiv, rocblas_int* info)
{
    return rocsolver_zlasyf(handle, uplo, n, nb, kb, A, lda, ipiv, info);
}
/********************************************************/

/******************** BDSVDX ********************/
// Only real overloads exist here: float -> rocsolver_sbdsvdx, double -> rocsolver_dbdsvdx.
inline rocblas_status rocsolver_bdsvdx(rocblas_handle handle, rocblas_fill uplo,
    rocblas_svect svect, rocblas_srange srange, rocblas_int n, float* D, float* E, float vl,
    float vu, rocblas_int il, rocblas_int iu, rocblas_int* nsv, float* S, float* Z,
    const rocblas_int ldz, rocblas_int* ifail, rocblas_int* info)
{
    return rocsolver_sbdsvdx(handle, uplo, svect, srange, n, D, E, vl, vu, il, iu, nsv, S, Z, ldz,
                             ifail, info);
}

inline rocblas_status rocsolver_bdsvdx(rocblas_handle handle, rocblas_fill uplo,
    rocblas_svect svect, rocblas_srange srange, rocblas_int n, double* D, double* E, double vl,
    double vu, rocblas_int il, rocblas_int iu, rocblas_int* nsv, double* S, double* Z,
    const rocblas_int ldz, rocblas_int* ifail, rocblas_int* info)
{
    return rocsolver_dbdsvdx(handle, uplo, svect, srange, n, D, E, vl, vu, il, iu, nsv, S, Z, ldz,
                             ifail, info);
}
/********************************************************/

/******************** POTF2_POTRF ********************/
// normal and strided_batched
// STRIDED chooses between the strided_batched entry point and the single-matrix one;
// POTRF chooses between the potrf and potf2 routine. The single-matrix calls take
// neither stA nor batch_count.
inline rocblas_status rocsolver_potf2_potrf(bool STRIDED, bool POTRF, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int batch_count)
{
    if(STRIDED)
    {
        if(POTRF)
            return rocsolver_spotrf_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
        return rocsolver_spotf2_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
    }
    if(POTRF)
        return rocsolver_spotrf(handle, uplo, n, A, lda, info);
    return rocsolver_spotf2(handle, uplo, n, A, lda, info);
}

inline rocblas_status rocsolver_potf2_potrf(bool STRIDED, bool POTRF, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int batch_count)
{
    if(STRIDED)
    {
        if(POTRF)
            return rocsolver_dpotrf_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
        return rocsolver_dpotf2_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
    }
    if(POTRF)
        return rocsolver_dpotrf(handle, uplo, n, A, lda, info);
    return rocsolver_dpotf2(handle, uplo, n, A, lda, info);
}

inline rocblas_status rocsolver_potf2_potrf(bool STRIDED, bool POTRF, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_int* info, rocblas_int batch_count)
{
    if(STRIDED)
    {
        if(POTRF)
            return rocsolver_cpotrf_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
        return rocsolver_cpotf2_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
    }
    if(POTRF)
        return rocsolver_cpotrf(handle, uplo, n, A, lda, info);
    return rocsolver_cpotf2(handle, uplo, n, A, lda, info);
}

inline rocblas_status rocsolver_potf2_potrf(bool STRIDED, bool POTRF, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_int* info, rocblas_int batch_count)
{
    if(STRIDED)
    {
        if(POTRF)
            return rocsolver_zpotrf_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
        return rocsolver_zpotf2_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
    }
    if(POTRF)
        return rocsolver_zpotrf(handle, uplo, n, A, lda, info);
    return rocsolver_zpotf2(handle, uplo, n, A, lda, info);
}

// batched
// Selected when A is an array of pointers. STRIDED and stA are accepted but not used.
inline rocblas_status rocsolver_potf2_potrf(bool STRIDED, bool POTRF, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int batch_count)
{
    if(POTRF)
        return rocsolver_spotrf_batched(handle, uplo, n, A, lda, info, batch_count);
    return rocsolver_spotf2_batched(handle, uplo, n, A, lda, info, batch_count);
}

inline rocblas_status rocsolver_potf2_potrf(bool STRIDED, bool POTRF, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int batch_count)
{
    if(POTRF)
        return rocsolver_dpotrf_batched(handle, uplo, n, A, lda, info, batch_count);
    return rocsolver_dpotf2_batched(handle, uplo, n, A, lda, info, batch_count);
}

inline rocblas_status rocsolver_potf2_potrf(bool STRIDED, bool POTRF, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_int* info, rocblas_int batch_count)
{
    if(POTRF)
        return rocsolver_cpotrf_batched(handle, uplo, n, A, lda, info, batch_count);
    return rocsolver_cpotf2_batched(handle, uplo, n, A, lda, info, batch_count);
}

inline rocblas_status rocsolver_potf2_potrf(bool STRIDED, bool POTRF, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_int* info, rocblas_int batch_count)
{
    if(POTRF)
        return rocsolver_zpotrf_batched(handle, uplo, n, A, lda, info, batch_count);
    return rocsolver_zpotf2_batched(handle, uplo, n, A, lda, info, batch_count);
}
/********************************************************/

/******************** POTRS ********************/
// normal and strided_batched
// With STRIDED false the single-matrix routine is called; it takes neither the
// strides nor batch_count.
inline rocblas_status rocsolver_potrs(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, float* A, rocblas_int lda, rocblas_stride stA, float* B,
    rocblas_int ldb, rocblas_stride stB, rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_spotrs(handle, uplo, n, nrhs, A, lda, B, ldb);
    return rocsolver_spotrs_strided_batched(handle, uplo, n, nrhs, A, lda, stA, B, ldb, stB,
                                            batch_count);
}

inline rocblas_status rocsolver_potrs(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, double* A, rocblas_int lda, rocblas_stride stA, double* B,
    rocblas_int ldb, rocblas_stride stB, rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_dpotrs(handle, uplo, n, nrhs, A, lda, B, ldb);
    return rocsolver_dpotrs_strided_batched(handle, uplo, n, nrhs, A, lda, stA, B, ldb, stB,
                                            batch_count);
}

inline rocblas_status rocsolver_potrs(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB,
    rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_cpotrs(handle, uplo, n, nrhs, A, lda, B, ldb);
    return rocsolver_cpotrs_strided_batched(handle, uplo, n, nrhs, A, lda, stA, B, ldb, stB,
                                            batch_count);
}

inline rocblas_status rocsolver_potrs(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB,
    rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_zpotrs(handle, uplo, n, nrhs, A, lda, B, ldb);
    return rocsolver_zpotrs_strided_batched(handle, uplo, n, nrhs, A, lda, stA, B, ldb, stB,
                                            batch_count);
}

// batched
// Selected when A and B are arrays of pointers. STRIDED, stA and stB are accepted but not used.
inline rocblas_status rocsolver_potrs(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, float* const A[], rocblas_int lda, rocblas_stride stA,
    float* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int batch_count)
{
    return rocsolver_spotrs_batched(handle, uplo, n, nrhs, A, lda, B, ldb, batch_count);
}

inline rocblas_status rocsolver_potrs(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, double* const A[], rocblas_int lda, rocblas_stride stA,
    double* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int batch_count)
{
    return rocsolver_dpotrs_batched(handle, uplo, n, nrhs, A, lda, B, ldb, batch_count);
}

inline rocblas_status rocsolver_potrs(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB,
    rocblas_int batch_count)
{
    return rocsolver_cpotrs_batched(handle, uplo, n, nrhs, A, lda, B, ldb, batch_count);
}

inline rocblas_status rocsolver_potrs(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB,
    rocblas_int batch_count)
{
    return rocsolver_zpotrs_batched(handle, uplo, n, nrhs, A, lda, B, ldb, batch_count);
}
/********************************************************/

/******************** POSV ********************/
// normal and strided_batched
// With STRIDED false the single-matrix routine is called; it takes neither the
// strides nor batch_count.
inline rocblas_status rocsolver_posv(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, float* A, rocblas_int lda, rocblas_stride stA, float* B,
    rocblas_int ldb, rocblas_stride stB, rocblas_int* info, rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_sposv(handle, uplo, n, nrhs, A, lda, B, ldb, info);
    return rocsolver_sposv_strided_batched(handle, uplo, n, nrhs, A, lda, stA, B, ldb, stB, info,
                                           batch_count);
}

inline rocblas_status rocsolver_posv(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, double* A, rocblas_int lda, rocblas_stride stA, double* B,
    rocblas_int ldb, rocblas_stride stB, rocblas_int* info, rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_dposv(handle, uplo, n, nrhs, A, lda, B, ldb, info);
    return rocsolver_dposv_strided_batched(handle, uplo, n, nrhs, A, lda, stA, B, ldb, stB, info,
                                           batch_count);
}

inline rocblas_status rocsolver_posv(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB,
    rocblas_int* info, rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_cposv(handle, uplo, n, nrhs, A, lda, B, ldb, info);
    return rocsolver_cposv_strided_batched(handle, uplo, n, nrhs, A, lda, stA, B, ldb, stB, info,
                                           batch_count);
}

inline rocblas_status rocsolver_posv(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB,
    rocblas_int* info, rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_zposv(handle, uplo, n, nrhs, A, lda, B, ldb, info);
    return rocsolver_zposv_strided_batched(handle, uplo, n, nrhs, A, lda, stA, B, ldb, stB, info,
                                           batch_count);
}

// batched
// Selected when A and B are arrays of pointers. STRIDED, stA and stB are accepted but not used.
inline rocblas_status rocsolver_posv(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, float* const A[], rocblas_int lda, rocblas_stride stA,
    float* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int* info,
    rocblas_int batch_count)
{
    return rocsolver_sposv_batched(handle, uplo, n, nrhs, A, lda, B, ldb, info, batch_count);
}

inline rocblas_status rocsolver_posv(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, double* const A[], rocblas_int lda, rocblas_stride stA,
    double* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int* info,
    rocblas_int batch_count)
{
    return rocsolver_dposv_batched(handle, uplo, n, nrhs, A, lda, B, ldb, info, batch_count);
}

inline rocblas_status rocsolver_posv(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB,
    rocblas_int* info, rocblas_int batch_count)
{
    return rocsolver_cposv_batched(handle, uplo, n, nrhs, A, lda, B, ldb, info, batch_count);
}

inline rocblas_status rocsolver_posv(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_int nrhs, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB,
    rocblas_int* info, rocblas_int batch_count)
{
    return rocsolver_zposv_batched(handle, uplo, n, nrhs, A, lda, B, ldb, info, batch_count);
}
/********************************************************/

/******************** POTRI ********************/
// normal and strided_batched
// With STRIDED false the single-matrix routine is called; it takes neither stA
// nor batch_count.
inline rocblas_status rocsolver_potri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info,
    rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_spotri(handle, uplo, n, A, lda, info);
    return rocsolver_spotri_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
}

inline rocblas_status rocsolver_potri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info,
    rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_dpotri(handle, uplo, n, A, lda, info);
    return rocsolver_dpotri_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
}

inline rocblas_status rocsolver_potri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_cpotri(handle, uplo, n, A, lda, info);
    return rocsolver_cpotri_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
}

inline rocblas_status rocsolver_potri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int batch_count)
{
    if(!STRIDED)
        return rocsolver_zpotri(handle, uplo, n, A, lda, info);
    return rocsolver_zpotri_strided_batched(handle, uplo, n, A, lda, stA, info, batch_count);
}

// batched
// Selected when A is an array of pointers. STRIDED and stA are accepted but not used.
inline rocblas_status rocsolver_potri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info,
    rocblas_int batch_count)
{
    return rocsolver_spotri_batched(handle, uplo, n, A, lda, info, batch_count);
}

inline rocblas_status rocsolver_potri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info,
    rocblas_int batch_count)
{
    return rocsolver_dpotri_batched(handle, uplo, n, A, lda, info, batch_count);
}

inline rocblas_status rocsolver_potri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int batch_count)
{
    return rocsolver_cpotri_batched(handle, uplo, n, A, lda, info, batch_count);
}

inline rocblas_status rocsolver_potri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo,
    rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int batch_count)
{
    return rocsolver_zpotri_batched(handle, uplo, n, A, lda, info, batch_count);
}
/********************************************************/

/******************** GETF2_GETRF_NPVT ********************/
// normal and strided_batched
// STRIDED chooses between the strided_batched entry point and the single-matrix one;
// GETRF chooses between the getrf_npvt and getf2_npvt routine.
inline rocblas_status rocsolver_getf2_getrf_npvt(bool STRIDED, bool GETRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return GETRF ? rocsolver_sgetrf_npvt_strided_batched(handle, m, n, A, lda, stA, info, bc)
                     : rocsolver_sgetf2_npvt_strided_batched(handle, m, n, A, lda, stA, info, bc);
    else
        return GETRF ? rocsolver_sgetrf_npvt(handle, m, n, A, lda, info)
                     : rocsolver_sgetf2_npvt(handle, m, n, A, lda, info);
}

inline rocblas_status rocsolver_getf2_getrf_npvt(bool STRIDED, bool GETRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return GETRF ? rocsolver_dgetrf_npvt_strided_batched(handle, m, n, A, lda, stA, info, bc)
                     : rocsolver_dgetf2_npvt_strided_batched(handle, m, n, A, lda, stA, info, bc);
    else
        return GETRF ? rocsolver_dgetrf_npvt(handle, m, n, A, lda, info)
                     : rocsolver_dgetf2_npvt(handle, m, n, A, lda, info);
}

inline rocblas_status rocsolver_getf2_getrf_npvt(bool STRIDED, bool GETRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return GETRF ? rocsolver_cgetrf_npvt_strided_batched(handle, m, n, A, lda, stA, info, bc)
                     : rocsolver_cgetf2_npvt_strided_batched(handle, m, n, A, lda, stA, info, bc);
    else
        return GETRF ?
rocsolver_cgetrf_npvt(handle, m, n, A, lda, info) : rocsolver_cgetf2_npvt(handle, m, n, A, lda, info); } inline rocblas_status rocsolver_getf2_getrf_npvt(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { if(STRIDED) return GETRF ? rocsolver_zgetrf_npvt_strided_batched(handle, m, n, A, lda, stA, info, bc) : rocsolver_zgetf2_npvt_strided_batched(handle, m, n, A, lda, stA, info, bc); else return GETRF ? rocsolver_zgetrf_npvt(handle, m, n, A, lda, info) : rocsolver_zgetf2_npvt(handle, m, n, A, lda, info); } // batched inline rocblas_status rocsolver_getf2_getrf_npvt(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return GETRF ? rocsolver_sgetrf_npvt_batched(handle, m, n, A, lda, info, bc) : rocsolver_sgetf2_npvt_batched(handle, m, n, A, lda, info, bc); } inline rocblas_status rocsolver_getf2_getrf_npvt(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return GETRF ? rocsolver_dgetrf_npvt_batched(handle, m, n, A, lda, info, bc) : rocsolver_dgetf2_npvt_batched(handle, m, n, A, lda, info, bc); } inline rocblas_status rocsolver_getf2_getrf_npvt(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return GETRF ? 
rocsolver_cgetrf_npvt_batched(handle, m, n, A, lda, info, bc) : rocsolver_cgetf2_npvt_batched(handle, m, n, A, lda, info, bc); } inline rocblas_status rocsolver_getf2_getrf_npvt(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return GETRF ? rocsolver_zgetrf_npvt_batched(handle, m, n, A, lda, info, bc) : rocsolver_zgetf2_npvt_batched(handle, m, n, A, lda, info, bc); } /********************************************************/ /******************** GETF2_GETRF ********************/ // normal and strided_batched inline rocblas_status rocsolver_getf2_getrf(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { if(STRIDED) return GETRF ? rocsolver_sgetrf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_sgetf2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, info, bc); else return GETRF ? rocsolver_sgetrf(handle, m, n, A, lda, ipiv, info) : rocsolver_sgetf2(handle, m, n, A, lda, ipiv, info); } inline rocblas_status rocsolver_getf2_getrf(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { if(STRIDED) return GETRF ? rocsolver_dgetrf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_dgetf2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, info, bc); else return GETRF ? 
rocsolver_dgetrf(handle, m, n, A, lda, ipiv, info) : rocsolver_dgetf2(handle, m, n, A, lda, ipiv, info); } inline rocblas_status rocsolver_getf2_getrf(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { if(STRIDED) return GETRF ? rocsolver_cgetrf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_cgetf2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, info, bc); else return GETRF ? rocsolver_cgetrf(handle, m, n, A, lda, ipiv, info) : rocsolver_cgetf2(handle, m, n, A, lda, ipiv, info); } inline rocblas_status rocsolver_getf2_getrf(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { if(STRIDED) return GETRF ? rocsolver_zgetrf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_zgetf2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, info, bc); else return GETRF ? rocsolver_zgetrf(handle, m, n, A, lda, ipiv, info) : rocsolver_zgetf2(handle, m, n, A, lda, ipiv, info); } // batched inline rocblas_status rocsolver_getf2_getrf(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return GETRF ? rocsolver_sgetrf_batched(handle, m, n, A, lda, ipiv, stP, info, bc) : rocsolver_sgetf2_batched(handle, m, n, A, lda, ipiv, stP, info, bc); } inline rocblas_status rocsolver_getf2_getrf(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return GETRF ? 
rocsolver_dgetrf_batched(handle, m, n, A, lda, ipiv, stP, info, bc) : rocsolver_dgetf2_batched(handle, m, n, A, lda, ipiv, stP, info, bc); } inline rocblas_status rocsolver_getf2_getrf(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return GETRF ? rocsolver_cgetrf_batched(handle, m, n, A, lda, ipiv, stP, info, bc) : rocsolver_cgetf2_batched(handle, m, n, A, lda, ipiv, stP, info, bc); } inline rocblas_status rocsolver_getf2_getrf(bool STRIDED, bool GETRF, rocblas_handle handle, rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return GETRF ? rocsolver_zgetrf_batched(handle, m, n, A, lda, ipiv, stP, info, bc) : rocsolver_zgetf2_batched(handle, m, n, A, lda, ipiv, stP, info, bc); } /********************************************************/ /******************** GESVD ********************/ // normal and strided_batched inline rocblas_status rocsolver_gesvd(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float* S, rocblas_stride stS, float* U, rocblas_int ldu, rocblas_stride stU, float* V, rocblas_int ldv, rocblas_stride stV, float* E, rocblas_stride stE, rocblas_workmode fast_alg, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_sgesvd_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, S, stS, U, ldu, stU, V, ldv, stV, E, stE, fast_alg, info, bc) : rocsolver_sgesvd(handle, leftv, rightv, m, n, A, lda, S, U, ldu, V, ldv, E, fast_alg, info); } inline rocblas_status rocsolver_gesvd(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double* S, rocblas_stride stS, double* U, rocblas_int ldu, rocblas_stride stU, double* V, rocblas_int ldv, rocblas_stride stV, double* E, rocblas_stride stE, rocblas_workmode fast_alg, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_dgesvd_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, S, stS, U, ldu, stU, V, ldv, stV, E, stE, fast_alg, info, bc) : rocsolver_dgesvd(handle, leftv, rightv, m, n, A, lda, S, U, ldu, V, ldv, E, fast_alg, info); } inline rocblas_status rocsolver_gesvd(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, float* S, rocblas_stride stS, rocblas_float_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_float_complex* V, rocblas_int ldv, rocblas_stride stV, float* E, rocblas_stride stE, rocblas_workmode fast_alg, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_cgesvd_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, S, stS, U, ldu, stU, V, ldv, stV, E, stE, fast_alg, info, bc) : rocsolver_cgesvd(handle, leftv, rightv, m, n, A, lda, S, U, ldu, V, ldv, E, fast_alg, info); } inline rocblas_status rocsolver_gesvd(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, double* S, rocblas_stride stS, rocblas_double_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_double_complex* V, rocblas_int ldv, rocblas_stride stV, double* E, rocblas_stride stE, rocblas_workmode fast_alg, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_zgesvd_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, S, stS, U, ldu, stU, V, ldv, stV, E, stE, fast_alg, info, bc) : rocsolver_zgesvd(handle, leftv, rightv, m, n, A, lda, S, U, ldu, V, ldv, E, fast_alg, info); } // batched inline rocblas_status rocsolver_gesvd(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float* S, rocblas_stride stS, float* U, rocblas_int ldu, rocblas_stride stU, float* V, rocblas_int ldv, rocblas_stride stV, float* E, rocblas_stride stE, rocblas_workmode fast_alg, rocblas_int* info, rocblas_int bc) { return rocsolver_sgesvd_batched(handle, leftv, rightv, m, n, A, lda, S, stS, U, ldu, stU, V, ldv, stV, E, stE, fast_alg, info, bc); } inline rocblas_status rocsolver_gesvd(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double* S, rocblas_stride stS, double* U, rocblas_int ldu, rocblas_stride stU, double* V, rocblas_int ldv, rocblas_stride stV, double* E, rocblas_stride stE, rocblas_workmode fast_alg, rocblas_int* info, rocblas_int bc) { return rocsolver_dgesvd_batched(handle, leftv, rightv, 
m, n, A, lda, S, stS, U, ldu, stU, V, ldv, stV, E, stE, fast_alg, info, bc); } inline rocblas_status rocsolver_gesvd(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, float* S, rocblas_stride stS, rocblas_float_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_float_complex* V, rocblas_int ldv, rocblas_stride stV, float* E, rocblas_stride stE, rocblas_workmode fast_alg, rocblas_int* info, rocblas_int bc) { return rocsolver_cgesvd_batched(handle, leftv, rightv, m, n, A, lda, S, stS, U, ldu, stU, V, ldv, stV, E, stE, fast_alg, info, bc); } inline rocblas_status rocsolver_gesvd(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, double* S, rocblas_stride stS, rocblas_double_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_double_complex* V, rocblas_int ldv, rocblas_stride stV, double* E, rocblas_stride stE, rocblas_workmode fast_alg, rocblas_int* info, rocblas_int bc) { return rocsolver_zgesvd_batched(handle, leftv, rightv, m, n, A, lda, S, stS, U, ldu, stU, V, ldv, stV, E, stE, fast_alg, info, bc); } /********************************************************/ /******************** GESVDJ ********************/ // normal and strided_batched inline rocblas_status rocsolver_gesvdj(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* S, rocblas_stride stS, float* U, rocblas_int ldu, rocblas_stride stU, float* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_sgesvdj_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc) : rocsolver_sgesvdj(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, U, ldu, V, ldv, info); } inline rocblas_status rocsolver_gesvdj(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, rocblas_stride stS, double* U, rocblas_int ldu, rocblas_stride stU, double* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_dgesvdj_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc) : rocsolver_dgesvdj(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, U, ldu, V, ldv, info); } inline rocblas_status rocsolver_gesvdj(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* S, rocblas_stride stS, rocblas_float_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_float_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_cgesvdj_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc) : rocsolver_cgesvdj(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, U, ldu, V, ldv, info); } inline rocblas_status rocsolver_gesvdj(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, rocblas_stride stS, rocblas_double_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_double_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_zgesvdj_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc) : rocsolver_zgesvdj(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, U, ldu, V, ldv, info); } // batched inline rocblas_status rocsolver_gesvdj(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* S, rocblas_stride stS, float* U, rocblas_int ldu, rocblas_stride stU, float* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return rocsolver_sgesvdj_batched(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc); } inline rocblas_status rocsolver_gesvdj(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, rocblas_stride 
stS, double* U, rocblas_int ldu, rocblas_stride stU, double* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return rocsolver_dgesvdj_batched(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc); } inline rocblas_status rocsolver_gesvdj(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* S, rocblas_stride stS, rocblas_float_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_float_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return rocsolver_cgesvdj_batched(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc); } inline rocblas_status rocsolver_gesvdj(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, rocblas_stride stS, rocblas_double_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_double_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return rocsolver_zgesvdj_batched(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc); } /********************************************************/ /******************** GESVDJ_NOTRANSV ********************/ // normal and strided_batched inline rocblas_status rocsolver_gesvdj_notransv(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float abstol, float* residual, rocblas_int 
max_sweeps, rocblas_int* n_sweeps, float* S, rocblas_stride stS, float* U, rocblas_int ldu, rocblas_stride stU, float* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_sgesvdj_notransv_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc) : rocsolver_sgesvdj_notransv(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, U, ldu, V, ldv, info); } inline rocblas_status rocsolver_gesvdj_notransv(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, rocblas_stride stS, double* U, rocblas_int ldu, rocblas_stride stU, double* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_dgesvdj_notransv_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc) : rocsolver_dgesvdj_notransv(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, U, ldu, V, ldv, info); } inline rocblas_status rocsolver_gesvdj_notransv(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* S, rocblas_stride stS, rocblas_float_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_float_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_cgesvdj_notransv_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc) : rocsolver_cgesvdj_notransv(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, U, ldu, V, ldv, info); } inline rocblas_status rocsolver_gesvdj_notransv(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, rocblas_stride stS, rocblas_double_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_double_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_zgesvdj_notransv_strided_batched(handle, leftv, rightv, m, n, A, lda, stA, abstol, residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc) : rocsolver_zgesvdj_notransv(handle, leftv, rightv, m, n, A, lda, abstol, residual, max_sweeps, n_sweeps, S, U, ldu, V, ldv, info); } // batched inline rocblas_status rocsolver_gesvdj_notransv(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* S, rocblas_stride stS, float* U, rocblas_int ldu, rocblas_stride stU, float* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_sgesvdj_notransv_batched(handle, leftv, rightv, m, n, A, lda, abstol, // residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc); } inline rocblas_status rocsolver_gesvdj_notransv(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride 
stA, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, rocblas_stride stS, double* U, rocblas_int ldu, rocblas_stride stU, double* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_dgesvdj_notransv_batched(handle, leftv, rightv, m, n, A, lda, abstol, // residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc); } inline rocblas_status rocsolver_gesvdj_notransv(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* S, rocblas_stride stS, rocblas_float_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_float_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_cgesvdj_notransv_batched(handle, leftv, rightv, m, n, A, lda, abstol, // residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc); } inline rocblas_status rocsolver_gesvdj_notransv(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* S, rocblas_stride stS, rocblas_double_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_double_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_zgesvdj_notransv_batched(handle, leftv, rightv, m, n, A, lda, abstol, // residual, max_sweeps, n_sweeps, S, stS, U, ldu, stU, V, ldv, stV, info, bc); } /********************************************************/ /******************** GESVDX ********************/ // normal and 
strided_batched inline rocblas_status rocsolver_gesvdx(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float vl, float vu, rocblas_int il, rocblas_int iu, rocblas_int* ns, float* S, rocblas_stride stS, float* U, rocblas_int ldu, rocblas_stride stU, float* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_sgesvdx_strided_batched(handle, leftv, rightv, srange, m, n, A, lda, stA, vl, vu, il, iu, ns, S, stS, U, ldu, stU, V, ldv, stV, ifail, stF, info, bc) : rocsolver_sgesvdx(handle, leftv, rightv, srange, m, n, A, lda, vl, vu, il, iu, ns, S, U, ldu, V, ldv, ifail, info); } inline rocblas_status rocsolver_gesvdx(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double vl, double vu, rocblas_int il, rocblas_int iu, rocblas_int* ns, double* S, rocblas_stride stS, double* U, rocblas_int ldu, rocblas_stride stU, double* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_dgesvdx_strided_batched(handle, leftv, rightv, srange, m, n, A, lda, stA, vl, vu, il, iu, ns, S, stS, U, ldu, stU, V, ldv, stV, ifail, stF, info, bc) : rocsolver_dgesvdx(handle, leftv, rightv, srange, m, n, A, lda, vl, vu, il, iu, ns, S, U, ldu, V, ldv, ifail, info); } inline rocblas_status rocsolver_gesvdx(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, float vl, float vu, rocblas_int il, rocblas_int iu, rocblas_int* ns, float* S, rocblas_stride stS, rocblas_float_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_float_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_cgesvdx_strided_batched(handle, leftv, rightv, srange, m, n, A, lda, stA, vl, vu, il, iu, ns, S, stS, U, ldu, stU, V, ldv, stV, ifail, stF, info, bc) : rocsolver_cgesvdx(handle, leftv, rightv, srange, m, n, A, lda, vl, vu, il, iu, ns, S, U, ldu, V, ldv, ifail, info); } inline rocblas_status rocsolver_gesvdx(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, double vl, double vu, rocblas_int il, rocblas_int iu, rocblas_int* ns, double* S, rocblas_stride stS, rocblas_double_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_double_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_zgesvdx_strided_batched(handle, leftv, rightv, srange, m, n, A, lda, stA, vl, vu, il, iu, ns, S, stS, U, ldu, stU, V, ldv, stV, ifail, stF, info, bc) : rocsolver_zgesvdx(handle, leftv, rightv, srange, m, n, A, lda, vl, vu, il, iu, ns, S, U, ldu, V, ldv, ifail, info); } // batched inline rocblas_status rocsolver_gesvdx(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float vl, float vu, rocblas_int il, rocblas_int iu, rocblas_int* ns, float* S, rocblas_stride stS, float* U, rocblas_int ldu, rocblas_stride stU, float* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return rocsolver_sgesvdx_batched(handle, leftv, rightv, srange, m, n, A, lda, vl, vu, il, iu, ns, S, stS, U, ldu, stU, V, ldv, stV, ifail, stF, info, bc); } inline rocblas_status rocsolver_gesvdx(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double vl, double vu, rocblas_int il, rocblas_int iu, rocblas_int* ns, double* S, rocblas_stride stS, double* U, rocblas_int ldu, rocblas_stride stU, double* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return rocsolver_dgesvdx_batched(handle, leftv, rightv, srange, m, n, A, lda, vl, vu, il, iu, ns, S, stS, U, ldu, stU, V, ldv, stV, ifail, stF, info, bc); } inline rocblas_status rocsolver_gesvdx(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, float vl, float vu, rocblas_int il, rocblas_int iu, rocblas_int* ns, float* S, rocblas_stride stS, rocblas_float_complex* U, rocblas_int ldu, 
rocblas_stride stU, rocblas_float_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return rocsolver_cgesvdx_batched(handle, leftv, rightv, srange, m, n, A, lda, vl, vu, il, iu, ns, S, stS, U, ldu, stU, V, ldv, stV, ifail, stF, info, bc); } inline rocblas_status rocsolver_gesvdx(bool STRIDED, rocblas_handle handle, rocblas_svect leftv, rocblas_svect rightv, rocblas_srange srange, rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, double vl, double vu, rocblas_int il, rocblas_int iu, rocblas_int* ns, double* S, rocblas_stride stS, rocblas_double_complex* U, rocblas_int ldu, rocblas_stride stU, rocblas_double_complex* V, rocblas_int ldv, rocblas_stride stV, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return rocsolver_zgesvdx_batched(handle, leftv, rightv, srange, m, n, A, lda, vl, vu, il, iu, ns, S, stS, U, ldu, stU, V, ldv, stV, ifail, stF, info, bc); } /********************************************************/ /******************** GETRS ********************/ // normal and strided_batched inline rocblas_status rocsolver_getrs(bool STRIDED, rocblas_handle handle, rocblas_operation trans, rocblas_int n, rocblas_int nrhs, float* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, float* B, rocblas_int ldb, rocblas_stride stB, rocblas_int bc) { return STRIDED ? rocsolver_sgetrs_strided_batched(handle, trans, n, nrhs, A, lda, stA, ipiv, stP, B, ldb, stB, bc) : rocsolver_sgetrs(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } inline rocblas_status rocsolver_getrs(bool STRIDED, rocblas_handle handle, rocblas_operation trans, rocblas_int n, rocblas_int nrhs, double* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, double* B, rocblas_int ldb, rocblas_stride stB, rocblas_int bc) { return STRIDED ? 
rocsolver_dgetrs_strided_batched(handle, trans, n, nrhs, A, lda, stA, ipiv, stP, B, ldb, stB, bc)
                   : rocsolver_dgetrs(handle, trans, n, nrhs, A, lda, ipiv, B, ldb);
}

// GETRS, single-precision complex.
inline rocblas_status rocsolver_getrs(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int n, rocblas_int nrhs, rocblas_float_complex* A,
    rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP,
    rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_cgetrs_strided_batched(handle, trans, n, nrhs, A, lda, stA, ipiv, stP, B,
                                                ldb, stB, bc);

    return rocsolver_cgetrs(handle, trans, n, nrhs, A, lda, ipiv, B, ldb);
}

// GETRS, double-precision complex.
inline rocblas_status rocsolver_getrs(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int n, rocblas_int nrhs, rocblas_double_complex* A,
    rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP,
    rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_zgetrs_strided_batched(handle, trans, n, nrhs, A, lda, stA, ipiv, stP, B,
                                                ldb, stB, bc);

    return rocsolver_zgetrs(handle, trans, n, nrhs, A, lda, ipiv, B, ldb);
}

// batched
// Array-of-pointers overloads: always call the *_batched routine. STRIDED, stA and stB are
// accepted but not used.

// GETRS batched, single precision.
inline rocblas_status rocsolver_getrs(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int n, rocblas_int nrhs, float* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, float* const B[], rocblas_int ldb,
    rocblas_stride stB, rocblas_int bc)
{
    return rocsolver_sgetrs_batched(handle, trans, n, nrhs, A, lda, ipiv, stP, B, ldb, bc);
}

// GETRS batched, double precision.
inline rocblas_status rocsolver_getrs(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int n, rocblas_int nrhs, double* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, double* const B[], rocblas_int ldb,
    rocblas_stride stB, rocblas_int bc)
{
    return rocsolver_dgetrs_batched(handle, trans, n, nrhs, A, lda, ipiv, stP, B, ldb, bc);
}

inline rocblas_status
rocsolver_getrs(bool STRIDED, rocblas_handle handle, rocblas_operation trans, rocblas_int n, rocblas_int nrhs, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int bc) { return rocsolver_cgetrs_batched(handle, trans, n, nrhs, A, lda, ipiv, stP, B, ldb, bc); } inline rocblas_status rocsolver_getrs(bool STRIDED, rocblas_handle handle, rocblas_operation trans, rocblas_int n, rocblas_int nrhs, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int bc) { return rocsolver_zgetrs_batched(handle, trans, n, nrhs, A, lda, ipiv, stP, B, ldb, bc); } /********************************************************/ /******************** GESV ********************/ // normal and strided_batched inline rocblas_status rocsolver_gesv(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, float* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, float* B, rocblas_int ldb, rocblas_stride stB, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_sgesv_strided_batched(handle, n, nrhs, A, lda, stA, ipiv, stP, B, ldb, stB, info, bc) : rocsolver_sgesv(handle, n, nrhs, A, lda, ipiv, B, ldb, info); } inline rocblas_status rocsolver_gesv(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, double* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, double* B, rocblas_int ldb, rocblas_stride stB, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_dgesv_strided_batched(handle, n, nrhs, A, lda, stA, ipiv, stP, B, ldb, stB, info, bc)
                   : rocsolver_dgesv(handle, n, nrhs, A, lda, ipiv, B, ldb, info);
}

// GESV, single-precision complex.
inline rocblas_status rocsolver_gesv(bool STRIDED, rocblas_handle handle, rocblas_int n,
    rocblas_int nrhs, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_int* ipiv, rocblas_stride stP, rocblas_float_complex* B, rocblas_int ldb,
    rocblas_stride stB, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_cgesv_strided_batched(handle, n, nrhs, A, lda, stA, ipiv, stP, B, ldb,
                                               stB, info, bc);

    return rocsolver_cgesv(handle, n, nrhs, A, lda, ipiv, B, ldb, info);
}

// GESV, double-precision complex.
inline rocblas_status rocsolver_gesv(bool STRIDED, rocblas_handle handle, rocblas_int n,
    rocblas_int nrhs, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_int* ipiv, rocblas_stride stP, rocblas_double_complex* B, rocblas_int ldb,
    rocblas_stride stB, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_zgesv_strided_batched(handle, n, nrhs, A, lda, stA, ipiv, stP, B, ldb,
                                               stB, info, bc);

    return rocsolver_zgesv(handle, n, nrhs, A, lda, ipiv, B, ldb, info);
}

// batched
// Array-of-pointers overloads: always call the *_batched routine. STRIDED, stA and stB are
// accepted but not used.

// GESV batched, single precision.
inline rocblas_status rocsolver_gesv(bool STRIDED, rocblas_handle handle, rocblas_int n,
    rocblas_int nrhs, float* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv,
    rocblas_stride stP, float* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int* info,
    rocblas_int bc)
{
    return rocsolver_sgesv_batched(handle, n, nrhs, A, lda, ipiv, stP, B, ldb, info, bc);
}

// GESV batched, double precision.
inline rocblas_status rocsolver_gesv(bool STRIDED, rocblas_handle handle, rocblas_int n,
    rocblas_int nrhs, double* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv,
    rocblas_stride stP, double* const B[], rocblas_int ldb, rocblas_stride stB,
    rocblas_int* info, rocblas_int bc)
{
    return rocsolver_dgesv_batched(handle, n, nrhs, A, lda, ipiv, stP, B, ldb, info, bc);
}

inline rocblas_status rocsolver_gesv(bool STRIDED, rocblas_handle handle,
rocblas_int n, rocblas_int nrhs, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int* info, rocblas_int bc) { return rocsolver_cgesv_batched(handle, n, nrhs, A, lda, ipiv, stP, B, ldb, info, bc); } inline rocblas_status rocsolver_gesv(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int* info, rocblas_int bc) { return rocsolver_zgesv_batched(handle, n, nrhs, A, lda, ipiv, stP, B, ldb, info, bc); } /********************************************************/ /******************** GESV_OUTOFPLACE ********************/ // normal and strided_batched inline rocblas_status rocsolver_gesv_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, float* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, float* B, rocblas_int ldb, rocblas_stride stB, float* X, rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocblas_status_not_implemented // rocsolver_sgesv_outofplace_strided_batched(handle, n, nrhs, A, lda, stA, ipiv, stP, B, ldb, stB, X, ldx, stX, info, bc) : rocsolver_sgesv_outofplace(handle, n, nrhs, A, lda, ipiv, B, ldb, X, ldx, info); } inline rocblas_status rocsolver_gesv_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, double* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, double* B, rocblas_int ldb, rocblas_stride stB, double* X, rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocblas_status_not_implemented // rocsolver_dgesv_outofplace_strided_batched(handle, n, nrhs, A, lda, stA, ipiv, stP, B, ldb, stB, X, ldx, stX, info, bc) : rocsolver_dgesv_outofplace(handle, n, nrhs, A, lda, ipiv, B, ldb, X, ldx, info); } inline rocblas_status rocsolver_gesv_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB, rocblas_float_complex* X, rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocblas_status_not_implemented // rocsolver_cgesv_outofplace_strided_batched(handle, n, nrhs, A, lda, stA, ipiv, stP, B, ldb, stB, X, ldx, stX, info, bc) : rocsolver_cgesv_outofplace(handle, n, nrhs, A, lda, ipiv, B, ldb, X, ldx, info); } inline rocblas_status rocsolver_gesv_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB, rocblas_double_complex* X, rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocblas_status_not_implemented // rocsolver_zgesv_outofplace_strided_batched(handle, n, nrhs, A, lda, stA, ipiv, stP, B, ldb, stB, X, ldx, stX, info, bc) : rocsolver_zgesv_outofplace(handle, n, nrhs, A, lda, ipiv, B, ldb, X, ldx, info); } // batched inline rocblas_status rocsolver_gesv_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, float* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, float* const B[], rocblas_int ldb, rocblas_stride stB, float* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_sgesv_outofplace_batched(handle, n, nrhs, A, lda, ipiv, stP, B, ldb, info, bc); } inline rocblas_status rocsolver_gesv_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, double* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, double* const B[], rocblas_int ldb, rocblas_stride stB, double* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_dgesv_outofplace_batched(handle, n, nrhs, A, lda, ipiv, stP, B, ldb, info, bc); } inline rocblas_status rocsolver_gesv_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_float_complex* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_cgesv_outofplace_batched(handle, n, nrhs, A, lda, ipiv, stP, B, ldb, info, bc); } inline rocblas_status rocsolver_gesv_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_int nrhs, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, 
rocblas_stride stP, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_double_complex* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_zgesv_outofplace_batched(handle, n, nrhs, A, lda, ipiv, stP, B, ldb, info, bc); } /********************************************************/ /******************** GETRI_OUTOFPLACE ********************/ // normal and strided_batched inline rocblas_status rocsolver_getri_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, float* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_sgetri_outofplace_strided_batched(handle, n, A, lda, stA, ipiv, stP, C, ldc, stC, info, bc) : rocsolver_sgetri_outofplace(handle, n, A, lda, ipiv, C, ldc, info); } inline rocblas_status rocsolver_getri_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, double* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_dgetri_outofplace_strided_batched(handle, n, A, lda, stA, ipiv, stP, C, ldc, stC, info, bc) : rocsolver_dgetri_outofplace(handle, n, A, lda, ipiv, C, ldc, info); } inline rocblas_status rocsolver_getri_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_float_complex* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_cgetri_outofplace_strided_batched(handle, n, A, lda, stA, ipiv, stP, C, ldc, stC, info, bc)
                   : rocsolver_cgetri_outofplace(handle, n, A, lda, ipiv, C, ldc, info);
}

// GETRI_OUTOFPLACE, double-precision complex.
inline rocblas_status rocsolver_getri_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_int* ipiv, rocblas_stride stP, rocblas_double_complex* C, rocblas_int ldc,
    rocblas_stride stC, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_zgetri_outofplace_strided_batched(handle, n, A, lda, stA, ipiv, stP, C,
                                                           ldc, stC, info, bc);

    return rocsolver_zgetri_outofplace(handle, n, A, lda, ipiv, C, ldc, info);
}

// batched
// Array-of-pointers overloads: always call the *_batched routine. STRIDED, stA and stC are
// accepted but not used.

// GETRI_OUTOFPLACE batched, single precision.
inline rocblas_status rocsolver_getri_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv,
    rocblas_stride stP, float* const C[], rocblas_int ldc, rocblas_stride stC, rocblas_int* info,
    rocblas_int bc)
{
    return rocsolver_sgetri_outofplace_batched(handle, n, A, lda, ipiv, stP, C, ldc, info, bc);
}

// GETRI_OUTOFPLACE batched, double precision.
inline rocblas_status rocsolver_getri_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv,
    rocblas_stride stP, double* const C[], rocblas_int ldc, rocblas_stride stC,
    rocblas_int* info, rocblas_int bc)
{
    return rocsolver_dgetri_outofplace_batched(handle, n, A, lda, ipiv, stP, C, ldc, info, bc);
}

// GETRI_OUTOFPLACE batched, single-precision complex.
inline rocblas_status rocsolver_getri_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA,
    rocblas_int* ipiv, rocblas_stride stP, rocblas_float_complex* const C[], rocblas_int ldc,
    rocblas_stride stC, rocblas_int* info, rocblas_int bc)
{
    return rocsolver_cgetri_outofplace_batched(handle, n, A, lda, ipiv, stP, C, ldc, info, bc);
}

inline rocblas_status rocsolver_getri_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_int n, rocblas_double_complex* const A[],
rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_double_complex* const C[], rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return rocsolver_zgetri_outofplace_batched(handle, n, A, lda, ipiv, stP, C, ldc, info, bc); } /********************************************************/ /******************** GETRI_NPVT_OUTOFPLACE ********************/ // normal and strided_batched inline rocblas_status rocsolver_getri_npvt_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_sgetri_npvt_outofplace_strided_batched(handle, n, A, lda, stA, C, ldc, stC, info, bc) : rocsolver_sgetri_npvt_outofplace(handle, n, A, lda, C, ldc, info); } inline rocblas_status rocsolver_getri_npvt_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_dgetri_npvt_outofplace_strided_batched(handle, n, A, lda, stA, C, ldc, stC, info, bc) : rocsolver_dgetri_npvt_outofplace(handle, n, A, lda, C, ldc, info); } inline rocblas_status rocsolver_getri_npvt_outofplace(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_float_complex* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_cgetri_npvt_outofplace_strided_batched(handle, n, A, lda, stA, C, ldc, stC, info, bc)
                   : rocsolver_cgetri_npvt_outofplace(handle, n, A, lda, C, ldc, info);
}

// GETRI_NPVT_OUTOFPLACE, double-precision complex.
inline rocblas_status rocsolver_getri_npvt_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_double_complex* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info,
    rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_zgetri_npvt_outofplace_strided_batched(handle, n, A, lda, stA, C, ldc,
                                                                stC, info, bc);

    return rocsolver_zgetri_npvt_outofplace(handle, n, A, lda, C, ldc, info);
}

// batched
// Array-of-pointers overloads: always call the *_batched routine. STRIDED, stA and stC are
// accepted but not used.

// GETRI_NPVT_OUTOFPLACE batched, single precision.
inline rocblas_status rocsolver_getri_npvt_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float* const C[],
    rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc)
{
    return rocsolver_sgetri_npvt_outofplace_batched(handle, n, A, lda, C, ldc, info, bc);
}

// GETRI_NPVT_OUTOFPLACE batched, double precision.
inline rocblas_status rocsolver_getri_npvt_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double* const C[],
    rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc)
{
    return rocsolver_dgetri_npvt_outofplace_batched(handle, n, A, lda, C, ldc, info, bc);
}

// GETRI_NPVT_OUTOFPLACE batched, single-precision complex.
inline rocblas_status rocsolver_getri_npvt_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA,
    rocblas_float_complex* const C[], rocblas_int ldc, rocblas_stride stC, rocblas_int* info,
    rocblas_int bc)
{
    return rocsolver_cgetri_npvt_outofplace_batched(handle, n, A, lda, C, ldc, info, bc);
}

inline rocblas_status rocsolver_getri_npvt_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA,
    rocblas_double_complex* const C[], rocblas_int ldc, rocblas_stride stC, rocblas_int* info,
    rocblas_int bc)
{
    return
rocsolver_zgetri_npvt_outofplace_batched(handle, n, A, lda, C, ldc, info, bc); } /********************************************************/ /******************** GETRI ********************/ // normal and strided_batched inline rocblas_status rocsolver_getri(bool STRIDED, rocblas_handle handle, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_sgetri_strided_batched(handle, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_sgetri(handle, n, A, lda, ipiv, info); } inline rocblas_status rocsolver_getri(bool STRIDED, rocblas_handle handle, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_dgetri_strided_batched(handle, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_dgetri(handle, n, A, lda, ipiv, info); } inline rocblas_status rocsolver_getri(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_cgetri_strided_batched(handle, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_cgetri(handle, n, A, lda, ipiv, info); } inline rocblas_status rocsolver_getri(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_zgetri_strided_batched(handle, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_zgetri(handle, n, A, lda, ipiv, info); } // batched inline rocblas_status rocsolver_getri(bool STRIDED, rocblas_handle handle, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return rocsolver_sgetri_batched(handle, n, A, lda, ipiv, stP, info, bc); } inline rocblas_status rocsolver_getri(bool STRIDED, rocblas_handle handle, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return rocsolver_dgetri_batched(handle, n, A, lda, ipiv, stP, info, bc); } inline rocblas_status rocsolver_getri(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return rocsolver_cgetri_batched(handle, n, A, lda, ipiv, stP, info, bc); } inline rocblas_status rocsolver_getri(bool STRIDED, rocblas_handle handle, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return rocsolver_zgetri_batched(handle, n, A, lda, ipiv, stP, info, bc); } /********************************************************/ /******************** GETRI_NPVT ********************/ // normal and strided_batched inline rocblas_status rocsolver_getri_npvt(bool STRIDED, rocblas_handle handle, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_sgetri_npvt_strided_batched(handle, n, A, lda, stA, info, bc)
                   : rocsolver_sgetri_npvt(handle, n, A, lda, info);
}

// GETRI_NPVT, double precision: STRIDED selects the strided_batched routine, otherwise the
// single-matrix routine is called without the stride / batch-count arguments.
inline rocblas_status rocsolver_getri_npvt(bool STRIDED, rocblas_handle handle, rocblas_int n,
    double* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_dgetri_npvt_strided_batched(handle, n, A, lda, stA, info, bc);

    return rocsolver_dgetri_npvt(handle, n, A, lda, info);
}

// GETRI_NPVT, single-precision complex.
inline rocblas_status rocsolver_getri_npvt(bool STRIDED, rocblas_handle handle, rocblas_int n,
    rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info,
    rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_cgetri_npvt_strided_batched(handle, n, A, lda, stA, info, bc);

    return rocsolver_cgetri_npvt(handle, n, A, lda, info);
}

// GETRI_NPVT, double-precision complex.
inline rocblas_status rocsolver_getri_npvt(bool STRIDED, rocblas_handle handle, rocblas_int n,
    rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info,
    rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_zgetri_npvt_strided_batched(handle, n, A, lda, stA, info, bc);

    return rocsolver_zgetri_npvt(handle, n, A, lda, info);
}

// batched
// Array-of-pointers overloads: always call the *_batched routine. STRIDED and stA are
// accepted but not used.

// GETRI_NPVT batched, single precision.
inline rocblas_status rocsolver_getri_npvt(bool STRIDED, rocblas_handle handle, rocblas_int n,
    float* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc)
{
    return rocsolver_sgetri_npvt_batched(handle, n, A, lda, info, bc);
}

// GETRI_NPVT batched, double precision.
inline rocblas_status rocsolver_getri_npvt(bool STRIDED, rocblas_handle handle, rocblas_int n,
    double* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc)
{
    return rocsolver_dgetri_npvt_batched(handle, n, A, lda, info, bc);
}

// GETRI_NPVT batched, single-precision complex.
inline rocblas_status rocsolver_getri_npvt(bool STRIDED, rocblas_handle handle, rocblas_int n,
    rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info,
    rocblas_int bc)
{
    return rocsolver_cgetri_npvt_batched(handle, n, A, lda, info, bc);
}

inline rocblas_status rocsolver_getri_npvt(bool STRIDED, rocblas_handle handle,
rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return rocsolver_zgetri_npvt_batched(handle, n, A, lda, info, bc); } /********************************************************/ /******************** TRTRI ********************/ // normal and strided_batched inline rocblas_status rocsolver_trtri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_strtri_strided_batched(handle, uplo, diag, n, A, lda, stA, info, bc) : rocsolver_strtri(handle, uplo, diag, n, A, lda, info); } inline rocblas_status rocsolver_trtri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_dtrtri_strided_batched(handle, uplo, diag, n, A, lda, stA, info, bc) : rocsolver_dtrtri(handle, uplo, diag, n, A, lda, info); } inline rocblas_status rocsolver_trtri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_ctrtri_strided_batched(handle, uplo, diag, n, A, lda, stA, info, bc) : rocsolver_ctrtri(handle, uplo, diag, n, A, lda, info); } inline rocblas_status rocsolver_trtri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_ztrtri_strided_batched(handle, uplo, diag, n, A, lda, stA, info, bc) : rocsolver_ztrtri(handle, uplo, diag, n, A, lda, info); } // batched inline rocblas_status rocsolver_trtri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return rocsolver_strtri_batched(handle, uplo, diag, n, A, lda, info, bc); } inline rocblas_status rocsolver_trtri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return rocsolver_dtrtri_batched(handle, uplo, diag, n, A, lda, info, bc); } inline rocblas_status rocsolver_trtri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return rocsolver_ctrtri_batched(handle, uplo, diag, n, A, lda, info, bc); } inline rocblas_status rocsolver_trtri(bool STRIDED, rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* info, rocblas_int bc) { return rocsolver_ztrtri_batched(handle, uplo, diag, n, A, lda, info, bc); } /********************************************************/ /******************** GEQR2_GEQRF ********************/ // normal and strided_batched inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle, rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float* ipiv, rocblas_stride stP, rocblas_int bc) { if(STRIDED) return GEQRF ? rocsolver_sgeqrf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc) : rocsolver_sgeqr2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc); else return GEQRF ? 
rocsolver_sgeqrf(handle, m, n, A, lda, ipiv)
                     : rocsolver_sgeqr2(handle, m, n, A, lda, ipiv);
}

// GEQR2 / GEQRF, double precision.
// GEQRF chooses between the geqrf and geqr2 routines; STRIDED chooses between the
// strided_batched and single-matrix entry points of whichever routine was selected.
inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double* ipiv,
    rocblas_stride stP, rocblas_int bc)
{
    if(STRIDED)
    {
        if(GEQRF)
            return rocsolver_dgeqrf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
        return rocsolver_dgeqr2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    }

    if(GEQRF)
        return rocsolver_dgeqrf(handle, m, n, A, lda, ipiv);
    return rocsolver_dgeqr2(handle, m, n, A, lda, ipiv);
}

// GEQR2 / GEQRF, single-precision complex.
inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_float_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(STRIDED)
    {
        if(GEQRF)
            return rocsolver_cgeqrf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
        return rocsolver_cgeqr2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    }

    if(GEQRF)
        return rocsolver_cgeqrf(handle, m, n, A, lda, ipiv);
    return rocsolver_cgeqr2(handle, m, n, A, lda, ipiv);
}

// GEQR2 / GEQRF, double-precision complex.
inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_double_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(STRIDED)
    {
        if(GEQRF)
            return rocsolver_zgeqrf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
        return rocsolver_zgeqr2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    }

    if(GEQRF)
        return rocsolver_zgeqrf(handle, m, n, A, lda, ipiv);
    return rocsolver_zgeqr2(handle, m, n, A, lda, ipiv);
}

// batched
inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA,
    float* ipiv, rocblas_stride stP, rocblas_int bc)
{
    return GEQRF ?
rocsolver_sgeqrf_batched(handle, m, n, A, lda, ipiv, stP, bc)
                 : rocsolver_sgeqr2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

// GEQR2 / GEQRF batched, double precision.
// GEQRF chooses between the geqrf_batched and geqr2_batched routines; STRIDED and stA are
// accepted but not used.
inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA,
    double* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GEQRF)
        return rocsolver_dgeqrf_batched(handle, m, n, A, lda, ipiv, stP, bc);

    return rocsolver_dgeqr2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

// GEQR2 / GEQRF batched, single-precision complex.
inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GEQRF)
        return rocsolver_cgeqrf_batched(handle, m, n, A, lda, ipiv, stP, bc);

    return rocsolver_cgeqr2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

// GEQR2 / GEQRF batched, double-precision complex.
inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GEQRF)
        return rocsolver_zgeqrf_batched(handle, m, n, A, lda, ipiv, stP, bc);

    return rocsolver_zgeqr2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

// ptr_batched
// Both A and ipiv are arrays of pointers. Only GEQRF has a ptr_batched call here; when GEQRF
// is false the wrapper returns rocblas_status_not_implemented. STRIDED, stA and stP are not used.

// GEQRF ptr_batched, single precision.
inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA,
    float* const ipiv[], rocblas_stride stP, rocblas_int bc)
{
    if(!GEQRF)
        return rocblas_status_not_implemented;

    return rocsolver_sgeqrf_ptr_batched(handle, m, n, A, lda, ipiv, bc);
}

inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA,
    double* const ipiv[], rocblas_stride stP, rocblas_int bc)
{
    return GEQRF ?
rocsolver_dgeqrf_ptr_batched(handle, m, n, A, lda, ipiv, bc)
                 : rocblas_status_not_implemented;
}

// rocblas_float_complex ptr_batched overload: geqr2 requests report not_implemented.
inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* const ipiv[], rocblas_stride stP, rocblas_int bc)
{
    if(!GEQRF)
        return rocblas_status_not_implemented;
    return rocsolver_cgeqrf_ptr_batched(handle, m, n, A, lda, ipiv, bc);
}

// rocblas_double_complex ptr_batched overload: geqr2 requests report not_implemented.
inline rocblas_status rocsolver_geqr2_geqrf(bool STRIDED, bool GEQRF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* const ipiv[], rocblas_stride stP, rocblas_int bc)
{
    if(!GEQRF)
        return rocblas_status_not_implemented;
    return rocsolver_zgeqrf_ptr_batched(handle, m, n, A, lda, ipiv, bc);
}

/********************************************************/

/******************** GERQ2_GERQF ********************/
// normal and strided_batched
// float overload: GERQF selects gerqf over gerq2, STRIDED selects the strided_batched entry point.
inline rocblas_status rocsolver_gerq2_gerqf(bool STRIDED, bool GERQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA,
    float* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GERQF)
            return rocsolver_sgerqf(handle, m, n, A, lda, ipiv);
        return rocsolver_sgerq2(handle, m, n, A, lda, ipiv);
    }
    if(GERQF)
        return rocsolver_sgerqf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    return rocsolver_sgerq2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
}

inline rocblas_status rocsolver_gerq2_gerqf(bool STRIDED, bool GERQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA,
    double* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(STRIDED)
        return GERQF
            ? rocsolver_dgerqf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc)
            : rocsolver_dgerq2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    else
        return GERQF ?
rocsolver_dgerqf(handle, m, n, A, lda, ipiv) : rocsolver_dgerq2(handle, m, n, A, lda, ipiv);
}

// rocblas_float_complex overload: GERQF selects gerqf over gerq2, STRIDED the strided_batched API.
inline rocblas_status rocsolver_gerq2_gerqf(bool STRIDED, bool GERQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GERQF)
            return rocsolver_cgerqf(handle, m, n, A, lda, ipiv);
        return rocsolver_cgerq2(handle, m, n, A, lda, ipiv);
    }
    if(GERQF)
        return rocsolver_cgerqf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    return rocsolver_cgerq2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
}

// rocblas_double_complex overload: same dispatch through the z-prefixed entry points.
inline rocblas_status rocsolver_gerq2_gerqf(bool STRIDED, bool GERQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GERQF)
            return rocsolver_zgerqf(handle, m, n, A, lda, ipiv);
        return rocsolver_zgerq2(handle, m, n, A, lda, ipiv);
    }
    if(GERQF)
        return rocsolver_zgerqf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    return rocsolver_zgerq2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
}

// batched
// float batched overload: STRIDED and stA are accepted but not used.
inline rocblas_status rocsolver_gerq2_gerqf(bool STRIDED, bool GERQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA,
    float* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GERQF)
        return rocsolver_sgerqf_batched(handle, m, n, A, lda, ipiv, stP, bc);
    return rocsolver_sgerq2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

inline rocblas_status rocsolver_gerq2_gerqf(bool STRIDED, bool GERQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA,
    double* ipiv, rocblas_stride stP, rocblas_int bc)
{
    return GERQF ?
rocsolver_dgerqf_batched(handle, m, n, A, lda, ipiv, stP, bc)
                 : rocsolver_dgerq2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

// rocblas_float_complex batched overload.
inline rocblas_status rocsolver_gerq2_gerqf(bool STRIDED, bool GERQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GERQF)
        return rocsolver_cgerqf_batched(handle, m, n, A, lda, ipiv, stP, bc);
    return rocsolver_cgerq2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

// rocblas_double_complex batched overload.
inline rocblas_status rocsolver_gerq2_gerqf(bool STRIDED, bool GERQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GERQF)
        return rocsolver_zgerqf_batched(handle, m, n, A, lda, ipiv, stP, bc);
    return rocsolver_zgerq2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

/********************************************************/

/******************** GEQL2_GEQLF ********************/
// normal and strided_batched
// float overload: GEQLF selects geqlf over geql2, STRIDED selects the strided_batched entry point.
inline rocblas_status rocsolver_geql2_geqlf(bool STRIDED, bool GEQLF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA,
    float* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GEQLF)
            return rocsolver_sgeqlf(handle, m, n, A, lda, ipiv);
        return rocsolver_sgeql2(handle, m, n, A, lda, ipiv);
    }
    if(GEQLF)
        return rocsolver_sgeqlf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    return rocsolver_sgeql2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
}

inline rocblas_status rocsolver_geql2_geqlf(bool STRIDED, bool GEQLF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA,
    double* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(STRIDED)
        return GEQLF ?
rocsolver_dgeqlf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc)
                     : rocsolver_dgeql2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    else
        return GEQLF ? rocsolver_dgeqlf(handle, m, n, A, lda, ipiv)
                     : rocsolver_dgeql2(handle, m, n, A, lda, ipiv);
}

// rocblas_float_complex overload: GEQLF selects geqlf over geql2, STRIDED the strided_batched API.
inline rocblas_status rocsolver_geql2_geqlf(bool STRIDED, bool GEQLF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GEQLF)
            return rocsolver_cgeqlf(handle, m, n, A, lda, ipiv);
        return rocsolver_cgeql2(handle, m, n, A, lda, ipiv);
    }
    if(GEQLF)
        return rocsolver_cgeqlf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    return rocsolver_cgeql2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
}

// rocblas_double_complex overload: same dispatch through the z-prefixed entry points.
inline rocblas_status rocsolver_geql2_geqlf(bool STRIDED, bool GEQLF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GEQLF)
            return rocsolver_zgeqlf(handle, m, n, A, lda, ipiv);
        return rocsolver_zgeql2(handle, m, n, A, lda, ipiv);
    }
    if(GEQLF)
        return rocsolver_zgeqlf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    return rocsolver_zgeql2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
}

// batched
// float batched overload: STRIDED and stA are accepted but not used.
inline rocblas_status rocsolver_geql2_geqlf(bool STRIDED, bool GEQLF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA,
    float* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GEQLF)
        return rocsolver_sgeqlf_batched(handle, m, n, A, lda, ipiv, stP, bc);
    return rocsolver_sgeql2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

inline rocblas_status rocsolver_geql2_geqlf(bool STRIDED, bool GEQLF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA,
    double* ipiv, rocblas_stride stP, rocblas_int bc)
{
    return GEQLF ?
rocsolver_dgeqlf_batched(handle, m, n, A, lda, ipiv, stP, bc)
                 : rocsolver_dgeql2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

// rocblas_float_complex batched overload.
inline rocblas_status rocsolver_geql2_geqlf(bool STRIDED, bool GEQLF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GEQLF)
        return rocsolver_cgeqlf_batched(handle, m, n, A, lda, ipiv, stP, bc);
    return rocsolver_cgeql2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

// rocblas_double_complex batched overload.
inline rocblas_status rocsolver_geql2_geqlf(bool STRIDED, bool GEQLF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GEQLF)
        return rocsolver_zgeqlf_batched(handle, m, n, A, lda, ipiv, stP, bc);
    return rocsolver_zgeql2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

/********************************************************/

/******************** GELQ2_GELQF ********************/
// normal and strided_batched
// float overload: GELQF selects gelqf over gelq2, STRIDED selects the strided_batched entry point.
inline rocblas_status rocsolver_gelq2_gelqf(bool STRIDED, bool GELQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA,
    float* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GELQF)
            return rocsolver_sgelqf(handle, m, n, A, lda, ipiv);
        return rocsolver_sgelq2(handle, m, n, A, lda, ipiv);
    }
    if(GELQF)
        return rocsolver_sgelqf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    return rocsolver_sgelq2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
}

inline rocblas_status rocsolver_gelq2_gelqf(bool STRIDED, bool GELQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA,
    double* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(STRIDED)
        return GELQF ?
rocsolver_dgelqf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc)
                     : rocsolver_dgelq2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    else
        return GELQF ? rocsolver_dgelqf(handle, m, n, A, lda, ipiv)
                     : rocsolver_dgelq2(handle, m, n, A, lda, ipiv);
}

// rocblas_float_complex overload: GELQF selects gelqf over gelq2, STRIDED the strided_batched API.
inline rocblas_status rocsolver_gelq2_gelqf(bool STRIDED, bool GELQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GELQF)
            return rocsolver_cgelqf(handle, m, n, A, lda, ipiv);
        return rocsolver_cgelq2(handle, m, n, A, lda, ipiv);
    }
    if(GELQF)
        return rocsolver_cgelqf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    return rocsolver_cgelq2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
}

// rocblas_double_complex overload: same dispatch through the z-prefixed entry points.
inline rocblas_status rocsolver_gelq2_gelqf(bool STRIDED, bool GELQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GELQF)
            return rocsolver_zgelqf(handle, m, n, A, lda, ipiv);
        return rocsolver_zgelq2(handle, m, n, A, lda, ipiv);
    }
    if(GELQF)
        return rocsolver_zgelqf_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
    return rocsolver_zgelq2_strided_batched(handle, m, n, A, lda, stA, ipiv, stP, bc);
}

// batched
// float batched overload: STRIDED and stA are accepted but not used.
inline rocblas_status rocsolver_gelq2_gelqf(bool STRIDED, bool GELQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA,
    float* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GELQF)
        return rocsolver_sgelqf_batched(handle, m, n, A, lda, ipiv, stP, bc);
    return rocsolver_sgelq2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

inline rocblas_status rocsolver_gelq2_gelqf(bool STRIDED, bool GELQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA,
    double* ipiv, rocblas_stride stP, rocblas_int bc)
{
    return GELQF ?
rocsolver_dgelqf_batched(handle, m, n, A, lda, ipiv, stP, bc)
                 : rocsolver_dgelq2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

// rocblas_float_complex batched overload.
inline rocblas_status rocsolver_gelq2_gelqf(bool STRIDED, bool GELQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GELQF)
        return rocsolver_cgelqf_batched(handle, m, n, A, lda, ipiv, stP, bc);
    return rocsolver_cgelq2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

// rocblas_double_complex batched overload.
inline rocblas_status rocsolver_gelq2_gelqf(bool STRIDED, bool GELQF, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* ipiv, rocblas_stride stP, rocblas_int bc)
{
    if(GELQF)
        return rocsolver_zgelqf_batched(handle, m, n, A, lda, ipiv, stP, bc);
    return rocsolver_zgelq2_batched(handle, m, n, A, lda, ipiv, stP, bc);
}

/********************************************************/

/******************** GELS ********************/
// normal and strided_batched
// float overload: STRIDED selects rocsolver_sgels_strided_batched, otherwise rocsolver_sgels.
inline rocblas_status rocsolver_gels(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs, float* A,
    rocblas_int lda, rocblas_stride stA, float* B, rocblas_int ldb, rocblas_stride stB,
    rocblas_int* info, rocblas_int bc)
{
    return STRIDED ? rocsolver_sgels_strided_batched(handle, trans, m, n, nrhs, A, lda, stA, B,
                                                     ldb, stB, info, bc)
                   : rocsolver_sgels(handle, trans, m, n, nrhs, A, lda, B, ldb, info);
}

inline rocblas_status rocsolver_gels(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs, double* A,
    rocblas_int lda, rocblas_stride stA, double* B, rocblas_int ldb, rocblas_stride stB,
    rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_dgels_strided_batched(handle, trans, m, n, nrhs, A, lda, stA, B, ldb,
                                               stB, info, bc);
    else
        return rocsolver_dgels(handle, trans, m, n,
nrhs, A, lda, B, ldb, info);
}

// rocblas_float_complex overload: STRIDED selects the strided_batched entry point.
inline rocblas_status rocsolver_gels(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs,
    rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_float_complex* B,
    rocblas_int ldb, rocblas_stride stB, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ? rocsolver_cgels_strided_batched(handle, trans, m, n, nrhs, A, lda, stA, B,
                                                     ldb, stB, info, bc)
                   : rocsolver_cgels(handle, trans, m, n, nrhs, A, lda, B, ldb, info);
}

// rocblas_double_complex overload: STRIDED selects the strided_batched entry point.
inline rocblas_status rocsolver_gels(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs,
    rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_double_complex* B,
    rocblas_int ldb, rocblas_stride stB, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ? rocsolver_zgels_strided_batched(handle, trans, m, n, nrhs, A, lda, stA, B,
                                                     ldb, stB, info, bc)
                   : rocsolver_zgels(handle, trans, m, n, nrhs, A, lda, B, ldb, info);
}

// batched
// float batched overload: forwards to rocsolver_sgels_batched; STRIDED, stA and stB are unused.
inline rocblas_status rocsolver_gels(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs, float* const A[],
    rocblas_int lda, rocblas_stride stA, float* const B[], rocblas_int ldb, rocblas_stride stB,
    rocblas_int* info, rocblas_int bc)
{
    const rocblas_status status
        = rocsolver_sgels_batched(handle, trans, m, n, nrhs, A, lda, B, ldb, info, bc);
    return status;
}

// double batched overload: forwards to rocsolver_dgels_batched; STRIDED, stA and stB are unused.
inline rocblas_status rocsolver_gels(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs, double* const A[],
    rocblas_int lda, rocblas_stride stA, double* const B[], rocblas_int ldb, rocblas_stride stB,
    rocblas_int* info, rocblas_int bc)
{
    const rocblas_status status
        = rocsolver_dgels_batched(handle, trans, m, n, nrhs, A, lda, B, ldb, info, bc);
    return status;
}

inline rocblas_status rocsolver_gels(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs,
    rocblas_float_complex* const A[],
rocblas_int lda, rocblas_stride stA, rocblas_float_complex* const B[], rocblas_int ldb,
    rocblas_stride stB, rocblas_int* info, rocblas_int bc)
{
    return rocsolver_cgels_batched(handle, trans, m, n, nrhs, A, lda, B, ldb, info, bc);
}

// rocblas_double_complex batched overload: forwards to rocsolver_zgels_batched.
inline rocblas_status rocsolver_gels(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs,
    rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA,
    rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int* info,
    rocblas_int bc)
{
    const rocblas_status status
        = rocsolver_zgels_batched(handle, trans, m, n, nrhs, A, lda, B, ldb, info, bc);
    return status;
}

/********************************************************/

/******************** GELS_OUTOFPLACE ********************/
// normal and strided_batched
// float overload: only the non-strided call is wired up. STRIDED requests report
// rocblas_status_not_implemented (the rocsolver_sgels_outofplace_strided_batched call is disabled).
inline rocblas_status rocsolver_gels_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs, float* A,
    rocblas_int lda, rocblas_stride stA, float* B, rocblas_int ldb, rocblas_stride stB,
    float* X, rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc)
{
    return STRIDED
        ? rocblas_status_not_implemented
        : rocsolver_sgels_outofplace(handle, trans, m, n, nrhs, A, lda, B, ldb, X, ldx, info);
}

inline rocblas_status rocsolver_gels_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs, double* A,
    rocblas_int lda, rocblas_stride stA, double* B, rocblas_int ldb, rocblas_stride stB,
    double* X, rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocblas_status_not_implemented; // rocsolver_dgels_outofplace_strided_batched(handle, trans, m, n, nrhs, A, lda, stA, B, ldb, stB, X, ldx, stX, info, bc);
    else
        return rocsolver_dgels_outofplace(handle, trans, m, n, nrhs, A, lda, B, ldb,
X, ldx, info);
}

// rocblas_float_complex overload: STRIDED requests report rocblas_status_not_implemented
// (the rocsolver_cgels_outofplace_strided_batched call is disabled).
inline rocblas_status rocsolver_gels_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs,
    rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_float_complex* B,
    rocblas_int ldb, rocblas_stride stB, rocblas_float_complex* X, rocblas_int ldx,
    rocblas_stride stX, rocblas_int* info, rocblas_int bc)
{
    return STRIDED
        ? rocblas_status_not_implemented
        : rocsolver_cgels_outofplace(handle, trans, m, n, nrhs, A, lda, B, ldb, X, ldx, info);
}

// rocblas_double_complex overload: STRIDED requests report rocblas_status_not_implemented
// (the rocsolver_zgels_outofplace_strided_batched call is disabled).
inline rocblas_status rocsolver_gels_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs,
    rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_double_complex* B,
    rocblas_int ldb, rocblas_stride stB, rocblas_double_complex* X, rocblas_int ldx,
    rocblas_stride stX, rocblas_int* info, rocblas_int bc)
{
    return STRIDED
        ? rocblas_status_not_implemented
        : rocsolver_zgels_outofplace(handle, trans, m, n, nrhs, A, lda, B, ldb, X, ldx, info);
}

// batched
// float batched overload: always reports rocblas_status_not_implemented
// (the rocsolver_sgels_outofplace_batched call is disabled).
inline rocblas_status rocsolver_gels_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs, float* const A[],
    rocblas_int lda, rocblas_stride stA, float* const B[], rocblas_int ldb, rocblas_stride stB,
    float* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc)
{
    return rocblas_status_not_implemented;
}

inline rocblas_status rocsolver_gels_outofplace(bool STRIDED, rocblas_handle handle,
    rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int
nrhs, double* const A[], rocblas_int lda, rocblas_stride stA, double* const B[], rocblas_int ldb, rocblas_stride stB, double* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_dgels_outofplace_batched(handle, trans, m, n, nrhs, A, lda, B, ldb, X, ldx, info, bc); } inline rocblas_status rocsolver_gels_outofplace(bool STRIDED, rocblas_handle handle, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_float_complex* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_cgels_outofplace_batched(handle, trans, m, n, nrhs, A, lda, B, ldb, X, ldx, info, bc); } inline rocblas_status rocsolver_gels_outofplace(bool STRIDED, rocblas_handle handle, rocblas_operation trans, rocblas_int m, rocblas_int n, rocblas_int nrhs, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_double_complex* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_zgels_outofplace_batched(handle, trans, m, n, nrhs, A, lda, B, ldb, X, ldx, info, bc); } /********************************************************/ /******************** GEBD2_GEBRD ********************/ // normal and strided_batched inline rocblas_status rocsolver_gebd2_gebrd(bool STRIDED, bool GEBRD, rocblas_handle handle, rocblas_int m, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD, float* E, rocblas_stride stE, float* tauq, rocblas_stride stQ, float* taup, rocblas_stride stP, rocblas_int bc) { if(STRIDED) return GEBRD ? 
rocsolver_sgebrd_strided_batched(handle, m, n, A, lda, stA, D, stD, E, stE, tauq, stQ, taup, stP, bc)
                     : rocsolver_sgebd2_strided_batched(handle, m, n, A, lda, stA, D, stD, E, stE, tauq, stQ, taup, stP, bc);
    else
        return GEBRD ? rocsolver_sgebrd(handle, m, n, A, lda, D, E, tauq, taup)
                     : rocsolver_sgebd2(handle, m, n, A, lda, D, E, tauq, taup);
}

// double overload: GEBRD selects gebrd over gebd2, STRIDED selects the strided_batched entry point.
inline rocblas_status rocsolver_gebd2_gebrd(bool STRIDED, bool GEBRD, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double* D,
    rocblas_stride stD, double* E, rocblas_stride stE, double* tauq, rocblas_stride stQ,
    double* taup, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GEBRD)
            return rocsolver_dgebrd(handle, m, n, A, lda, D, E, tauq, taup);
        return rocsolver_dgebd2(handle, m, n, A, lda, D, E, tauq, taup);
    }
    if(GEBRD)
        return rocsolver_dgebrd_strided_batched(handle, m, n, A, lda, stA, D, stD, E, stE, tauq,
                                                stQ, taup, stP, bc);
    return rocsolver_dgebd2_strided_batched(handle, m, n, A, lda, stA, D, stD, E, stE, tauq, stQ,
                                            taup, stP, bc);
}

inline rocblas_status rocsolver_gebd2_gebrd(bool STRIDED, bool GEBRD, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, float* D, rocblas_stride stD, float* E, rocblas_stride stE,
    rocblas_float_complex* tauq, rocblas_stride stQ, rocblas_float_complex* taup,
    rocblas_stride stP, rocblas_int bc)
{
    if(STRIDED)
        return GEBRD
            ? rocsolver_cgebrd_strided_batched(handle, m, n, A, lda, stA, D, stD, E, stE, tauq, stQ, taup, stP, bc)
            : rocsolver_cgebd2_strided_batched(handle, m, n, A, lda, stA, D, stD, E, stE, tauq, stQ, taup, stP, bc);
    else
        return GEBRD ?
rocsolver_cgebrd(handle, m, n, A, lda, D, E, tauq, taup)
                     : rocsolver_cgebd2(handle, m, n, A, lda, D, E, tauq, taup);
}

// rocblas_double_complex overload (D and E are double): GEBRD selects gebrd over gebd2,
// STRIDED selects the strided_batched entry point.
inline rocblas_status rocsolver_gebd2_gebrd(bool STRIDED, bool GEBRD, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, double* D, rocblas_stride stD, double* E, rocblas_stride stE,
    rocblas_double_complex* tauq, rocblas_stride stQ, rocblas_double_complex* taup,
    rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(GEBRD)
            return rocsolver_zgebrd(handle, m, n, A, lda, D, E, tauq, taup);
        return rocsolver_zgebd2(handle, m, n, A, lda, D, E, tauq, taup);
    }
    if(GEBRD)
        return rocsolver_zgebrd_strided_batched(handle, m, n, A, lda, stA, D, stD, E, stE, tauq,
                                                stQ, taup, stP, bc);
    return rocsolver_zgebd2_strided_batched(handle, m, n, A, lda, stA, D, stD, E, stE, tauq, stQ,
                                            taup, stP, bc);
}

// batched
// float batched overload: STRIDED and stA are accepted but not used.
inline rocblas_status rocsolver_gebd2_gebrd(bool STRIDED, bool GEBRD, rocblas_handle handle,
    rocblas_int m, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA,
    float* D, rocblas_stride stD, float* E, rocblas_stride stE, float* tauq,
    rocblas_stride stQ, float* taup, rocblas_stride stP, rocblas_int bc)
{
    if(GEBRD)
        return rocsolver_sgebrd_batched(handle, m, n, A, lda, D, stD, E, stE, tauq, stQ, taup,
                                        stP, bc);
    return rocsolver_sgebd2_batched(handle, m, n, A, lda, D, stD, E, stE, tauq, stQ, taup, stP,
                                    bc);
}

inline rocblas_status rocsolver_gebd2_gebrd(bool STRIDED, bool GEBRD, rocblas_handle handle,
    rocblas_int m, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA,
    double* D, rocblas_stride stD, double* E, rocblas_stride stE, double* tauq,
    rocblas_stride stQ, double* taup, rocblas_stride stP, rocblas_int bc)
{
    return GEBRD ?
rocsolver_dgebrd_batched(handle, m, n, A, lda, D, stD, E, stE, tauq, stQ, taup, stP, bc)
                 : rocsolver_dgebd2_batched(handle, m, n, A, lda, D, stD, E, stE, tauq, stQ, taup, stP, bc);
}

// rocblas_float_complex batched overload (D and E are float).
inline rocblas_status rocsolver_gebd2_gebrd(bool STRIDED, bool GEBRD, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, float* D, rocblas_stride stD, float* E, rocblas_stride stE,
    rocblas_float_complex* tauq, rocblas_stride stQ, rocblas_float_complex* taup,
    rocblas_stride stP, rocblas_int bc)
{
    if(GEBRD)
        return rocsolver_cgebrd_batched(handle, m, n, A, lda, D, stD, E, stE, tauq, stQ, taup,
                                        stP, bc);
    return rocsolver_cgebd2_batched(handle, m, n, A, lda, D, stD, E, stE, tauq, stQ, taup, stP,
                                    bc);
}

// rocblas_double_complex batched overload (D and E are double).
inline rocblas_status rocsolver_gebd2_gebrd(bool STRIDED, bool GEBRD, rocblas_handle handle,
    rocblas_int m, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, double* D, rocblas_stride stD, double* E, rocblas_stride stE,
    rocblas_double_complex* tauq, rocblas_stride stQ, rocblas_double_complex* taup,
    rocblas_stride stP, rocblas_int bc)
{
    if(GEBRD)
        return rocsolver_zgebrd_batched(handle, m, n, A, lda, D, stD, E, stE, tauq, stQ, taup,
                                        stP, bc);
    return rocsolver_zgebd2_batched(handle, m, n, A, lda, D, stD, E, stE, tauq, stQ, taup, stP,
                                    bc);
}

/********************************************************/

/******************** SYTD2/SYTRD_HETD2/HETRD ********************/
// normal and strided_batched
inline rocblas_status rocsolver_sytxx_hetxx(bool STRIDED, bool SYTRD, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float* D,
    rocblas_stride stD, float* E, rocblas_stride stE, float* tau, rocblas_stride stP,
    rocblas_int bc)
{
    if(STRIDED)
        return SYTRD ?
rocsolver_ssytrd_strided_batched(handle, uplo, n, A, lda, stA, D, stD, E, stE, tau, stP, bc)
                     : rocsolver_ssytd2_strided_batched(handle, uplo, n, A, lda, stA, D, stD, E, stE, tau, stP, bc);
    else
        return SYTRD ? rocsolver_ssytrd(handle, uplo, n, A, lda, D, E, tau)
                     : rocsolver_ssytd2(handle, uplo, n, A, lda, D, E, tau);
}

// double overload: SYTRD selects sytrd over sytd2, STRIDED selects the strided_batched entry point.
inline rocblas_status rocsolver_sytxx_hetxx(bool STRIDED, bool SYTRD, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA,
    double* D, rocblas_stride stD, double* E, rocblas_stride stE, double* tau,
    rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(SYTRD)
            return rocsolver_dsytrd(handle, uplo, n, A, lda, D, E, tau);
        return rocsolver_dsytd2(handle, uplo, n, A, lda, D, E, tau);
    }
    if(SYTRD)
        return rocsolver_dsytrd_strided_batched(handle, uplo, n, A, lda, stA, D, stD, E, stE,
                                                tau, stP, bc);
    return rocsolver_dsytd2_strided_batched(handle, uplo, n, A, lda, stA, D, stD, E, stE, tau,
                                            stP, bc);
}

// rocblas_float_complex overload: routed to the hetrd/hetd2 entry points, with the same
// SYTRD and STRIDED selection.
inline rocblas_status rocsolver_sytxx_hetxx(bool STRIDED, bool SYTRD, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, float* D, rocblas_stride stD, float* E, rocblas_stride stE,
    rocblas_float_complex* tau, rocblas_stride stP, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(SYTRD)
            return rocsolver_chetrd(handle, uplo, n, A, lda, D, E, tau);
        return rocsolver_chetd2(handle, uplo, n, A, lda, D, E, tau);
    }
    if(SYTRD)
        return rocsolver_chetrd_strided_batched(handle, uplo, n, A, lda, stA, D, stD, E, stE,
                                                tau, stP, bc);
    return rocsolver_chetd2_strided_batched(handle, uplo, n, A, lda, stA, D, stD, E, stE, tau,
                                            stP, bc);
}

inline rocblas_status rocsolver_sytxx_hetxx(bool STRIDED, bool SYTRD, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, double* D, rocblas_stride stD, double* E, rocblas_stride stE,
    rocblas_double_complex* tau, rocblas_stride stP, rocblas_int bc)
{
    if(STRIDED)
        return SYTRD ?
rocsolver_zhetrd_strided_batched(handle, uplo, n, A, lda, stA, D, stD, E, stE, tau, stP, bc) : rocsolver_zhetd2_strided_batched(handle, uplo, n, A, lda, stA, D, stD, E, stE, tau, stP, bc); else return SYTRD ? rocsolver_zhetrd(handle, uplo, n, A, lda, D, E, tau) : rocsolver_zhetd2(handle, uplo, n, A, lda, D, E, tau); } // batched inline rocblas_status rocsolver_sytxx_hetxx(bool STRIDED, bool SYTRD, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD, float* E, rocblas_stride stE, float* tau, rocblas_stride stP, rocblas_int bc) { return SYTRD ? rocsolver_ssytrd_batched(handle, uplo, n, A, lda, D, stD, E, stE, tau, stP, bc) : rocsolver_ssytd2_batched(handle, uplo, n, A, lda, D, stD, E, stE, tau, stP, bc); } inline rocblas_status rocsolver_sytxx_hetxx(bool STRIDED, bool SYTRD, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double* D, rocblas_stride stD, double* E, rocblas_stride stE, double* tau, rocblas_stride stP, rocblas_int bc) { return SYTRD ? rocsolver_dsytrd_batched(handle, uplo, n, A, lda, D, stD, E, stE, tau, stP, bc) : rocsolver_dsytd2_batched(handle, uplo, n, A, lda, D, stD, E, stE, tau, stP, bc); } inline rocblas_status rocsolver_sytxx_hetxx(bool STRIDED, bool SYTRD, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD, float* E, rocblas_stride stE, rocblas_float_complex* tau, rocblas_stride stP, rocblas_int bc) { return SYTRD ? 
rocsolver_chetrd_batched(handle, uplo, n, A, lda, D, stD, E, stE, tau, stP, bc)
                 : rocsolver_chetd2_batched(handle, uplo, n, A, lda, D, stD, E, stE, tau, stP, bc);
}

// rocblas_double_complex batched overload: routed to the hetrd/hetd2 batched entry points.
inline rocblas_status rocsolver_sytxx_hetxx(bool STRIDED, bool SYTRD, rocblas_handle handle,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, double* D, rocblas_stride stD, double* E, rocblas_stride stE,
    rocblas_double_complex* tau, rocblas_stride stP, rocblas_int bc)
{
    if(SYTRD)
        return rocsolver_zhetrd_batched(handle, uplo, n, A, lda, D, stD, E, stE, tau, stP, bc);
    return rocsolver_zhetd2_batched(handle, uplo, n, A, lda, D, stD, E, stE, tau, stP, bc);
}

/********************************************************/

/******************** SYGS2/SYGST_HEGS2/HEGST ********************/
// normal and strided_batched
// float overload: SYGST selects sygst over sygs2, STRIDED selects the strided_batched entry point.
inline rocblas_status rocsolver_sygsx_hegsx(bool STRIDED, bool SYGST, rocblas_handle handle,
    rocblas_eform itype, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda,
    rocblas_stride stA, float* B, rocblas_int ldb, rocblas_stride stB, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(SYGST)
            return rocsolver_ssygst(handle, itype, uplo, n, A, lda, B, ldb);
        return rocsolver_ssygs2(handle, itype, uplo, n, A, lda, B, ldb);
    }
    if(SYGST)
        return rocsolver_ssygst_strided_batched(handle, itype, uplo, n, A, lda, stA, B, ldb, stB,
                                                bc);
    return rocsolver_ssygs2_strided_batched(handle, itype, uplo, n, A, lda, stA, B, ldb, stB, bc);
}

inline rocblas_status rocsolver_sygsx_hegsx(bool STRIDED, bool SYGST, rocblas_handle handle,
    rocblas_eform itype, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda,
    rocblas_stride stA, double* B, rocblas_int ldb, rocblas_stride stB, rocblas_int bc)
{
    if(STRIDED)
        return SYGST
            ? rocsolver_dsygst_strided_batched(handle, itype, uplo, n, A, lda, stA, B, ldb, stB, bc)
            : rocsolver_dsygs2_strided_batched(handle, itype, uplo, n, A, lda, stA, B, ldb, stB, bc);
    else
        return SYGST ?
rocsolver_dsygst(handle, itype, uplo, n, A, lda, B, ldb)
                     : rocsolver_dsygs2(handle, itype, uplo, n, A, lda, B, ldb);
}

// rocblas_float_complex: STRIDED picks the *_strided_batched entry points,
// SYGST picks hegst over hegs2. The non-strided calls do not receive stA, stB or bc.
inline rocblas_status rocsolver_sygsx_hegsx(
    bool STRIDED, bool SYGST, rocblas_handle handle, rocblas_eform itype, rocblas_fill uplo,
    rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(SYGST)
            return rocsolver_chegst(handle, itype, uplo, n, A, lda, B, ldb);

        return rocsolver_chegs2(handle, itype, uplo, n, A, lda, B, ldb);
    }

    if(SYGST)
        return rocsolver_chegst_strided_batched(handle, itype, uplo, n, A, lda, stA, B, ldb, stB,
                                                bc);

    return rocsolver_chegs2_strided_batched(handle, itype, uplo, n, A, lda, stA, B, ldb, stB, bc);
}

// rocblas_double_complex: same dispatch using the z-prefixed routines.
inline rocblas_status rocsolver_sygsx_hegsx(
    bool STRIDED, bool SYGST, rocblas_handle handle, rocblas_eform itype, rocblas_fill uplo,
    rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA,
    rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB, rocblas_int bc)
{
    if(!STRIDED)
    {
        if(SYGST)
            return rocsolver_zhegst(handle, itype, uplo, n, A, lda, B, ldb);

        return rocsolver_zhegs2(handle, itype, uplo, n, A, lda, B, ldb);
    }

    if(SYGST)
        return rocsolver_zhegst_strided_batched(handle, itype, uplo, n, A, lda, stA, B, ldb, stB,
                                                bc);

    return rocsolver_zhegs2_strided_batched(handle, itype, uplo, n, A, lda, stA, B, ldb, stB, bc);
}

// batched

// float, array-of-pointers A and B
inline rocblas_status rocsolver_sygsx_hegsx(
    bool STRIDED, bool SYGST, rocblas_handle handle, rocblas_eform itype, rocblas_fill uplo,
    rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float* const B[],
    rocblas_int ldb, rocblas_stride stB, rocblas_int bc)
{
    return SYGST ?
rocsolver_ssygst_batched(handle, itype, uplo, n, A, lda, B, ldb, bc)
                 : rocsolver_ssygs2_batched(handle, itype, uplo, n, A, lda, B, ldb, bc);
}

// double, array-of-pointers A and B: SYGST selects rocsolver_dsygst_batched, otherwise
// rocsolver_dsygs2_batched. STRIDED, stA and stB are not forwarded.
inline rocblas_status rocsolver_sygsx_hegsx(
    bool STRIDED, bool SYGST, rocblas_handle handle, rocblas_eform itype, rocblas_fill uplo,
    rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double* const B[],
    rocblas_int ldb, rocblas_stride stB, rocblas_int bc)
{
    if(SYGST)
        return rocsolver_dsygst_batched(handle, itype, uplo, n, A, lda, B, ldb, bc);

    return rocsolver_dsygs2_batched(handle, itype, uplo, n, A, lda, B, ldb, bc);
}

// rocblas_float_complex: SYGST selects rocsolver_chegst_batched, otherwise
// rocsolver_chegs2_batched.
inline rocblas_status rocsolver_sygsx_hegsx(
    bool STRIDED, bool SYGST, rocblas_handle handle, rocblas_eform itype, rocblas_fill uplo,
    rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA,
    rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int bc)
{
    if(SYGST)
        return rocsolver_chegst_batched(handle, itype, uplo, n, A, lda, B, ldb, bc);

    return rocsolver_chegs2_batched(handle, itype, uplo, n, A, lda, B, ldb, bc);
}

// rocblas_double_complex: SYGST selects rocsolver_zhegst_batched, otherwise
// rocsolver_zhegs2_batched.
inline rocblas_status rocsolver_sygsx_hegsx(
    bool STRIDED, bool SYGST, rocblas_handle handle, rocblas_eform itype, rocblas_fill uplo,
    rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA,
    rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_int bc)
{
    if(SYGST)
        return rocsolver_zhegst_batched(handle, itype, uplo, n, A, lda, B, ldb, bc);

    return rocsolver_zhegs2_batched(handle, itype, uplo, n, A, lda, B, ldb, bc);
}
/********************************************************/

/******************** SYEV/HEEV ********************/

// normal and strided_batched

// float
inline rocblas_status rocsolver_syev_heev(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n,
    float* A, rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD, float* E,
    rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ?
rocsolver_ssyev_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, E, stE, info, bc)
                   : rocsolver_ssyev(handle, evect, uplo, n, A, lda, D, E, info);
}

// double: STRIDED selects rocsolver_dsyev_strided_batched; otherwise rocsolver_dsyev is
// called without the strides and batch count.
inline rocblas_status rocsolver_syev_heev(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n,
    double* A, rocblas_int lda, rocblas_stride stA, double* D, rocblas_stride stD, double* E,
    rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_dsyev_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, E, stE,
                                               info, bc);

    return rocsolver_dsyev(handle, evect, uplo, n, A, lda, D, E, info);
}

// rocblas_float_complex (D and E are float): STRIDED selects
// rocsolver_cheev_strided_batched, otherwise rocsolver_cheev.
inline rocblas_status rocsolver_syev_heev(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n,
    rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD,
    float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_cheev_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, E, stE,
                                               info, bc);

    return rocsolver_cheev(handle, evect, uplo, n, A, lda, D, E, info);
}

// rocblas_double_complex (D and E are double)
inline rocblas_status rocsolver_syev_heev(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n,
    rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, double* D,
    rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ?
rocsolver_zheev_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, E, stE, info, bc) : rocsolver_zheev(handle, evect, uplo, n, A, lda, D, E, info); } // batched inline rocblas_status rocsolver_syev_heev(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD, float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_ssyev_batched(handle, evect, uplo, n, A, lda, D, stD, E, stE, info, bc); } inline rocblas_status rocsolver_syev_heev(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double* D, rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_dsyev_batched(handle, evect, uplo, n, A, lda, D, stD, E, stE, info, bc); } inline rocblas_status rocsolver_syev_heev(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD, float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_cheev_batched(handle, evect, uplo, n, A, lda, D, stD, E, stE, info, bc); } inline rocblas_status rocsolver_syev_heev(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, double* D, rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_zheev_batched(handle, evect, uplo, n, A, lda, D, stD, E, stE, info, bc); } /********************************************************/ /******************** SYEVD/HEEVD ********************/ // normal and strided_batched inline rocblas_status rocsolver_syevd_heevd(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int 
n, float* A, rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD, float* E,
    rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ? rocsolver_ssyevd_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD,
                                                      E, stE, info, bc)
                   : rocsolver_ssyevd(handle, evect, uplo, n, A, lda, D, E, info);
}

// double: STRIDED selects rocsolver_dsyevd_strided_batched; otherwise rocsolver_dsyevd is
// called without the strides and batch count.
inline rocblas_status rocsolver_syevd_heevd(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n,
    double* A, rocblas_int lda, rocblas_stride stA, double* D, rocblas_stride stD, double* E,
    rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_dsyevd_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, E,
                                                stE, info, bc);

    return rocsolver_dsyevd(handle, evect, uplo, n, A, lda, D, E, info);
}

// rocblas_float_complex (D and E are float): STRIDED selects
// rocsolver_cheevd_strided_batched, otherwise rocsolver_cheevd.
inline rocblas_status rocsolver_syevd_heevd(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n,
    rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD,
    float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_cheevd_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, E,
                                                stE, info, bc);

    return rocsolver_cheevd(handle, evect, uplo, n, A, lda, D, E, info);
}

// rocblas_double_complex (D and E are double)
inline rocblas_status rocsolver_syevd_heevd(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n,
    rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, double* D,
    rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ?
rocsolver_zheevd_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, E, stE, info, bc) : rocsolver_zheevd(handle, evect, uplo, n, A, lda, D, E, info); } // batched inline rocblas_status rocsolver_syevd_heevd(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD, float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_ssyevd_batched(handle, evect, uplo, n, A, lda, D, stD, E, stE, info, bc); } inline rocblas_status rocsolver_syevd_heevd(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double* D, rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_dsyevd_batched(handle, evect, uplo, n, A, lda, D, stD, E, stE, info, bc); } inline rocblas_status rocsolver_syevd_heevd(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, float* D, rocblas_stride stD, float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_cheevd_batched(handle, evect, uplo, n, A, lda, D, stD, E, stE, info, bc); } inline rocblas_status rocsolver_syevd_heevd(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, double* D, rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_zheevd_batched(handle, evect, uplo, n, A, lda, D, stD, E, stE, info, bc); } /********************************************************/ /******************** SYEVJ/HEEVJ ********************/ // normal and strided_batched inline rocblas_status rocsolver_syevj_heevj(bool STRIDED, rocblas_handle handle, rocblas_esort esort, rocblas_evect 
evect, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA,
    float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* W,
    rocblas_stride stW, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ? rocsolver_ssyevj_strided_batched(handle, esort, evect, uplo, n, A, lda, stA,
                                                      abstol, residual, max_sweeps, n_sweeps, W,
                                                      stW, info, bc)
                   : rocsolver_ssyevj(handle, esort, evect, uplo, n, A, lda, abstol, residual,
                                      max_sweeps, n_sweeps, W, info);
}

// double: STRIDED selects rocsolver_dsyevj_strided_batched; otherwise rocsolver_dsyevj is
// called without stA, stW and bc.
inline rocblas_status rocsolver_syevj_heevj(
    bool STRIDED, rocblas_handle handle, rocblas_esort esort, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA,
    double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* W,
    rocblas_stride stW, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_dsyevj_strided_batched(handle, esort, evect, uplo, n, A, lda, stA, abstol,
                                                residual, max_sweeps, n_sweeps, W, stW, info, bc);

    return rocsolver_dsyevj(handle, esort, evect, uplo, n, A, lda, abstol, residual, max_sweeps,
                            n_sweeps, W, info);
}

// rocblas_float_complex (abstol, residual and W are float)
inline rocblas_status rocsolver_syevj_heevj(
    bool STRIDED, rocblas_handle handle, rocblas_esort esort, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, float abstol, float* residual, rocblas_int max_sweeps,
    rocblas_int* n_sweeps, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ?
rocsolver_cheevj_strided_batched(handle, esort, evect, uplo, n, A, lda, stA, abstol, residual,
                                 max_sweeps, n_sweeps, W, stW, info, bc)
                   : rocsolver_cheevj(handle, esort, evect, uplo, n, A, lda, abstol, residual,
                                      max_sweeps, n_sweeps, W, info);
}

// rocblas_double_complex (abstol, residual and W are double): STRIDED selects
// rocsolver_zheevj_strided_batched; otherwise rocsolver_zheevj is called without
// stA, stW and bc.
inline rocblas_status rocsolver_syevj_heevj(
    bool STRIDED, rocblas_handle handle, rocblas_esort esort, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, double abstol, double* residual, rocblas_int max_sweeps,
    rocblas_int* n_sweeps, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_zheevj_strided_batched(handle, esort, evect, uplo, n, A, lda, stA, abstol,
                                                residual, max_sweeps, n_sweeps, W, stW, info, bc);

    return rocsolver_zheevj(handle, esort, evect, uplo, n, A, lda, abstol, residual, max_sweeps,
                            n_sweeps, W, info);
}

// batched
// These overloads take A as an array of pointers and always call the *_batched routine.
// STRIDED and stA are not forwarded.

// float -> rocsolver_ssyevj_batched
inline rocblas_status rocsolver_syevj_heevj(
    bool STRIDED, rocblas_handle handle, rocblas_esort esort, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA,
    float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* W,
    rocblas_stride stW, rocblas_int* info, rocblas_int bc)
{
    const rocblas_status status
        = rocsolver_ssyevj_batched(handle, esort, evect, uplo, n, A, lda, abstol, residual,
                                   max_sweeps, n_sweeps, W, stW, info, bc);
    return status;
}

// double -> rocsolver_dsyevj_batched
inline rocblas_status rocsolver_syevj_heevj(
    bool STRIDED, rocblas_handle handle, rocblas_esort esort, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA,
    double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* W,
    rocblas_stride stW, rocblas_int* info, rocblas_int bc)
{
    const rocblas_status status
        = rocsolver_dsyevj_batched(handle, esort, evect, uplo, n, A, lda, abstol, residual,
                                   max_sweeps, n_sweeps, W, stW, info, bc);
    return status;
}

// rocblas_float_complex
inline rocblas_status rocsolver_syevj_heevj(bool STRIDED, rocblas_handle handle,
                                            rocblas_esort esort, rocblas_evect evect,
rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, float abstol, float* residual, rocblas_int max_sweeps,
    rocblas_int* n_sweeps, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc)
{
    return rocsolver_cheevj_batched(handle, esort, evect, uplo, n, A, lda, abstol, residual,
                                    max_sweeps, n_sweeps, W, stW, info, bc);
}

// rocblas_double_complex, array-of-pointers A: always calls rocsolver_zheevj_batched.
// STRIDED and stA are not forwarded.
inline rocblas_status rocsolver_syevj_heevj(
    bool STRIDED, rocblas_handle handle, rocblas_esort esort, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, double abstol, double* residual, rocblas_int max_sweeps,
    rocblas_int* n_sweeps, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc)
{
    const rocblas_status status
        = rocsolver_zheevj_batched(handle, esort, evect, uplo, n, A, lda, abstol, residual,
                                   max_sweeps, n_sweeps, W, stW, info, bc);
    return status;
}
/********************************************************/

/******************** SYEVX/HEEVX ********************/

// normal and strided_batched

// float
inline rocblas_status rocsolver_syevx_heevx(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange,
    rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float vl,
    float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W,
    rocblas_stride stW, float* Z, rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail,
    rocblas_stride stF, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ?
rocsolver_ssyevx_strided_batched(handle, evect, erange, uplo, n, A, lda, stA, vl, vu, il, iu,
                                 abstol, nev, W, stW, Z, ldz, stZ, ifail, stF, info, bc)
                   : rocsolver_ssyevx(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu,
                                      abstol, nev, W, Z, ldz, ifail, info);
}

// double: STRIDED selects rocsolver_dsyevx_strided_batched; otherwise rocsolver_dsyevx is
// called without stA, stW, stZ, stF and bc.
inline rocblas_status rocsolver_syevx_heevx(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange,
    rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double vl,
    double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W,
    rocblas_stride stW, double* Z, rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail,
    rocblas_stride stF, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_dsyevx_strided_batched(handle, evect, erange, uplo, n, A, lda, stA, vl,
                                                vu, il, iu, abstol, nev, W, stW, Z, ldz, stZ,
                                                ifail, stF, info, bc);

    return rocsolver_dsyevx(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol, nev, W,
                            Z, ldz, ifail, info);
}

// rocblas_float_complex (vl, vu, abstol and W are float; Z is complex)
inline rocblas_status rocsolver_syevx_heevx(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange,
    rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol,
    rocblas_int* nev, float* W, rocblas_stride stW, rocblas_float_complex* Z, rocblas_int ldz,
    rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info,
    rocblas_int bc)
{
    return STRIDED ?
rocsolver_cheevx_strided_batched(handle, evect, erange, uplo, n, A, lda, stA, vl, vu, il, iu,
                                 abstol, nev, W, stW, Z, ldz, stZ, ifail, stF, info, bc)
                   : rocsolver_cheevx(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu,
                                      abstol, nev, W, Z, ldz, ifail, info);
}

// rocblas_double_complex (vl, vu, abstol and W are double; Z is complex): STRIDED selects
// rocsolver_zheevx_strided_batched; otherwise rocsolver_zheevx is called without
// stA, stW, stZ, stF and bc.
inline rocblas_status rocsolver_syevx_heevx(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol,
    rocblas_int* nev, double* W, rocblas_stride stW, rocblas_double_complex* Z, rocblas_int ldz,
    rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info,
    rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_zheevx_strided_batched(handle, evect, erange, uplo, n, A, lda, stA, vl,
                                                vu, il, iu, abstol, nev, W, stW, Z, ldz, stZ,
                                                ifail, stF, info, bc);

    return rocsolver_zheevx(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol, nev, W,
                            Z, ldz, ifail, info);
}

// batched
// These overloads take A and Z as arrays of pointers and always call the *_batched routine.
// STRIDED, stA and stZ are not forwarded.

// float -> rocsolver_ssyevx_batched
inline rocblas_status rocsolver_syevx_heevx(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange,
    rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA,
    float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W,
    rocblas_stride stW, float* const Z[], rocblas_int ldz, rocblas_stride stZ,
    rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc)
{
    const rocblas_status status
        = rocsolver_ssyevx_batched(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol,
                                   nev, W, stW, Z, ldz, ifail, stF, info, bc);
    return status;
}

// double
inline rocblas_status rocsolver_syevx_heevx(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange,
    rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA,
    double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev,
    double* W, rocblas_stride stW, double* const Z[],
rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info,
    rocblas_int bc)
{
    return rocsolver_dsyevx_batched(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol,
                                    nev, W, stW, Z, ldz, ifail, stF, info, bc);
}

// rocblas_float_complex, array-of-pointers A and Z: always calls rocsolver_cheevx_batched.
// STRIDED, stA and stZ are not forwarded.
inline rocblas_status rocsolver_syevx_heevx(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange,
    rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol,
    rocblas_int* nev, float* W, rocblas_stride stW, rocblas_float_complex* const Z[],
    rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF,
    rocblas_int* info, rocblas_int bc)
{
    const rocblas_status status
        = rocsolver_cheevx_batched(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol,
                                   nev, W, stW, Z, ldz, ifail, stF, info, bc);
    return status;
}

// rocblas_double_complex, array-of-pointers A and Z: always calls rocsolver_zheevx_batched.
inline rocblas_status rocsolver_syevx_heevx(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol,
    rocblas_int* nev, double* W, rocblas_stride stW, rocblas_double_complex* const Z[],
    rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF,
    rocblas_int* info, rocblas_int bc)
{
    const rocblas_status status
        = rocsolver_zheevx_batched(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol,
                                   nev, W, stW, Z, ldz, ifail, stF, info, bc);
    return status;
}
/********************************************************/

/******************** SYEVDX/HEEVDX_INPLACE ********************/

// normal and strided_batched

// float
inline rocblas_status rocsolver_syevdx_heevdx_inplace(
    bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange,
    rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float vl,
    float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W,
rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocblas_status_not_implemented // rocsolver_ssyevdx_inplace_strided_batched(handle, evect, erange, uplo, n, A, lda, stA, vl, vu, il, iu, abstol, nev, W, stW, info, bc) : rocsolver_ssyevdx_inplace(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol, nev, W, info); } inline rocblas_status rocsolver_syevdx_heevdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocblas_status_not_implemented // rocsolver_dsyevdx_inplace_strided_batched(handle, evect, erange, uplo, n, A, lda, stA, vl, vu, il, iu, abstol, nev, W, stW, info, bc) : rocsolver_dsyevdx_inplace(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol, nev, W, info); } inline rocblas_status rocsolver_syevdx_heevdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocblas_status_not_implemented // rocsolver_cheevdx_inplace_strided_batched(handle, evect, erange, uplo, n, A, lda, stA, vl, vu, il, iu, abstol, nev, W, stW, info, bc) : rocsolver_cheevdx_inplace(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol, nev, W, info); } inline rocblas_status rocsolver_syevdx_heevdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocblas_status_not_implemented // rocsolver_zheevdx_inplace_strided_batched(handle, evect, erange, uplo, n, A, lda, stA, vl, vu, il, iu, abstol, nev, W, stW, info, bc) : rocsolver_zheevdx_inplace(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol, nev, W, info); } // batched inline rocblas_status rocsolver_syevdx_heevdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_ssyevdx_inplace_batched(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol, nev, W, stW, info, bc); } inline rocblas_status rocsolver_syevdx_heevdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_dsyevdx_inplace_batched(handle, evect, erange, uplo, n, 
A, lda, vl, vu, il, iu, abstol, nev, W, stW, info, bc); } inline rocblas_status rocsolver_syevdx_heevdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_cheevdx_inplace_batched(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol, nev, W, stW, info, bc); } inline rocblas_status rocsolver_syevdx_heevdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_zheevdx_inplace_batched(handle, evect, erange, uplo, n, A, lda, vl, vu, il, iu, abstol, nev, W, stW, info, bc); } /********************************************************/ /******************** SYGV_HEGV ********************/ // normal and strided_batched inline rocblas_status rocsolver_sygv_hegv(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float* B, rocblas_int ldb, rocblas_stride stB, float* D, rocblas_stride stD, float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { if(STRIDED) return rocsolver_ssygv_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, stB, D, stD, E, stE, info, bc); else return rocsolver_ssygv(handle, itype, evect, uplo, n, A, lda, B, ldb, D, E, info); } inline rocblas_status rocsolver_sygv_hegv(bool STRIDED, rocblas_handle handle, rocblas_eform itype, 
rocblas_evect evect, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda,
    rocblas_stride stA, double* B, rocblas_int ldb, rocblas_stride stB, double* D,
    rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_dsygv_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb,
                                               stB, D, stD, E, stE, info, bc);
    else
        return rocsolver_dsygv(handle, itype, evect, uplo, n, A, lda, B, ldb, D, E, info);
}

// rocblas_float_complex (D and E are float): STRIDED selects
// rocsolver_chegv_strided_batched; otherwise rocsolver_chegv is called without the
// strides and batch count.
inline rocblas_status rocsolver_sygv_hegv(
    bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB, float* D,
    rocblas_stride stD, float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ? rocsolver_chegv_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B,
                                                     ldb, stB, D, stD, E, stE, info, bc)
                   : rocsolver_chegv(handle, itype, evect, uplo, n, A, lda, B, ldb, D, E, info);
}

// rocblas_double_complex (D and E are double): same dispatch using the zhegv routines.
inline rocblas_status rocsolver_sygv_hegv(
    bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB,
    double* D, rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info,
    rocblas_int bc)
{
    return STRIDED ? rocsolver_zhegv_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B,
                                                     ldb, stB, D, stD, E, stE, info, bc)
                   : rocsolver_zhegv(handle, itype, evect, uplo, n, A, lda, B, ldb, D, E, info);
}

// batched

// float, array-of-pointers A and B
inline rocblas_status rocsolver_sygv_hegv(
    bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA,
    float* const B[], rocblas_int ldb, rocblas_stride stB, float* D, rocblas_stride stD, float*
E, rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    return rocsolver_ssygv_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, E, stE,
                                   info, bc);
}

// double, array-of-pointers A and B: always calls rocsolver_dsygv_batched.
// STRIDED, stA and stB are not forwarded.
inline rocblas_status rocsolver_sygv_hegv(
    bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA,
    double* const B[], rocblas_int ldb, rocblas_stride stB, double* D, rocblas_stride stD,
    double* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    const rocblas_status status = rocsolver_dsygv_batched(handle, itype, evect, uplo, n, A, lda, B,
                                                          ldb, D, stD, E, stE, info, bc);
    return status;
}

// rocblas_float_complex: always calls rocsolver_chegv_batched.
inline rocblas_status rocsolver_sygv_hegv(
    bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB,
    float* D, rocblas_stride stD, float* E, rocblas_stride stE, rocblas_int* info,
    rocblas_int bc)
{
    const rocblas_status status = rocsolver_chegv_batched(handle, itype, evect, uplo, n, A, lda, B,
                                                          ldb, D, stD, E, stE, info, bc);
    return status;
}

// rocblas_double_complex: always calls rocsolver_zhegv_batched.
inline rocblas_status rocsolver_sygv_hegv(
    bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB,
    double* D, rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info,
    rocblas_int bc)
{
    const rocblas_status status = rocsolver_zhegv_batched(handle, itype, evect, uplo, n, A, lda, B,
                                                          ldb, D, stD, E, stE, info, bc);
    return status;
}
/********************************************************/

/******************** SYGVD_HEGVD ********************/

// normal and strided_batched

// float
inline rocblas_status rocsolver_sygvd_hegvd(
    bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA,
float* B, rocblas_int ldb, rocblas_stride stB, float* D, rocblas_stride stD, float* E,
    rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    if(STRIDED)
        return rocsolver_ssygvd_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb,
                                                stB, D, stD, E, stE, info, bc);
    else
        return rocsolver_ssygvd(handle, itype, evect, uplo, n, A, lda, B, ldb, D, E, info);
}

// double: STRIDED selects rocsolver_dsygvd_strided_batched; otherwise rocsolver_dsygvd is
// called without the strides and batch count.
inline rocblas_status rocsolver_sygvd_hegvd(
    bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double* B,
    rocblas_int ldb, rocblas_stride stB, double* D, rocblas_stride stD, double* E,
    rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ? rocsolver_dsygvd_strided_batched(handle, itype, evect, uplo, n, A, lda, stA,
                                                      B, ldb, stB, D, stD, E, stE, info, bc)
                   : rocsolver_dsygvd(handle, itype, evect, uplo, n, A, lda, B, ldb, D, E, info);
}

// rocblas_float_complex (D and E are float): same dispatch using the chegvd routines.
inline rocblas_status rocsolver_sygvd_hegvd(
    bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB, float* D,
    rocblas_stride stD, float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc)
{
    return STRIDED ? rocsolver_chegvd_strided_batched(handle, itype, evect, uplo, n, A, lda, stA,
                                                      B, ldb, stB, D, stD, E, stE, info, bc)
                   : rocsolver_chegvd(handle, itype, evect, uplo, n, A, lda, B, ldb, D, E, info);
}

// rocblas_double_complex (D and E are double)
inline rocblas_status rocsolver_sygvd_hegvd(
    bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect,
    rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda,
    rocblas_stride stA, rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB,
    double* D, rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info,
    rocblas_int bc)
{
    if(STRIDED)
        return
rocsolver_zhegvd_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, stB, D, stD, E, stE, info, bc); else return rocsolver_zhegvd(handle, itype, evect, uplo, n, A, lda, B, ldb, D, E, info); } // batched inline rocblas_status rocsolver_sygvd_hegvd(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float* const B[], rocblas_int ldb, rocblas_stride stB, float* D, rocblas_stride stD, float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_ssygvd_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, E, stE, info, bc); } inline rocblas_status rocsolver_sygvd_hegvd(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double* const B[], rocblas_int ldb, rocblas_stride stB, double* D, rocblas_stride stD, double* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_dsygvd_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, E, stE, info, bc); } inline rocblas_status rocsolver_sygvd_hegvd(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB, float* D, rocblas_stride stD, float* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_chegvd_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, E, stE, info, bc); } inline rocblas_status rocsolver_sygvd_hegvd(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, double* D, rocblas_stride 
stD, double* E, rocblas_stride stE, rocblas_int* info, rocblas_int bc) { return rocsolver_zhegvd_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, E, stE, info, bc); } /********************************************************/ /******************** SYGVJ_HEGVJ ********************/ // normal and strided_batched inline rocblas_status rocsolver_sygvj_hegvj(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float* B, rocblas_int ldb, rocblas_stride stB, float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { if(STRIDED) return rocsolver_ssygvj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, stB, abstol, residual, max_sweeps, n_sweeps, W, stW, info, bc); else return rocsolver_ssygvj(handle, itype, evect, uplo, n, A, lda, B, ldb, abstol, residual, max_sweeps, n_sweeps, W, info); } inline rocblas_status rocsolver_sygvj_hegvj(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double* B, rocblas_int ldb, rocblas_stride stB, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { if(STRIDED) return rocsolver_dsygvj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, stB, abstol, residual, max_sweeps, n_sweeps, W, stW, info, bc); else return rocsolver_dsygvj(handle, itype, evect, uplo, n, A, lda, B, ldb, abstol, residual, max_sweeps, n_sweeps, W, info); } inline rocblas_status rocsolver_sygvj_hegvj(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB, 
float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { if(STRIDED) return rocsolver_chegvj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, stB, abstol, residual, max_sweeps, n_sweeps, W, stW, info, bc); else return rocsolver_chegvj(handle, itype, evect, uplo, n, A, lda, B, ldb, abstol, residual, max_sweeps, n_sweeps, W, info); } inline rocblas_status rocsolver_sygvj_hegvj(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { if(STRIDED) return rocsolver_zhegvj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, stB, abstol, residual, max_sweeps, n_sweeps, W, stW, info, bc); else return rocsolver_zhegvj(handle, itype, evect, uplo, n, A, lda, B, ldb, abstol, residual, max_sweeps, n_sweeps, W, info); } // batched inline rocblas_status rocsolver_sygvj_hegvj(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float* const B[], rocblas_int ldb, rocblas_stride stB, float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocsolver_ssygvj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, abstol, residual, max_sweeps, n_sweeps, W, stW, info, bc); } inline rocblas_status rocsolver_sygvj_hegvj(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double* const B[], rocblas_int ldb, rocblas_stride 
stB, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocsolver_dsygvj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, abstol, residual, max_sweeps, n_sweeps, W, stW, info, bc); } inline rocblas_status rocsolver_sygvj_hegvj(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB, float abstol, float* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocsolver_chegvj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, abstol, residual, max_sweeps, n_sweeps, W, stW, info, bc); } inline rocblas_status rocsolver_sygvj_hegvj(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, double abstol, double* residual, rocblas_int max_sweeps, rocblas_int* n_sweeps, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocsolver_zhegvj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, abstol, residual, max_sweeps, n_sweeps, W, stW, info, bc); } /********************************************************/ /******************** SYGVX/HEGVX ********************/ // normal and strided_batched inline rocblas_status rocsolver_sygvx_hegvx(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float* B, rocblas_int ldb, rocblas_stride stB, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, rocblas_stride stW, 
float* Z, rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_ssygvx_strided_batched(handle, itype, evect, erange, uplo, n, A, lda, stA, B, ldb, stB, vl, vu, il, iu, abstol, nev, W, stW, Z, ldz, stZ, ifail, stF, info, bc) : rocsolver_ssygvx(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, Z, ldz, ifail, info); } inline rocblas_status rocsolver_sygvx_hegvx(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double* B, rocblas_int ldb, rocblas_stride stB, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, double* Z, rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_dsygvx_strided_batched(handle, itype, evect, erange, uplo, n, A, lda, stA, B, ldb, stB, vl, vu, il, iu, abstol, nev, W, stW, Z, ldz, stZ, ifail, stF, info, bc) : rocsolver_dsygvx(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, Z, ldz, ifail, info); } inline rocblas_status rocsolver_sygvx_hegvx(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, rocblas_stride stW, rocblas_float_complex* Z, rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_chegvx_strided_batched(handle, itype, evect, erange, uplo, n, A, lda, stA, B, ldb, stB, vl, vu, il, iu, abstol, nev, W, stW, Z, ldz, stZ, ifail, stF, info, bc) : rocsolver_chegvx(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, Z, ldz, ifail, info); } inline rocblas_status rocsolver_sygvx_hegvx(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, rocblas_double_complex* Z, rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_zhegvx_strided_batched(handle, itype, evect, erange, uplo, n, A, lda, stA, B, ldb, stB, vl, vu, il, iu, abstol, nev, W, stW, Z, ldz, stZ, ifail, stF, info, bc) : rocsolver_zhegvx(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, Z, ldz, ifail, info); } // batched inline rocblas_status rocsolver_sygvx_hegvx(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float* const B[], rocblas_int ldb, rocblas_stride stB, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, rocblas_stride stW, float* const Z[], rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return rocsolver_ssygvx_batched(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, stW, Z, ldz, ifail, stF, info, bc); } inline rocblas_status rocsolver_sygvx_hegvx(bool STRIDED, rocblas_handle handle, rocblas_eform itype, 
rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double* const B[], rocblas_int ldb, rocblas_stride stB, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, double* const Z[], rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return rocsolver_dsygvx_batched(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, stW, Z, ldz, ifail, stF, info, bc); }

// Batched overload for rocblas_float_complex matrices: forwards to
// rocsolver_chegvx_batched and returns its status.
// STRIDED, stA, stB and stZ are part of the common overload signature but are
// not forwarded by this overload.
// abstol is declared float, like vl, vu and W of this overload.
inline rocblas_status rocsolver_sygvx_hegvx(bool STRIDED,
                                            rocblas_handle handle,
                                            rocblas_eform itype,
                                            rocblas_evect evect,
                                            rocblas_erange erange,
                                            rocblas_fill uplo,
                                            rocblas_int n,
                                            rocblas_float_complex* const A[],
                                            rocblas_int lda,
                                            rocblas_stride stA,
                                            rocblas_float_complex* const B[],
                                            rocblas_int ldb,
                                            rocblas_stride stB,
                                            float vl,
                                            float vu,
                                            rocblas_int il,
                                            rocblas_int iu,
                                            float abstol,
                                            rocblas_int* nev,
                                            float* W,
                                            rocblas_stride stW,
                                            rocblas_float_complex* const Z[],
                                            rocblas_int ldz,
                                            rocblas_stride stZ,
                                            rocblas_int* ifail,
                                            rocblas_stride stF,
                                            rocblas_int* info,
                                            rocblas_int bc)
{
    return rocsolver_chegvx_batched(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu,
                                    il, iu, abstol, nev, W, stW, Z, ldz, ifail, stF, info, bc);
}

inline rocblas_status rocsolver_sygvx_hegvx(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, rocblas_double_complex* const Z[], rocblas_int ldz, rocblas_stride stZ, rocblas_int* ifail, rocblas_stride stF, rocblas_int* info, rocblas_int bc) { return rocsolver_zhegvx_batched(handle, itype, evect, erange, uplo,
n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, stW, Z, ldz, ifail, stF, info, bc); } /********************************************************/ /******************** SYGVX/HEGVX_INPLACE ********************/ // normal and strided_batched inline rocblas_status rocsolver_sygvdx_hegvdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, float* B, rocblas_int ldb, rocblas_stride stB, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocblas_status_not_implemented // rocsolver_ssygvdx_inplace_strided_batched(handle, itype, evect, erange, uplo, n, A, lda, stA, B, ldb, stB, vl, vu, il, iu, abstol, nev, W, stW, info, bc) : rocsolver_ssygvdx_inplace(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, info); } inline rocblas_status rocsolver_sygvdx_hegvdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, double* B, rocblas_int ldb, rocblas_stride stB, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocblas_status_not_implemented // rocsolver_dsygvdx_inplace_strided_batched(handle, itype, evect, erange, uplo, n, A, lda, stA, B, ldb, stB, vl, vu, il, iu, abstol, nev, W, stW, info, bc) : rocsolver_dsygvdx_inplace(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, info); } inline rocblas_status rocsolver_sygvdx_hegvdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocblas_status_not_implemented // rocsolver_chegvdx_inplace_strided_batched(handle, itype, evect, erange, uplo, n, A, lda, stA, B, ldb, stB, vl, vu, il, iu, abstol, nev, W, stW, info, bc) : rocsolver_chegvdx_inplace(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, info); } inline rocblas_status rocsolver_sygvdx_hegvdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocblas_status_not_implemented // rocsolver_zhegvdx_inplace_strided_batched(handle, itype, evect, erange, uplo, n, A, lda, stA, B, ldb, stB, vl, vu, il, iu, abstol, nev, W, stW, info, bc) : rocsolver_zhegvdx_inplace(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, info); } // batched inline rocblas_status rocsolver_sygvdx_hegvdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, float* const B[], rocblas_int ldb, rocblas_stride stB, float vl, float vu, rocblas_int il, rocblas_int iu, float abstol, rocblas_int* nev, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_ssygvdx_inplace_batched(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, stW, info, bc); } inline rocblas_status rocsolver_sygvdx_hegvdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, double* const B[], rocblas_int ldb, rocblas_stride stB, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_dsygvdx_inplace_batched(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, stW, info, bc); } inline rocblas_status rocsolver_sygvdx_hegvdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB, float vl, float vu, rocblas_int il, rocblas_int iu, double abstol, 
rocblas_int* nev, float* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_chegvdx_inplace_batched(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, stW, info, bc); } inline rocblas_status rocsolver_sygvdx_hegvdx_inplace(bool STRIDED, rocblas_handle handle, rocblas_eform itype, rocblas_evect evect, rocblas_erange erange, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, double vl, double vu, rocblas_int il, rocblas_int iu, double abstol, rocblas_int* nev, double* W, rocblas_stride stW, rocblas_int* info, rocblas_int bc) { return rocblas_status_not_implemented; // rocsolver_zhegvdx_inplace_batched(handle, itype, evect, erange, uplo, n, A, lda, B, ldb, vl, vu, il, iu, abstol, nev, W, stW, info, bc); } /********************************************************/ /******************** SYTF2_SYTRF ********************/ // normal and strided_batched inline rocblas_status rocsolver_sytf2_sytrf(bool STRIDED, bool SYTRF, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, float* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { if(STRIDED) return SYTRF ? rocsolver_ssytrf_strided_batched(handle, uplo, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_ssytf2_strided_batched(handle, uplo, n, A, lda, stA, ipiv, stP, info, bc); else return SYTRF ? rocsolver_ssytrf(handle, uplo, n, A, lda, ipiv, info) : rocsolver_ssytf2(handle, uplo, n, A, lda, ipiv, info); } inline rocblas_status rocsolver_sytf2_sytrf(bool STRIDED, bool SYTRF, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, double* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { if(STRIDED) return SYTRF ? 
rocsolver_dsytrf_strided_batched(handle, uplo, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_dsytf2_strided_batched(handle, uplo, n, A, lda, stA, ipiv, stP, info, bc); else return SYTRF ? rocsolver_dsytrf(handle, uplo, n, A, lda, ipiv, info) : rocsolver_dsytf2(handle, uplo, n, A, lda, ipiv, info); } inline rocblas_status rocsolver_sytf2_sytrf(bool STRIDED, bool SYTRF, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { if(STRIDED) return SYTRF ? rocsolver_csytrf_strided_batched(handle, uplo, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_csytf2_strided_batched(handle, uplo, n, A, lda, stA, ipiv, stP, info, bc); else return SYTRF ? rocsolver_csytrf(handle, uplo, n, A, lda, ipiv, info) : rocsolver_csytf2(handle, uplo, n, A, lda, ipiv, info); } inline rocblas_status rocsolver_sytf2_sytrf(bool STRIDED, bool SYTRF, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { if(STRIDED) return SYTRF ? rocsolver_zsytrf_strided_batched(handle, uplo, n, A, lda, stA, ipiv, stP, info, bc) : rocsolver_zsytf2_strided_batched(handle, uplo, n, A, lda, stA, ipiv, stP, info, bc); else return SYTRF ? rocsolver_zsytrf(handle, uplo, n, A, lda, ipiv, info) : rocsolver_zsytf2(handle, uplo, n, A, lda, ipiv, info); } // batched inline rocblas_status rocsolver_sytf2_sytrf(bool STRIDED, bool SYTRF, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, float* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return SYTRF ? 
rocsolver_ssytrf_batched(handle, uplo, n, A, lda, ipiv, stP, info, bc) : rocsolver_ssytf2_batched(handle, uplo, n, A, lda, ipiv, stP, info, bc); } inline rocblas_status rocsolver_sytf2_sytrf(bool STRIDED, bool SYTRF, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, double* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return SYTRF ? rocsolver_dsytrf_batched(handle, uplo, n, A, lda, ipiv, stP, info, bc) : rocsolver_dsytf2_batched(handle, uplo, n, A, lda, ipiv, stP, info, bc); } inline rocblas_status rocsolver_sytf2_sytrf(bool STRIDED, bool SYTRF, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return SYTRF ? rocsolver_csytrf_batched(handle, uplo, n, A, lda, ipiv, stP, info, bc) : rocsolver_csytf2_batched(handle, uplo, n, A, lda, ipiv, stP, info, bc); } inline rocblas_status rocsolver_sytf2_sytrf(bool STRIDED, bool SYTRF, rocblas_handle handle, rocblas_fill uplo, rocblas_int n, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_int* ipiv, rocblas_stride stP, rocblas_int* info, rocblas_int bc) { return SYTRF ? rocsolver_zsytrf_batched(handle, uplo, n, A, lda, ipiv, stP, info, bc) : rocsolver_zsytf2_batched(handle, uplo, n, A, lda, ipiv, stP, info, bc); } /********************************************************/ /******************** GEBLTTRF_NPVT ********************/ // normal and strided_batched inline rocblas_status rocsolver_geblttrf_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, float* A, rocblas_int lda, rocblas_stride stA, float* B, rocblas_int ldb, rocblas_stride stB, float* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_sgeblttrf_npvt_strided_batched(handle, nb, nblocks, A, lda, stA, B, ldb, stB, C, ldc, stC, info, bc) : rocsolver_sgeblttrf_npvt(handle, nb, nblocks, A, lda, B, ldb, C, ldc, info); } inline rocblas_status rocsolver_geblttrf_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, double* A, rocblas_int lda, rocblas_stride stA, double* B, rocblas_int ldb, rocblas_stride stB, double* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_dgeblttrf_npvt_strided_batched(handle, nb, nblocks, A, lda, stA, B, ldb, stB, C, ldc, stC, info, bc) : rocsolver_dgeblttrf_npvt(handle, nb, nblocks, A, lda, B, ldb, C, ldc, info); } inline rocblas_status rocsolver_geblttrf_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB, rocblas_float_complex* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return STRIDED ? rocsolver_cgeblttrf_npvt_strided_batched(handle, nb, nblocks, A, lda, stA, B, ldb, stB, C, ldc, stC, info, bc) : rocsolver_cgeblttrf_npvt(handle, nb, nblocks, A, lda, B, ldb, C, ldc, info); } inline rocblas_status rocsolver_geblttrf_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB, rocblas_double_complex* C, rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return STRIDED ? 
rocsolver_zgeblttrf_npvt_strided_batched(handle, nb, nblocks, A, lda, stA, B, ldb, stB, C, ldc, stC, info, bc) : rocsolver_zgeblttrf_npvt(handle, nb, nblocks, A, lda, B, ldb, C, ldc, info); } // batched inline rocblas_status rocsolver_geblttrf_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, float* const A[], rocblas_int lda, rocblas_stride stA, float* const B[], rocblas_int ldb, rocblas_stride stB, float* const C[], rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return rocsolver_sgeblttrf_npvt_batched(handle, nb, nblocks, A, lda, B, ldb, C, ldc, info, bc); } inline rocblas_status rocsolver_geblttrf_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, double* const A[], rocblas_int lda, rocblas_stride stA, double* const B[], rocblas_int ldb, rocblas_stride stB, double* const C[], rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return rocsolver_dgeblttrf_npvt_batched(handle, nb, nblocks, A, lda, B, ldb, C, ldc, info, bc); } inline rocblas_status rocsolver_geblttrf_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_float_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_float_complex* const C[], rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return rocsolver_cgeblttrf_npvt_batched(handle, nb, nblocks, A, lda, B, ldb, C, ldc, info, bc); } inline rocblas_status rocsolver_geblttrf_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_double_complex* const C[], rocblas_int ldc, rocblas_stride stC, rocblas_int* info, rocblas_int bc) { return rocsolver_zgeblttrf_npvt_batched(handle, nb, nblocks, A, lda, B, ldb, C, ldc, info, bc); } 
/********************************************************/ /******************** GEBLTTRS_NPVT ********************/ // normal and strided_batched inline rocblas_status rocsolver_geblttrs_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_int nrhs, float* A, rocblas_int lda, rocblas_stride stA, float* B, rocblas_int ldb, rocblas_stride stB, float* C, rocblas_int ldc, rocblas_stride stC, float* X, rocblas_int ldx, rocblas_stride stX, rocblas_int bc) { return STRIDED ? rocsolver_sgeblttrs_npvt_strided_batched(handle, nb, nblocks, nrhs, A, lda, stA, B, ldb, stB, C, ldc, stC, X, ldx, stX, bc) : rocsolver_sgeblttrs_npvt(handle, nb, nblocks, nrhs, A, lda, B, ldb, C, ldc, X, ldx); } inline rocblas_status rocsolver_geblttrs_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_int nrhs, double* A, rocblas_int lda, rocblas_stride stA, double* B, rocblas_int ldb, rocblas_stride stB, double* C, rocblas_int ldc, rocblas_stride stC, double* X, rocblas_int ldx, rocblas_stride stX, rocblas_int bc) { return STRIDED ? rocsolver_dgeblttrs_npvt_strided_batched(handle, nb, nblocks, nrhs, A, lda, stA, B, ldb, stB, C, ldc, stC, X, ldx, stX, bc) : rocsolver_dgeblttrs_npvt(handle, nb, nblocks, nrhs, A, lda, B, ldb, C, ldc, X, ldx); } inline rocblas_status rocsolver_geblttrs_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_int nrhs, rocblas_float_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_float_complex* B, rocblas_int ldb, rocblas_stride stB, rocblas_float_complex* C, rocblas_int ldc, rocblas_stride stC, rocblas_float_complex* X, rocblas_int ldx, rocblas_stride stX, rocblas_int bc) { return STRIDED ? 
rocsolver_cgeblttrs_npvt_strided_batched(handle, nb, nblocks, nrhs, A, lda, stA, B, ldb, stB, C, ldc, stC, X, ldx, stX, bc) : rocsolver_cgeblttrs_npvt(handle, nb, nblocks, nrhs, A, lda, B, ldb, C, ldc, X, ldx); } inline rocblas_status rocsolver_geblttrs_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_int nrhs, rocblas_double_complex* A, rocblas_int lda, rocblas_stride stA, rocblas_double_complex* B, rocblas_int ldb, rocblas_stride stB, rocblas_double_complex* C, rocblas_int ldc, rocblas_stride stC, rocblas_double_complex* X, rocblas_int ldx, rocblas_stride stX, rocblas_int bc) { return STRIDED ? rocsolver_zgeblttrs_npvt_strided_batched(handle, nb, nblocks, nrhs, A, lda, stA, B, ldb, stB, C, ldc, stC, X, ldx, stX, bc) : rocsolver_zgeblttrs_npvt(handle, nb, nblocks, nrhs, A, lda, B, ldb, C, ldc, X, ldx); } // batched inline rocblas_status rocsolver_geblttrs_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_int nrhs, float* const A[], rocblas_int lda, rocblas_stride stA, float* const B[], rocblas_int ldb, rocblas_stride stB, float* const C[], rocblas_int ldc, rocblas_stride stC, float* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int bc) { return rocsolver_sgeblttrs_npvt_batched(handle, nb, nblocks, nrhs, A, lda, B, ldb, C, ldc, X, ldx, bc); } inline rocblas_status rocsolver_geblttrs_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_int nrhs, double* const A[], rocblas_int lda, rocblas_stride stA, double* const B[], rocblas_int ldb, rocblas_stride stB, double* const C[], rocblas_int ldc, rocblas_stride stC, double* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int bc) { return rocsolver_dgeblttrs_npvt_batched(handle, nb, nblocks, nrhs, A, lda, B, ldb, C, ldc, X, ldx, bc); } inline rocblas_status rocsolver_geblttrs_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_int nrhs, rocblas_float_complex* 
const A[], rocblas_int lda, rocblas_stride stA, rocblas_float_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_float_complex* const C[], rocblas_int ldc, rocblas_stride stC, rocblas_float_complex* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int bc) { return rocsolver_cgeblttrs_npvt_batched(handle, nb, nblocks, nrhs, A, lda, B, ldb, C, ldc, X, ldx, bc); } inline rocblas_status rocsolver_geblttrs_npvt(bool STRIDED, rocblas_handle handle, rocblas_int nb, rocblas_int nblocks, rocblas_int nrhs, rocblas_double_complex* const A[], rocblas_int lda, rocblas_stride stA, rocblas_double_complex* const B[], rocblas_int ldb, rocblas_stride stB, rocblas_double_complex* const C[], rocblas_int ldc, rocblas_stride stC, rocblas_double_complex* const X[], rocblas_int ldx, rocblas_stride stX, rocblas_int bc) { return rocsolver_zgeblttrs_npvt_batched(handle, nb, nblocks, nrhs, A, lda, B, ldb, C, ldc, X, ldx, bc); } /********************************************************/ rocSOLVER-rocm-5.5.1/clients/include/rocsolver_arguments.hpp000066400000000000000000000172561436600607200241150ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include #include #include #include #include #include "rocblascommon/program_options.hpp" using variables_map = roc::variables_map; using variable_value = roc::variable_value; class Arguments : private std::map { using base = std::map; // names of arguments that have not yet been used by tests std::set to_consume; public: // test options rocblas_int norm_check = 0; rocblas_int unit_check = 1; rocblas_int timing = 0; rocblas_int perf = 0; rocblas_int singular = 0; rocblas_int iters = 5; rocblas_int mem_query = 0; rocblas_int profile = 0; rocblas_int profile_kernels = 0; rocblas_int batch_count = 1; // get and set function arguments template const T& peek(const std::string& name) const { return at(name).as(); } template const T& get(const std::string& name) { to_consume.erase(name); auto val = find(name); if(val != end() && !val->second.empty()) return val->second.as(); else throw std::invalid_argument("No value provided for " + name); } template const T get(const std::string& name, const T& default_value) { to_consume.erase(name); auto val = find(name); if(val != end() && !val->second.empty() && !val->second.defaulted()) return val->second.as(); else return default_value; } template void set(const std::string& name, const T& val) { to_consume.insert(name); base::operator[](name) = variable_value(val, false); } void populate(const variables_map& vm) { for(auto& pair : vm) { base::operator[](pair.first) = pair.second; if(!pair.second.empty() && !pair.second.defaulted()) to_consume.insert(pair.first); } // remove test arguments to_consume.erase("help"); to_consume.erase("function"); to_consume.erase("precision"); to_consume.erase("batch_count"); to_consume.erase("verify"); to_consume.erase("iters"); to_consume.erase("mem_query"); to_consume.erase("profile"); to_consume.erase("profile_kernels"); to_consume.erase("perf"); to_consume.erase("singular"); to_consume.erase("device"); } 
void clear() { to_consume.clear(); base::clear(); } // validate function arguments void validate_precision(const std::string name) const { auto val = find(name); if(val == end()) return; char precision = val->second.as(); if(precision != 's' && precision != 'd' && precision != 'c' && precision != 'z') throw std::invalid_argument("Invalid value for " + name); } void validate_operation(const std::string name) const { auto val = find(name); if(val == end()) return; char trans = val->second.as(); if(trans != 'N' && trans != 'T' && trans != 'C') throw std::invalid_argument("Invalid value for " + name); } void validate_side(const std::string name) const { auto val = find(name); if(val == end()) return; char side = val->second.as(); if(side != 'L' && side != 'R' && side != 'B') throw std::invalid_argument("Invalid value for " + name); } void validate_fill(const std::string name) const { auto val = find(name); if(val == end()) return; char uplo = val->second.as(); if(uplo != 'U' && uplo != 'L' && uplo != 'F') throw std::invalid_argument("Invalid value for " + name); } void validate_diag(const std::string name) const { auto val = find(name); if(val == end()) return; char diag = val->second.as(); if(diag != 'N' && diag != 'U') throw std::invalid_argument("Invalid value for " + name); } void validate_direct(const std::string name) const { auto val = find(name); if(val == end()) return; char direct = val->second.as(); if(direct != 'F' && direct != 'B') throw std::invalid_argument("Invalid value for " + name); } void validate_storev(const std::string name) const { auto val = find(name); if(val == end()) return; char storev = val->second.as(); if(storev != 'R' && storev != 'C') throw std::invalid_argument("Invalid value for " + name); } void validate_svect(const std::string name) const { auto val = find(name); if(val == end()) return; char svect = val->second.as(); if(svect != 'A' && svect != 'S' && svect != 'V' && svect != 'O' && svect != 'N') throw 
std::invalid_argument("Invalid value for " + name); } void validate_srange(const std::string name) const { auto val = find(name); if(val == end()) return; char range = val->second.as(); if(range != 'A' && range != 'V' && range != 'I') throw std::invalid_argument("Invalid value for " + name); } void validate_workmode(const std::string name) const { auto val = find(name); if(val == end()) return; char workmode = val->second.as(); if(workmode != 'O' && workmode != 'I') throw std::invalid_argument("Invalid value for " + name); } void validate_evect(const std::string name) const { auto val = find(name); if(val == end()) return; char evect = val->second.as(); if(evect != 'V' && evect != 'I' && evect != 'N') throw std::invalid_argument("Invalid value for " + name); } void validate_erange(const std::string name) const { auto val = find(name); if(val == end()) return; char range = val->second.as(); if(range != 'A' && range != 'V' && range != 'I') throw std::invalid_argument("Invalid value for " + name); } void validate_eorder(const std::string name) const { auto val = find(name); if(val == end()) return; char order = val->second.as(); if(order != 'B' && order != 'E') throw std::invalid_argument("Invalid value for " + name); } void validate_esort(const std::string name) const { auto val = find(name); if(val == end()) return; char sort = val->second.as(); if(sort != 'A' && sort != 'N') throw std::invalid_argument("Invalid value for " + name); } void validate_itype(const std::string name) const { auto val = find(name); if(val == end()) return; char itype = val->second.as(); if(itype != '1' && itype != '2' && itype != '3') throw std::invalid_argument("Invalid value for " + name); } void validate_consumed() const { if(!to_consume.empty()) throw std::invalid_argument( fmt::format("Not all arguments were consumed: {}", fmt::join(to_consume, " "))); } }; 
rocSOLVER-rocm-5.5.1/clients/include/rocsolver_dispatcher.hpp000066400000000000000000000516751436600607200242410ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "rocblas/rocblas.h" #include "rocsolver_arguments.hpp" #include #include #include "testing_bdsqr.hpp" #include "testing_bdsvdx.hpp" #include "testing_gebd2_gebrd.hpp" #include "testing_geblttrf_npvt.hpp" #include "testing_geblttrs_npvt.hpp" #include "testing_gelq2_gelqf.hpp" #include "testing_gels.hpp" #include "testing_geql2_geqlf.hpp" #include "testing_geqr2_geqrf.hpp" #include "testing_gerq2_gerqf.hpp" #include "testing_gesv.hpp" #include "testing_gesvd.hpp" #include "testing_gesvdj.hpp" #include "testing_gesvdx.hpp" #include "testing_getf2_getrf.hpp" #include "testing_getf2_getrf_npvt.hpp" #include "testing_getri.hpp" #include "testing_getri_npvt.hpp" #include "testing_getri_npvt_outofplace.hpp" #include "testing_getri_outofplace.hpp" #include "testing_getrs.hpp" #include "testing_labrd.hpp" #include "testing_lacgv.hpp" #include "testing_larf.hpp" #include "testing_larfb.hpp" #include "testing_larfg.hpp" #include "testing_larft.hpp" #include "testing_laswp.hpp" #include "testing_lasyf.hpp" #include "testing_latrd.hpp" #include "testing_lauum.hpp" #include "testing_orgbr_ungbr.hpp" #include "testing_orglx_unglx.hpp" #include "testing_orgtr_ungtr.hpp" #include "testing_orgxl_ungxl.hpp" #include "testing_orgxr_ungxr.hpp" #include "testing_ormbr_unmbr.hpp" #include "testing_ormlx_unmlx.hpp" #include "testing_ormtr_unmtr.hpp" #include "testing_ormxl_unmxl.hpp" #include "testing_ormxr_unmxr.hpp" #include "testing_posv.hpp" #include "testing_potf2_potrf.hpp" #include "testing_potri.hpp" #include "testing_potrs.hpp" #include "testing_stebz.hpp" #include "testing_stedc.hpp" #include "testing_stein.hpp" 
#include "testing_steqr.hpp"
#include "testing_sterf.hpp"
#include "testing_syev_heev.hpp"
#include "testing_syevd_heevd.hpp"
#include "testing_syevj_heevj.hpp"
#include "testing_syevx_heevx.hpp"
#include "testing_sygsx_hegsx.hpp"
#include "testing_sygv_hegv.hpp"
#include "testing_sygvd_hegvd.hpp"
#include "testing_sygvj_hegvj.hpp"
#include "testing_sygvx_hegvx.hpp"
#include "testing_sytf2_sytrf.hpp"
#include "testing_sytxx_hetxx.hpp"
#include "testing_trtri.hpp"

// NOTE(review): throughout this span, text that was enclosed in angle brackets
// (template parameter lists, template arguments, system header names) appears to
// have been lost from this copy. The code tokens are left exactly as found; restore
// the missing pieces from upstream before compiling.

// Orders C strings by their contents (strcmp) rather than by pointer value.
struct str_less
{
    bool operator()(const char* a, const char* b) const
    {
        return strcmp(a, b) < 0;
    }
};

// Map from const char* to function taking const Arguments& using comparison above
using func_map = std::map;

// Function dispatcher for rocSOLVER tests
class rocsolver_dispatcher
{
    // Looks `name` up in the table of functions available in every precision.
    // On a match the mapped test function is called with `argus` and
    // rocblas_status_success is returned; otherwise rocblas_status_invalid_value is
    // returned and nothing is run.
    template
    static rocblas_status run_function(const char* name, Arguments& argus)
    {
        // Map for functions that support all precisions
        static const func_map map = {
            {"laswp", testing_laswp},
            {"larfg", testing_larfg},
            {"larf", testing_larf},
            {"larft", testing_larft},
            {"larfb", testing_larfb},
            {"latrd", testing_latrd},
            {"labrd", testing_labrd},
            {"bdsqr", testing_bdsqr},
            {"steqr", testing_steqr},
            {"stedc", testing_stedc},
            {"stein", testing_stein},
            {"lasyf", testing_lasyf},
            // potrf
            {"potf2", testing_potf2_potrf},
            {"potf2_batched", testing_potf2_potrf},
            {"potf2_strided_batched", testing_potf2_potrf},
            {"potrf", testing_potf2_potrf},
            {"potrf_batched", testing_potf2_potrf},
            {"potrf_strided_batched", testing_potf2_potrf},
            // potrs
            {"potrs", testing_potrs},
            {"potrs_batched", testing_potrs},
            {"potrs_strided_batched", testing_potrs},
            // posv
            {"posv", testing_posv},
            {"posv_batched", testing_posv},
            {"posv_strided_batched", testing_posv},
            // potri
            {"potri", testing_potri},
            {"potri_batched", testing_potri},
            {"potri_strided_batched", testing_potri},
            // getrf_npvt
            {"getf2_npvt", testing_getf2_getrf_npvt},
            {"getf2_npvt_batched", testing_getf2_getrf_npvt},
            {"getf2_npvt_strided_batched", testing_getf2_getrf_npvt},
            {"getrf_npvt", testing_getf2_getrf_npvt},
            {"getrf_npvt_batched", testing_getf2_getrf_npvt},
            {"getrf_npvt_strided_batched", testing_getf2_getrf_npvt},
            // getrf
            {"getf2", testing_getf2_getrf},
            {"getf2_batched", testing_getf2_getrf},
            {"getf2_strided_batched", testing_getf2_getrf},
            {"getrf", testing_getf2_getrf},
            {"getrf_batched", testing_getf2_getrf},
            {"getrf_strided_batched", testing_getf2_getrf},
            // geqrf
            {"geqr2", testing_geqr2_geqrf},
            {"geqr2_batched", testing_geqr2_geqrf},
            {"geqr2_strided_batched", testing_geqr2_geqrf},
            {"geqrf", testing_geqr2_geqrf},
            {"geqrf_batched", testing_geqr2_geqrf},
            {"geqrf_strided_batched", testing_geqr2_geqrf},
            {"geqrf_ptr_batched", testing_geqr2_geqrf},
            // gerqf
            {"gerq2", testing_gerq2_gerqf},
            {"gerq2_batched", testing_gerq2_gerqf},
            {"gerq2_strided_batched", testing_gerq2_gerqf},
            {"gerqf", testing_gerq2_gerqf},
            {"gerqf_batched", testing_gerq2_gerqf},
            {"gerqf_strided_batched", testing_gerq2_gerqf},
            // geqlf
            {"geql2", testing_geql2_geqlf},
            {"geql2_batched", testing_geql2_geqlf},
            {"geql2_strided_batched", testing_geql2_geqlf},
            {"geqlf", testing_geql2_geqlf},
            {"geqlf_batched", testing_geql2_geqlf},
            {"geqlf_strided_batched", testing_geql2_geqlf},
            // gelqf
            {"gelq2", testing_gelq2_gelqf},
            {"gelq2_batched", testing_gelq2_gelqf},
            {"gelq2_strided_batched", testing_gelq2_gelqf},
            {"gelqf", testing_gelq2_gelqf},
            {"gelqf_batched", testing_gelq2_gelqf},
            {"gelqf_strided_batched", testing_gelq2_gelqf},
            // getrs
            {"getrs", testing_getrs},
            {"getrs_batched", testing_getrs},
            {"getrs_strided_batched", testing_getrs},
            // gesv
            {"gesv", testing_gesv},
            {"gesv_batched", testing_gesv},
            {"gesv_strided_batched", testing_gesv},
            // gesvd
            {"gesvd", testing_gesvd},
            {"gesvd_batched", testing_gesvd},
            {"gesvd_strided_batched", testing_gesvd},
            // gesvdj
            {"gesvdj", testing_gesvdj},
            {"gesvdj_batched", testing_gesvdj},
            {"gesvdj_strided_batched", testing_gesvdj},
            // gesvdx
            {"gesvdx", testing_gesvdx},
            {"gesvdx_batched", testing_gesvdx},
            {"gesvdx_strided_batched", testing_gesvdx},
            // trtri
            {"trtri", testing_trtri},
            {"trtri_batched", testing_trtri},
            {"trtri_strided_batched", testing_trtri},
            // getri
            {"getri", testing_getri},
            {"getri_batched", testing_getri},
            {"getri_strided_batched", testing_getri},
            // getri_npvt
            {"getri_npvt", testing_getri_npvt},
            {"getri_npvt_batched", testing_getri_npvt},
            {"getri_npvt_strided_batched", testing_getri_npvt},
            // getri_outofplace
            {"getri_outofplace", testing_getri_outofplace},
            {"getri_outofplace_batched", testing_getri_outofplace},
            {"getri_outofplace_strided_batched", testing_getri_outofplace},
            // getri_npvt_outofplace
            {"getri_npvt_outofplace", testing_getri_npvt_outofplace},
            {"getri_npvt_outofplace_batched", testing_getri_npvt_outofplace},
            {"getri_npvt_outofplace_strided_batched", testing_getri_npvt_outofplace},
            // gels
            {"gels", testing_gels},
            {"gels_batched", testing_gels},
            {"gels_strided_batched", testing_gels},
            // gebrd
            {"gebd2", testing_gebd2_gebrd},
            {"gebd2_batched", testing_gebd2_gebrd},
            {"gebd2_strided_batched", testing_gebd2_gebrd},
            {"gebrd", testing_gebd2_gebrd},
            {"gebrd_batched", testing_gebd2_gebrd},
            {"gebrd_strided_batched", testing_gebd2_gebrd},
            // sytrf
            {"sytf2", testing_sytf2_sytrf},
            {"sytf2_batched", testing_sytf2_sytrf},
            {"sytf2_strided_batched", testing_sytf2_sytrf},
            {"sytrf", testing_sytf2_sytrf},
            {"sytrf_batched", testing_sytf2_sytrf},
            {"sytrf_strided_batched", testing_sytf2_sytrf},
            // geblttrf_npvt
            {"geblttrf_npvt", testing_geblttrf_npvt},
            {"geblttrf_npvt_batched", testing_geblttrf_npvt},
            {"geblttrf_npvt_strided_batched", testing_geblttrf_npvt},
            // geblttrs_npvt
            {"geblttrs_npvt", testing_geblttrs_npvt},
            {"geblttrs_npvt_batched", testing_geblttrs_npvt},
            {"geblttrs_npvt_strided_batched", testing_geblttrs_npvt},
        };

        // Grab function from the map and execute
        auto match = map.find(name);
        if(match != map.end())
        {
            match->second(argus);
            return rocblas_status_success;
        }
        else
            return rocblas_status_invalid_value;
    }

    // Same lookup-and-run contract as run_function, over the table of functions
    // that exist only for the real precisions.
    // NOTE(review): the constraint that distinguishes this overload from the next
    // one is missing from the template header in this copy.
    template , int> = 0>
    static rocblas_status run_function_limited_precision(const char* name, Arguments& argus)
    {
        // Map for functions that support only single and double precisions
        static const func_map map_real = {
            {"sterf", testing_sterf},
            {"stebz", testing_stebz},
            {"bdsvdx", testing_bdsvdx},
            {"lauum", testing_lauum},
            // orgxx
            {"org2r", testing_orgxr_ungxr},
            {"orgqr", testing_orgxr_ungxr},
            {"org2l", testing_orgxl_ungxl},
            {"orgql", testing_orgxl_ungxl},
            {"orgl2", testing_orglx_unglx},
            {"orglq", testing_orglx_unglx},
            {"orgbr", testing_orgbr_ungbr},
            {"orgtr", testing_orgtr_ungtr},
            // ormxx
            {"orm2r", testing_ormxr_unmxr},
            {"ormqr", testing_ormxr_unmxr},
            {"orm2l", testing_ormxl_unmxl},
            {"ormql", testing_ormxl_unmxl},
            {"orml2", testing_ormlx_unmlx},
            {"ormlq", testing_ormlx_unmlx},
            {"ormbr", testing_ormbr_unmbr},
            {"ormtr", testing_ormtr_unmtr},
            // sytrd
            {"sytd2", testing_sytxx_hetxx},
            {"sytd2_batched", testing_sytxx_hetxx},
            {"sytd2_strided_batched", testing_sytxx_hetxx},
            {"sytrd", testing_sytxx_hetxx},
            {"sytrd_batched", testing_sytxx_hetxx},
            {"sytrd_strided_batched", testing_sytxx_hetxx},
            // sygst
            {"sygs2", testing_sygsx_hegsx},
            {"sygs2_batched", testing_sygsx_hegsx},
            {"sygs2_strided_batched", testing_sygsx_hegsx},
            {"sygst", testing_sygsx_hegsx},
            {"sygst_batched", testing_sygsx_hegsx},
            {"sygst_strided_batched", testing_sygsx_hegsx},
            // syev
            {"syev", testing_syev_heev},
            {"syev_batched", testing_syev_heev},
            {"syev_strided_batched", testing_syev_heev},
            // syevd
            {"syevd", testing_syevd_heevd},
            {"syevd_batched", testing_syevd_heevd},
            {"syevd_strided_batched", testing_syevd_heevd},
            // syevj
            {"syevj", testing_syevj_heevj},
            {"syevj_batched", testing_syevj_heevj},
            {"syevj_strided_batched", testing_syevj_heevj},
            // syevx
            {"syevx", testing_syevx_heevx},
            {"syevx_batched", testing_syevx_heevx},
            {"syevx_strided_batched", testing_syevx_heevx},
            // sygv
            {"sygv", testing_sygv_hegv},
            {"sygv_batched", testing_sygv_hegv},
            {"sygv_strided_batched", testing_sygv_hegv},
            // sygvd
            {"sygvd", testing_sygvd_hegvd},
            {"sygvd_batched", testing_sygvd_hegvd},
            {"sygvd_strided_batched", testing_sygvd_hegvd},
            // sygvj
            {"sygvj", testing_sygvj_hegvj},
            {"sygvj_batched", testing_sygvj_hegvj},
            {"sygvj_strided_batched", testing_sygvj_hegvj},
            // sygvx
            {"sygvx", testing_sygvx_hegvx},
            {"sygvx_batched", testing_sygvx_hegvx},
            {"sygvx_strided_batched", testing_sygvx_hegvx},
        };

        // Grab function from the map and execute
        auto match = map_real.find(name);
        if(match != map_real.end())
        {
            match->second(argus);
            return rocblas_status_success;
        }
        else
            return rocblas_status_invalid_value;
    }

    // Same lookup-and-run contract as run_function, over the table of functions
    // that exist only for the complex precisions.
    template , int> = 0>
    static rocblas_status run_function_limited_precision(const char* name, Arguments& argus)
    {
        // Map for functions that support only single-complex and double-complex precisions
        static const func_map map_complex = {
            {"lacgv", testing_lacgv},
            // ungxx
            {"ung2r", testing_orgxr_ungxr},
            {"ungqr", testing_orgxr_ungxr},
            {"ung2l", testing_orgxl_ungxl},
            {"ungql", testing_orgxl_ungxl},
            {"ungl2", testing_orglx_unglx},
            {"unglq", testing_orglx_unglx},
            {"ungbr", testing_orgbr_ungbr},
            {"ungtr", testing_orgtr_ungtr},
            // unmxx
            {"unm2r", testing_ormxr_unmxr},
            {"unmqr", testing_ormxr_unmxr},
            {"unm2l", testing_ormxl_unmxl},
            {"unmql", testing_ormxl_unmxl},
            {"unml2", testing_ormlx_unmlx},
            {"unmlq", testing_ormlx_unmlx},
            {"unmbr", testing_ormbr_unmbr},
            {"unmtr", testing_ormtr_unmtr},
            // hetrd
            {"hetd2", testing_sytxx_hetxx},
            {"hetd2_batched", testing_sytxx_hetxx},
            {"hetd2_strided_batched", testing_sytxx_hetxx},
            {"hetrd", testing_sytxx_hetxx},
            {"hetrd_batched", testing_sytxx_hetxx},
            {"hetrd_strided_batched", testing_sytxx_hetxx},
            // hegst
            {"hegs2", testing_sygsx_hegsx},
            {"hegs2_batched", testing_sygsx_hegsx},
            {"hegs2_strided_batched", testing_sygsx_hegsx},
            {"hegst", testing_sygsx_hegsx},
            {"hegst_batched", testing_sygsx_hegsx},
            {"hegst_strided_batched", testing_sygsx_hegsx},
            // heev
            {"heev", testing_syev_heev},
            {"heev_batched", testing_syev_heev},
            {"heev_strided_batched", testing_syev_heev},
            // heevd
            {"heevd", testing_syevd_heevd},
            {"heevd_batched", testing_syevd_heevd},
            {"heevd_strided_batched", testing_syevd_heevd},
            // heevj
            {"heevj", testing_syevj_heevj},
            {"heevj_batched", testing_syevj_heevj},
            {"heevj_strided_batched", testing_syevj_heevj},
            // heevx
            {"heevx", testing_syevx_heevx},
            {"heevx_batched", testing_syevx_heevx},
            {"heevx_strided_batched", testing_syevx_heevx},
            // hegv
            {"hegv", testing_sygv_hegv},
            {"hegv_batched", testing_sygv_hegv},
            {"hegv_strided_batched", testing_sygv_hegv},
            // hegvd
            {"hegvd", testing_sygvd_hegvd},
            {"hegvd_batched", testing_sygvd_hegvd},
            {"hegvd_strided_batched", testing_sygvd_hegvd},
            // hegvj
            {"hegvj", testing_sygvj_hegvj},
            {"hegvj_batched", testing_sygvj_hegvj},
            {"hegvj_strided_batched", testing_sygvj_hegvj},
            // hegvx
            {"hegvx", testing_sygvx_hegvx},
            {"hegvx_batched", testing_sygvx_hegvx},
            {"hegvx_strided_batched", testing_sygvx_hegvx},
        };

        // Grab function from the map and execute
        auto match = map_complex.find(name);
        if(match != map_complex.end())
        {
            match->second(argus);
            return rocblas_status_success;
        }
        else
            return rocblas_status_invalid_value;
    }

public:
    // Runs the test function registered under `name` for the precision selected by
    // `precision` ('s', 'd', 'c' or 'z'); any other character throws
    // std::invalid_argument. The all-precision table is tried first; only when it
    // reports rocblas_status_invalid_value is the limited-precision table tried.
    // If neither table knows the name, std::invalid_argument is thrown with the
    // offending function/precision pair.
    // NOTE(review): the four branches of each chain read identically here because
    // the explicit template arguments are missing from this copy.
    static void invoke(const std::string& name, char precision, Arguments& argus)
    {
        rocblas_status status;

        if(precision == 's')
            status = run_function(name.c_str(), argus);
        else if(precision == 'd')
            status = run_function(name.c_str(), argus);
        else if(precision == 'c')
            status = run_function(name.c_str(), argus);
        else if(precision == 'z')
            status = run_function(name.c_str(), argus);
        else
            throw std::invalid_argument("Invalid value for --precision");

        // not in the all-precision table: fall back to the real-only/complex-only table
        if(status == rocblas_status_invalid_value)
        {
            if(precision == 's')
                status = run_function_limited_precision(name.c_str(), argus);
            else if(precision == 'd')
                status = run_function_limited_precision(name.c_str(), argus);
            else if(precision == 'c')
                status = run_function_limited_precision(name.c_str(), argus);
            else if(precision == 'z')
                status = run_function_limited_precision(name.c_str(), argus);
        }

        if(status == rocblas_status_invalid_value)
        {
            std::string msg = "Invalid combination --function ";
            msg += name;
            msg += " --precision ";
            msg += precision;
            throw std::invalid_argument(msg);
        }
    }
};
rocSOLVER-rocm-5.5.1/clients/include/rocsolver_test.hpp000066400000000000000000000062751436600607200230660ustar00rootroot00000000000000/* ************************************************************************
 * Copyright (c) 2018-2022 Advanced Micro Devices, Inc.
 * ************************************************************************ */

#pragma once

#include
#include
#include
#include
#include
#include
#include

// If USE_ROCBLAS_REALLOC_ON_DEMAND is false, automatic reallocation is disabled and we will manually
// reallocate workspace
#define USE_ROCBLAS_REALLOC_ON_DEMAND true

// In test builds the check asserts max_error <= tol * get_epsilon(); in bench builds
// it expands to nothing.
#ifdef ROCSOLVER_CLIENTS_TEST
#define ROCSOLVER_TEST_CHECK(T, max_error, tol) ASSERT_LE((max_error), (tol)*get_epsilon())
#else // ROCSOLVER_CLIENTS_BENCH
#define ROCSOLVER_TEST_CHECK(T, max_error, tol)
#endif

// Reasons for which rocsolver_bench_inform reports that there is nothing to measure.
typedef enum rocsolver_inform_type_
{
    inform_quick_return,
    inform_invalid_size,
    inform_invalid_args,
    inform_mem_query,
} rocsolver_inform_type;

// Prints the message matching `it`, followed by two fixed lines stating that no
// performance data or verification is available, then flushes stdout.
// `arg` is only used for inform_mem_query, where it is printed as a byte count.
inline void rocsolver_bench_inform(rocsolver_inform_type it, size_t arg = 0)
{
    switch(it)
    {
    case inform_quick_return: fmt::print("Quick return...\n"); break;
    case inform_invalid_size: fmt::print("Invalid size arguments...\n"); break;
    case inform_invalid_args: fmt::print("Invalid value in arguments...\n"); break;
    case inform_mem_query:
        fmt::print("{} bytes of device memory are required...\n", arg);
        break;
    }
    fmt::print("No performance data to collect.\n");
    fmt::print("No computations to verify.\n");
    std::fflush(stdout);
}

// recursive format function (base case)
inline void format_bench_table(std::string&) {}

// recursive format function
// Appends `arg` formatted with "{:<15}", adds a single space when more arguments
// follow, then recurses on the remaining arguments.
template
inline void format_bench_table(std::string& str, T arg, Ts... args)
{
    str += fmt::format("{:<15}", arg);
    if(sizeof...(Ts) > 0)
        str += ' ';
    format_bench_table(str, args...);
}

// Formats all arguments as one table row, writes it to stdout with std::puts and
// flushes.
template
void rocsolver_bench_output(Ts... args)
{
    std::string table_row;
    format_bench_table(table_row, args...);
    std::puts(table_row.c_str());
    std::fflush(stdout);
}

// Prints `title` between two rules of 44 '=' characters, preceded by a blank line.
inline void rocsolver_bench_header(const char* title)
{
    fmt::print("\n{:=<44}\n{}\n{:=<44}\n", "", title, "");
}

// Writes a newline to stdout and flushes.
inline void rocsolver_bench_endl()
{
    std::putc('\n', stdout);
    std::fflush(stdout);
}

// Two overloads of sconj: this one returns the scalar unchanged, the next returns
// std::conj(scalar).
// NOTE(review): the constraints selecting between them are missing from the
// template headers in this copy; presumably real vs. complex T -- confirm upstream.
template , int> = 0>
inline T sconj(T scalar)
{
    return scalar;
}

template , int> = 0>
inline T sconj(T scalar)
{
    return std::conj(scalar);
}

// A struct implicitly convertible to and from char, used so we can customize Google Test
// output for LAPACK char arguments without affecting the default char output.
class printable_char
{
    char value;

public:
    // Throws std::invalid_argument when c is below 0x20 or at/above 0x7F.
    printable_char(char c)
        : value(c)
    {
        if(c < 0x20 || c >= 0x7F)
            throw std::invalid_argument(fmt::format(
                "printable_char must be a printable ASCII character (received {:#x})", c));
    }

    operator char() const
    {
        return value;
    }
};

// gtest printers
inline std::ostream& operator<<(std::ostream& os, rocblas_status x)
{
    return os << rocblas_status_to_string(x);
}

inline std::ostream& operator<<(std::ostream& os, printable_char x)
{
    return os << char(x);
}
rocSOLVER-rocm-5.5.1/clients/include/testing_bdsqr.hpp000066400000000000000000000571501436600607200226570ustar00rootroot00000000000000/* ************************************************************************
 * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void bdsqr_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nv, const rocblas_int nu, const rocblas_int nc, S dD, S dE, T dV, const rocblas_int ldv, T dU, const rocblas_int ldu, T dC, const rocblas_int ldc, rocblas_int* dinfo) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_bdsqr(nullptr, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dinfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, rocblas_fill_full, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dinfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, (S) nullptr, dE, dV, ldv, dU, ldu, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD, (S) nullptr, dV, ldv, dU, ldu, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD, dE, (T) nullptr, ldv, dU, ldu, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, (T) nullptr, ldu, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, (T) nullptr, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, 0, nv, nu, nc, (S) nullptr, (S) nullptr, (T) nullptr, ldv, (T) nullptr, 
ldu, (T) nullptr, ldc, dinfo), rocblas_status_success); } template void testing_bdsqr_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 2; rocblas_int nv = 2; rocblas_int nu = 2; rocblas_int nc = 2; rocblas_int ldv = 2; rocblas_int ldu = 2; rocblas_int ldc = 2; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments bdsqr_checkBadArgs(handle, uplo, n, nv, nu, nc, dD.data(), dE.data(), dV.data(), ldv, dU.data(), ldu, dC.data(), ldc, dinfo.data()); } template void bdsqr_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nv, const rocblas_int nu, const rocblas_int nc, Sd& dD, Sd& dE, Td& dV, const rocblas_int ldv, Td& dU, const rocblas_int ldu, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hV, Th& hU, Th& hC, Uh& hInfo, std::vector& D, std::vector& E) { if(CPU) { rocblas_init(hD, true); rocblas_init(hE, false); // Adding possible gaps to fully test the algorithm. for(rocblas_int i = 0; i < n - 1; ++i) { hE[0][i] -= 5; hD[0][i] -= 4; } hD[0][n - 1] -= 4; // (Forcing non-convergence expecting lapack and rocsolver to give // the same orthogonal equivalent matrix is not possible. Testing // implicitly the equivalent matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). 
// make copy of original data to test vectors if required if(nv || nu || nc) { for(rocblas_int i = 0; i < nv - 1; ++i) { E[i] = hE[0][i]; D[i] = hD[0][i]; } D[nv - 1] = hD[0][nv - 1]; } // make V,U and C identities so that results are actually singular vectors // of B if(nv > 0) { memset(hV[0], 0, ldv * nv * sizeof(T)); for(rocblas_int i = 0; i < min(n, nv); ++i) hV[0][i + i * ldv] = T(1.0); } if(nu > 0) { memset(hU[0], 0, ldu * n * sizeof(T)); for(rocblas_int i = 0; i < min(n, nu); ++i) hU[0][i + i * ldu] = T(1.0); } if(nc > 0) { memset(hC[0], 0, ldc * nc * sizeof(T)); for(rocblas_int i = 0; i < min(n, nc); ++i) hC[0][i + i * ldc] = T(1.0); } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); if(nv > 0) CHECK_HIP_ERROR(dV.transfer_from(hV)); if(nu > 0) CHECK_HIP_ERROR(dU.transfer_from(hU)); if(nc > 0) CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void bdsqr_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nv, const rocblas_int nu, const rocblas_int nc, Sd& dD, Sd& dE, Td& dV, const rocblas_int ldv, Td& dU, const rocblas_int ldu, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hDRes, Sh& hE, Sh& hERes, Th& hV, Th& hU, Th& hC, Uh& hInfo, Uh& hInfoRes, double* max_err, double* max_errv) { using S = decltype(std::real(T{})); std::vector hW(4 * n); std::vector D(nv); std::vector E(nv); // input data initialization bdsqr_initData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E); // execute computations // CPU lapack cpu_bdsqr(uplo, n, nv, nu, nc, hD[0], hE[0], hV[0], ldv, hU[0], ldu, hC[0], ldc, hW.data(), hInfo[0]); // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD.data(), dE.data(), dV.data(), ldv, dU.data(), ldu, dC.data(), ldc, dInfo.data())); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hERes.transfer_from(dE)); 
CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(nv > 0) CHECK_HIP_ERROR(hV.transfer_from(dV)); if(nu > 0) CHECK_HIP_ERROR(hU.transfer_from(dU)); if(nc > 0) CHECK_HIP_ERROR(hC.transfer_from(dC)); // Check info for non-covergence *max_err = 0; if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; // (We expect the used input matrices to always converge. Testing // implicitely the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). // error is ||hD - hDRes|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) double err; T tmp; *max_errv = 0; err = norm_error('F', 1, n, 1, hD[0], hDRes[0]); *max_err = err > *max_err ? err : *max_err; // Check the singular vectors if required if(hInfo[0][0] == 0 && (nv || nu || nc)) { err = 0; if(uplo == rocblas_fill_upper) { // check singular vectors implicitely (A'*u_i = s_i*v_i) for(rocblas_int i = 0; i < nv; ++i) { for(rocblas_int j = 0; j < n; ++j) { if(i > 0) tmp = D[i] * hU[0][i + j * ldu] + E[i - 1] * hU[0][(i - 1) + j * ldu] - hDRes[0][j] * hV[0][j + i * ldv]; else tmp = D[i] * hU[0][i + j * ldu] - hDRes[0][j] * hV[0][j + i * ldv]; err += std::abs(tmp) * std::abs(tmp); } } } else { // check singular vectors implicitely (A*v_i = s_i*u_i) for(rocblas_int i = 0; i < nv; ++i) { for(rocblas_int j = 0; j < n; ++j) { if(i > 0) tmp = D[i] * hV[0][j + i * ldv] + E[i - 1] * hV[0][j + (i - 1) * ldv] - hDRes[0][j] * hU[0][i + j * ldu]; else tmp = D[i] * hV[0][j + i * ldv] - hDRes[0][j] * hU[0][i + j * ldu]; err += std::abs(tmp) * std::abs(tmp); } } } double normD = double(snorm('F', 1, n, D.data(), 1)); double normE = double(snorm('F', 1, n - 1, E.data(), 1)); err = std::sqrt(err) / std::sqrt(normD * normD + normE * normE); *max_errv = err > *max_errv ? 
err : *max_errv; // C should be the transpose of U if(nc) { err = 0; for(rocblas_int i = 0; i < nv; ++i) { for(rocblas_int j = 0; j < n; ++j) { tmp = hC[0][j + i * ldc] - hU[0][i + j * ldu]; err += std::abs(tmp) * std::abs(tmp); } } err = std::sqrt(err); *max_errv = err > *max_errv ? err : *max_errv; } } } template void bdsqr_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nv, const rocblas_int nu, const rocblas_int nc, Sd& dD, Sd& dE, Td& dV, const rocblas_int ldv, Td& dU, const rocblas_int ldu, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hV, Th& hU, Th& hC, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { using S = decltype(std::real(T{})); std::vector hW(4 * n); std::vector D(nv); std::vector E(nv); if(!perf) { bdsqr_initData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_bdsqr(uplo, n, nv, nu, nc, hD[0], hE[0], hV[0], ldv, hU[0], ldu, hC[0], ldc, hW.data(), hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } bdsqr_initData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E); // cold calls for(int iter = 0; iter < 2; iter++) { bdsqr_initData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E); CHECK_ROCBLAS_ERROR(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD.data(), dE.data(), dV.data(), ldv, dU.data(), ldu, dC.data(), ldc, dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else 
rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { bdsqr_initData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E); start = get_time_us_sync(stream); rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD.data(), dE.data(), dV.data(), ldv, dU.data(), ldu, dC.data(), ldc, dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_bdsqr(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int nv = argus.get("nv"); rocblas_int nu = argus.get("nu"); rocblas_int nc = argus.get("nc"); rocblas_int ldv = argus.get("ldv", nv > 0 ? n : 1); rocblas_int ldu = argus.get("ldu", nu); rocblas_int ldc = argus.get("ldc", nc > 0 ? n : 1); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; // size for testing singular vectors rocblas_int nT, nvT = 0, nuT = 0, ncT = 0, lduT = 1, ldcT = 1, ldvT = 1; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldv, (T*)nullptr, ldu, (T*)nullptr, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes // (TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY (NOT EXPLICITLY COMPARING // WITH LAPACK) // SO, WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF THE RIGHT AND // LEFT VECTORS) if(nc) { nT = min(n, max(nc, max(nu, nv))); nuT = nT; nvT = nT; ncT = nT; ldvT = n; ldcT = n; lduT = nT; } else if(nv || nu) { nT = min(n, max(nv, nu)); nuT = nT; nvT = nT; lduT = nT; ldvT = n; } // E, V, U, and C could have size zero in cases that are not 
quick-return or // invalid cases setting the size to one to avoid possible memory-access // errors in the rest of the unit test size_t size_D = size_t(n); size_t size_E = n > 1 ? size_t(n - 1) : 1; size_t size_V = max(size_t(ldv) * nv, 1); size_t size_U = max(size_t(ldu) * n, 1); size_t size_C = max(size_t(ldc) * nc, 1); size_t size_VT = max(size_t(ldvT) * nvT, 1); size_t size_UT = max(size_t(lduT) * n, 1); size_t size_CT = max(size_t(ldcT) * ncT, 1); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || nv < 0 || nu < 0 || nc < 0 || ldu < nu || ldv < 1 || ldc < 1) || (nv > 0 && ldv < n) || (nc > 0 && ldc < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldv, (T*)nullptr, ldu, (T*)nullptr, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldv, (T*)nullptr, ldu, (T*)nullptr, ldc, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // 
check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD.data(), dE.data(), (T*)nullptr, ldv, (T*)nullptr, ldu, (T*)nullptr, ldc, dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { host_strided_batch_vector hDRes(size_D, 1, size_D, 1); host_strided_batch_vector hERes(size_E, 1, size_E, 1); host_strided_batch_vector hV(size_VT, 1, size_VT, 1); host_strided_batch_vector hU(size_UT, 1, size_UT, 1); host_strided_batch_vector hC(size_CT, 1, size_CT, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); device_strided_batch_vector dV(size_VT, 1, size_VT, 1); device_strided_batch_vector dU(size_UT, 1, size_UT, 1); device_strided_batch_vector dC(size_CT, 1, size_CT, 1); if(size_VT) CHECK_HIP_ERROR(dV.memcheck()); if(size_UT) CHECK_HIP_ERROR(dU.memcheck()); if(size_CT) CHECK_HIP_ERROR(dC.memcheck()); bdsqr_getError(handle, uplo, n, nvT, nuT, ncT, dD, dE, dV, ldvT, dU, lduT, dC, ldcT, dInfo, hD, hDRes, hE, hERes, hV, hU, hC, hInfo, hInfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { host_strided_batch_vector hV(size_V, 1, size_V, 1); host_strided_batch_vector hU(size_U, 1, size_U, 1); host_strided_batch_vector hC(size_C, 1, size_C, 1); device_strided_batch_vector dV(size_V, 1, size_V, 1); device_strided_batch_vector dU(size_U, 1, size_U, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); bdsqr_getPerfData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * 
n); if(nv || nu || nc) ROCSOLVER_TEST_CHECK(T, max_errorv, 2 * n); } // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { if(nv || nu || nc) max_error = (max_error >= max_errorv) ? max_error : max_errorv; rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "n", "nv", "nu", "nc", "ldv", "ldu", "ldc"); rocsolver_bench_output(uploC, n, nv, nu, nc, ldv, ldu, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_BDSQR(...) extern template void testing_bdsqr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_BDSQR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_bdsvdx.hpp000066400000000000000000000542631436600607200230400ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void bdsvdx_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_svect svect, const rocblas_srange srange, const rocblas_int n, U dD, U dE, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, rocblas_int* dNsv, U dS, U dZ, const rocblas_int ldz, rocblas_int* dIfail, rocblas_int* dInfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(nullptr, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, rocblas_fill_full, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, rocblas_svect_all, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, rocblas_srange(0), n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, (U) nullptr, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, (U) nullptr, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, (rocblas_int*)nullptr, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, (U) nullptr, dZ, ldz, 
dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, (U) nullptr, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, (rocblas_int*)nullptr, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, (rocblas_int*)nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, 0, (U) nullptr, (U) nullptr, vl, vu, il, iu, dNsv, (U) nullptr, (U) nullptr, ldz, (rocblas_int*)nullptr, dInfo), rocblas_status_success); } template void testing_bdsvdx_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 2; rocblas_fill uplo = rocblas_fill_upper; rocblas_svect svect = rocblas_svect_singular; rocblas_srange srange = rocblas_srange_all; rocblas_int ldz = 4; T vl = 0; T vu = 0; rocblas_int il = 0; rocblas_int iu = 0; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dZ(1, 1, 1, 1); device_strided_batch_vector dNsv(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dNsv.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments bdsvdx_checkBadArgs(handle, uplo, svect, srange, n, dD.data(), dE.data(), vl, vu, il, iu, dNsv.data(), dS.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()); } template void bdsvdx_initData(const rocblas_handle handle, const rocblas_int n, Td& dD, Td& dE, Th& 
hD, Th& hE) { if(CPU) { rocblas_init(hD, true); rocblas_init(hE, true); // scale matrix and add fixed splits in the matrix to test split handling // (scaling ensures that all singular values are in [0, 20]) for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 10; hE[0][i] = (hE[0][i] - 5) / 10; if(i == n / 4 || i == n / 2 || i == n - 1) hE[0][i] = 0; if(i == n / 7 || i == n / 5 || i == n / 3) hD[0][i] *= -1; } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); } } template void bdsvdx_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_svect svect, const rocblas_srange srange, const rocblas_int n, Td& dD, Td& dE, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, Ud& dNsv, Td& dS, Td& dZ, const rocblas_int ldz, Ud& dIfail, Ud& dInfo, Th& hD, Th& hE, Uh& hNsv, Uh& hNsvRes, Th& hS, Th& hSRes, Th& hZ, Th& hZRes, Uh& hIfailRes, Uh& hInfo, Uh& hInfoRes, double* max_err) { std::vector work(14 * n); std::vector iwork(12 * n); // input data initialization bdsvdx_initData(handle, n, dD, dE, hD, hE); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD.data(), dE.data(), vl, vu, il, iu, dNsv.data(), dS.data(), dZ.data(), ldz, dIfail.data(), dInfo.data())); CHECK_HIP_ERROR(hNsvRes.transfer_from(dNsv)); CHECK_HIP_ERROR(hSRes.transfer_from(dS)); CHECK_HIP_ERROR(hZRes.transfer_from(dZ)); CHECK_HIP_ERROR(hIfailRes.transfer_from(dIfail)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack // WORKAROUND: For some test cases, LAPACK's bdsvdx is returning incorrect singular values // when srange is rocblas_srange_index. In this case, we use rocblas_srange_all to get // all the singular values and offset and use il as an offset into the result array. 
rocblas_int ioffset = 0; if(srange == rocblas_srange_index) { cpu_bdsvdx(uplo, rocblas_svect_none, rocblas_srange_all, n, hD[0], hE[0], vl, vu, il, iu, hNsv[0], hS[0], hZ[0], ldz, work.data(), iwork.data(), hInfo[0]); ioffset = il - 1; hNsv[0][0] = iu - il + 1; } else { cpu_bdsvdx(uplo, rocblas_svect_none, srange, n, hD[0], hE[0], vl, vu, il, iu, hNsv[0], hS[0], hZ[0], ldz, work.data(), iwork.data(), hInfo[0]); } // check info if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; else *max_err = 0; // if finding singular values succeded, check values double err; if(hInfoRes[0][0] == 0) { // check number of computed singular values rocblas_int nn = hNsvRes[0][0]; *max_err += std::abs(nn - hNsv[0][0]); // error is ||hS - hSRes|| / ||hS|| // using frobenius norm err = norm_error('F', 1, nn, 1, hS[0] + ioffset, hSRes[0]); *max_err = err > *max_err ? err : *max_err; // Check the singular vectors if required // U is stored in hZRes, and V is stored in hZRes+n if(svect != rocblas_svect_none) { err = 0; // form bidiagonal matrix B std::vector B(n * n); for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) B[i + j * n] = hD[0][i]; else if(i + 1 == j && uplo == rocblas_fill_upper) B[i + j * n] = hE[0][i]; else if(i == j + 1 && uplo == rocblas_fill_lower) B[i + j * n] = hE[0][j]; else B[i + j * n] = 0; } } // check singular vectors implicitly (B*v_k = s_k*u_k) for(rocblas_int k = 0; k < nn; ++k) { cpu_gemv(rocblas_operation_none, n, n, T(1), B.data(), n, hZRes[0] + n + k * ldz, 1, -hSRes[0][k], hZRes[0] + k * ldz, 1); } err = double(snorm('F', n, nn, hZRes[0], ldz)) / double(snorm('F', n, n, B.data(), n)); *max_err = err > *max_err ? err : *max_err; // check ifail err = 0; for(int j = 0; j < nn; j++) { if(hIfailRes[0][j] != 0) err++; } *max_err = err > *max_err ? err : *max_err; } } else { if(svect != rocblas_svect_none) { // check ifail err = 0; for(int j = 0; j < hInfoRes[0][0]; j++) { if(hIfailRes[0][j] == 0) err++; } *max_err = err > *max_err ? 
err : *max_err; } } } template void bdsvdx_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_svect svect, const rocblas_srange srange, const rocblas_int n, Td& dD, Td& dE, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, Ud& dNsv, Td& dS, Td& dZ, const rocblas_int ldz, Ud& dIfail, Ud& dInfo, Th& hD, Th& hE, Uh& hNsv, Th& hS, Th& hZ, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { std::vector work(14 * n); std::vector iwork(12 * n); bdsvdx_initData(handle, n, dD, dE, hD, hE); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_bdsvdx(uplo, svect, srange, n, hD[0], hE[0], vl, vu, il, iu, hNsv[0], hS[0], hZ[0], ldz, work.data(), iwork.data(), hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } bdsvdx_initData(handle, n, dD, dE, hD, hE); // cold calls for(int iter = 0; iter < 2; iter++) { bdsvdx_initData(handle, n, dD, dE, hD, hE); CHECK_ROCBLAS_ERROR(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD.data(), dE.data(), vl, vu, il, iu, dNsv.data(), dS.data(), dZ.data(), ldz, dIfail.data(), dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { bdsvdx_initData(handle, n, dD, dE, hD, hE); start = get_time_us_sync(stream); rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD.data(), dE.data(), vl, vu, il, iu, dNsv.data(), dS.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void 
testing_bdsvdx(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); char svectC = argus.get("svect"); char srangeC = argus.get("srange"); rocblas_int n = argus.get("n"); T vl = T(argus.get("vl", 0)); T vu = T(argus.get("vu", srangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", srangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", srangeC == 'I' ? 1 : 0); rocblas_int ldz = argus.get("ldz", 2 * n); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_svect svect = char2rocblas_svect(svectC); rocblas_srange srange = char2rocblas_srange(srangeC); rocblas_int hot_calls = argus.iters; // check non-supported values if((uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) || (svect != rocblas_svect_none && svect != rocblas_svect_singular)) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, (T*)nullptr, (T*)nullptr, vl, vu, il, iu, (rocblas_int*)nullptr, (T*)nullptr, (T*)nullptr, ldz, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_D = n; size_t size_E = n; size_t size_S = n; size_t size_Z = ldz * n; size_t size_Ifail = n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_SRes = (argus.unit_check || argus.norm_check) ? size_S : 0; size_t size_ZRes = (argus.unit_check || argus.norm_check) ? size_Z : 0; size_t size_IfailRes = (argus.unit_check || argus.norm_check) ? 
size_Ifail : 0; // check invalid sizes bool invalid_size = (n < 0) || (svect == rocblas_svect_none && ldz < 1) || (svect != rocblas_svect_none && ldz < 2 * n) || (srange == rocblas_srange_value && (vl < 0 || vl >= vu)) || (srange == rocblas_srange_index && ((iu > n) || (n > 0 && il > iu))) || (srange == rocblas_srange_index && (il < 1 || iu < 0)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, (T*)nullptr, (T*)nullptr, vl, vu, il, iu, (rocblas_int*)nullptr, (T*)nullptr, (T*)nullptr, ldz, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_bdsvdx(handle, uplo, svect, srange, n, (T*)nullptr, (T*)nullptr, vl, vu, il, iu, (rocblas_int*)nullptr, (T*)nullptr, (T*)nullptr, ldz, (rocblas_int*)nullptr, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations // host host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hS(size_S, 1, size_S, 1); host_strided_batch_vector hSRes(size_SRes, 1, size_SRes, 1); host_strided_batch_vector hZ(size_Z, 1, size_Z, 1); host_strided_batch_vector hZRes(size_ZRes, 1, size_ZRes, 1); host_strided_batch_vector hNsv(1, 1, 1, 1); host_strided_batch_vector hNsvRes(1, 1, 1, 1); host_strided_batch_vector hIfailRes(size_IfailRes, 1, size_IfailRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); // device device_strided_batch_vector dD(size_D, 1, size_D, 1); 
device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dS(size_S, 1, size_S, 1); device_strided_batch_vector dZ(size_Z, 1, size_Z, 1); device_strided_batch_vector dNsv(1, 1, 1, 1); device_strided_batch_vector dIfail(size_Ifail, 1, size_Ifail, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); if(size_Ifail) CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dNsv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD.data(), dE.data(), vl, vu, il, iu, dNsv.data(), dS.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) bdsvdx_getError(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo, hD, hE, hNsv, hNsvRes, hS, hSRes, hZ, hZRes, hIfailRes, hInfo, hInfoRes, &max_error); // collect performance data if(argus.timing) bdsvdx_getPerfData(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo, hD, hE, hNsv, hS, hZ, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "svect", "srange", "n", "vl", "vu", "il", "iu", "ldz"); rocsolver_bench_output(uploC, svectC, srangeC, n, vl, vu, il, iu, ldz); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); 
rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_BDSVDX(...) extern template void testing_bdsvdx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_BDSVDX, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_gebd2_gebrd.hpp000066400000000000000000000644611436600607200236750ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void gebd2_gebrd_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, S dD, const rocblas_stride stD, S dE, const rocblas_stride stE, U dTauq, const rocblas_stride stQ, U dTaup, const rocblas_stride stP, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, nullptr, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, (T) nullptr, lda, stA, dD, stD, dE, 
stE, dTauq, stQ, dTaup, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, (S) nullptr, stD, dE, stE, dTauq, stQ, dTaup, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, dD, stD, (S) nullptr, stE, dTauq, stQ, dTaup, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, stQ, dTaup, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, 0, n, (T) nullptr, lda, stA, (S) nullptr, stD, (S) nullptr, stE, (U) nullptr, stQ, (U) nullptr, stP, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, 0, (T) nullptr, lda, stA, (S) nullptr, stD, (S) nullptr, stE, (U) nullptr, stQ, (U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, 0), rocblas_status_success); } template void testing_gebd2_gebrd_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stD = 1; rocblas_stride stE = 1; rocblas_stride stQ = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dTauq(1, 1, 1, 1); device_strided_batch_vector dTaup(1, 
1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dTauq.memcheck()); CHECK_HIP_ERROR(dTaup.memcheck()); // check bad arguments gebd2_gebrd_checkBadArgs(handle, m, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTauq.data(), stQ, dTaup.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dTauq(1, 1, 1, 1); device_strided_batch_vector dTaup(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dTauq.memcheck()); CHECK_HIP_ERROR(dTaup.memcheck()); // check bad arguments gebd2_gebrd_checkBadArgs(handle, m, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTauq.data(), stQ, dTaup.data(), stP, bc); } } template void gebd2_gebrd_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Ud& dTauq, const rocblas_stride stQ, Ud& dTaup, const rocblas_stride stP, const rocblas_int bc, Th& hA, Sh& hD, Sh& hE, Uh& hTauq, Uh& hTaup) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j || (m >= n && j == i + 1) || (m < n && i == j + 1)) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gebd2_gebrd_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Ud& dTauq, const rocblas_stride stQ, Ud& dTaup, const rocblas_stride stP, const rocblas_int 
bc, Th& hA, Th& hARes, Sh& hD, Sh& hE, Uh& hTauq, Uh& hTaup, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; constexpr bool VERIFY_IMPLICIT_TEST = false; std::vector hW(max(m, n)); // input data initialization gebd2_gebrd_initData(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hD, hE, hTauq, hTaup); // execute computations // use verify_implicit_test to check correctness of the implicit test using // CPU lapack if(!VERIFY_IMPLICIT_TEST) { // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTauq.data(), stQ, dTaup.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hTauq.transfer_from(dTauq)); CHECK_HIP_ERROR(hTaup.transfer_from(dTaup)); } else { // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { memcpy(hARes[b], hA[b], lda * n * sizeof(T)); GEBRD ? cpu_gebrd(m, n, hARes[b], lda, hD[b], hE[b], hTauq[b], hTaup[b], hW.data(), max(m, n)) : cpu_gebd2(m, n, hARes[b], lda, hD[b], hE[b], hTauq[b], hTaup[b], hW.data()); } } // reconstruct A from the factorization for implicit testing std::vector vec(max(m, n)); vec[0] = 1; for(rocblas_int b = 0; b < bc; ++b) { T* a = hARes[b]; T* tauq = hTauq[b]; T* taup = hTaup[b]; if(m >= n) { for(int j = n - 1; j >= 0; j--) { if(j < n - 1) { if(COMPLEX) { cpu_lacgv(1, taup + j, 1); cpu_lacgv(n - j - 1, a + j + (j + 1) * lda, lda); } for(int i = 1; i < n - j - 1; i++) { vec[i] = a[j + (j + i + 1) * lda]; a[j + (j + i + 1) * lda] = 0; } cpu_larf(rocblas_side_right, m - j, n - j - 1, vec.data(), 1, taup + j, a + j + (j + 1) * lda, lda, hW.data()); if(COMPLEX) cpu_lacgv(1, taup + j, 1); } for(int i = 1; i < m - j; i++) { vec[i] = a[(j + i) + j * lda]; a[(j + i) + j * lda] = 0; } cpu_larf(rocblas_side_left, m - j, n - j, vec.data(), 1, tauq + j, a + j + j * lda, lda, hW.data()); } } else { for(int j = m - 1; j >= 0; j--) { if(j < m - 1) { for(int i = 1; i < m - j - 1; i++) { vec[i] = 
a[(j + i + 1) + j * lda]; a[(j + i + 1) + j * lda] = 0; } cpu_larf(rocblas_side_left, m - j - 1, n - j, vec.data(), 1, tauq + j, a + (j + 1) + j * lda, lda, hW.data()); } if(COMPLEX) { cpu_lacgv(1, taup + j, 1); cpu_lacgv(n - j, a + j + j * lda, lda); } for(int i = 1; i < n - j; i++) { vec[i] = a[j + (j + i) * lda]; a[j + (j + i) * lda] = 0; } cpu_larf(rocblas_side_right, m - j, n - j, vec.data(), 1, taup + j, a + j + j * lda, lda, hW.data()); if(COMPLEX) cpu_lacgv(1, taup + j, 1); } } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } template void gebd2_gebrd_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Ud& dTauq, const rocblas_stride stQ, Ud& dTaup, const rocblas_stride stP, const rocblas_int bc, Th& hA, Sh& hD, Sh& hE, Uh& hTauq, Uh& hTaup, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(max(m, n)); if(!perf) { gebd2_gebrd_initData(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hD, hE, hTauq, hTaup); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { GEBRD ? 
cpu_gebrd(m, n, hA[b], lda, hD[b], hE[b], hTauq[b], hTaup[b], hW.data(), max(m, n)) : cpu_gebd2(m, n, hA[b], lda, hD[b], hE[b], hTauq[b], hTaup[b], hW.data()); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gebd2_gebrd_initData(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hD, hE, hTauq, hTaup); // cold calls for(int iter = 0; iter < 2; iter++) { gebd2_gebrd_initData(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hD, hE, hTauq, hTaup); CHECK_ROCBLAS_ERROR(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTauq.data(), stQ, dTaup.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gebd2_gebrd_initData(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hD, hE, hTauq, hTaup); start = get_time_us_sync(stream); rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTauq.data(), stQ, dTaup.data(), stP, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gebd2_gebrd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stD = argus.get("strideD", min(m, n)); rocblas_stride stE = argus.get("strideE", min(m, n) - 1); rocblas_stride stQ = argus.get("strideQ", min(m, n)); rocblas_stride stP = argus.get("strideP", 
min(m, n)); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = lda * n; size_t size_D = min(m, n); size_t size_E = min(m, n) - 1; size_t size_Q = min(m, n); size_t size_P = min(m, n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stQ, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stQ, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stQ, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stQ, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector 
hD(size_D, 1, stD, bc); host_strided_batch_vector hE(size_E, 1, stE, bc); host_strided_batch_vector hTaup(size_P, 1, stP, bc); host_strided_batch_vector hTauq(size_Q, 1, stQ, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dTauq(size_Q, 1, stQ, bc); device_strided_batch_vector dTaup(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_Q) CHECK_HIP_ERROR(dTauq.memcheck()); if(size_P) CHECK_HIP_ERROR(dTaup.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTauq.data(), stQ, dTaup.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gebd2_gebrd_getError(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hARes, hD, hE, hTauq, hTaup, &max_error); // collect performance data if(argus.timing) gebd2_gebrd_getPerfData( handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hD, hE, hTauq, hTaup, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hE(size_E, 1, stE, bc); host_strided_batch_vector hTaup(size_P, 1, stP, bc); host_strided_batch_vector hTauq(size_Q, 1, stQ, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dTauq(size_Q, 1, stQ, 
bc); device_strided_batch_vector dTaup(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_Q) CHECK_HIP_ERROR(dTauq.memcheck()); if(size_P) CHECK_HIP_ERROR(dTaup.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTauq.data(), stQ, dTaup.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gebd2_gebrd_getError(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hARes, hD, hE, hTauq, hTaup, &max_error); // collect performance data if(argus.timing) gebd2_gebrd_getPerfData( handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hD, hE, hTauq, hTaup, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using m*n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, m * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } 
rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GEBD2_GEBRD(...) \ extern template void testing_gebd2_gebrd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEBD2_GEBRD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_geblttrf_npvt.hpp000066400000000000000000000661141436600607200244240ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void geblttrf_npvt_checkBadArgs(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, T dC, const rocblas_int ldc, const rocblas_stride stC, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, nullptr, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, (T) nullptr, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, 
nblocks, dA, lda, stA, (T) nullptr, ldb, stB, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, 0, nblocks, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, (U) nullptr, 0), rocblas_status_success); } template void testing_geblttrf_npvt_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int nb = 1; rocblas_int nblocks = 2; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldc = 1; rocblas_stride stA = 2; rocblas_stride stB = 2; rocblas_stride stC = 2; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_batch_vector dC(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments geblttrf_npvt_checkBadArgs(handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); 
device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments geblttrf_npvt_checkBadArgs(handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc); } } template void geblttrf_npvt_initData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, Td& dA, const rocblas_int lda, Td& dB, const rocblas_int ldb, Td& dC, const rocblas_int ldc, const rocblas_int bc, Th& hA, Th& hB, Th& hC, const bool singular) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, false); rocblas_init(hC, false); rocblas_int n = nb * nblocks; for(rocblas_int b = 0; b < bc; ++b) { // scale to avoid singularities // leaving matrix as diagonal dominant so that pivoting is not required for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { for(rocblas_int k = 0; k < nblocks; k++) { if(i == j) hB[b][i + j * ldb + k * ldb * nb] += 400; else hB[b][i + j * ldb + k * ldb * nb] -= 4; } for(rocblas_int k = 0; k < nblocks - 1; k++) { hA[b][i + j * lda + k * lda * nb] -= 4; hC[b][i + j * ldc + k * ldc * nb] -= 4; } } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // When required, add some singularities // (always the same elements for debugging purposes) rocblas_int jj = n / 4 + b; jj -= (jj / n) * n; rocblas_int j = jj % nb; rocblas_int k = jj / nb; for(rocblas_int i = 0; i < nb; i++) { // zero the jj-th column hB[b][i + j * ldb + k * ldb * nb] = 0; if(k < nblocks - 1) hA[b][i + j * lda + k * lda * nb] = 0; if(k > 0) hC[b][i + j * ldc + (k - 1) * ldc * nb] = 0; } jj = n / 2 + b; jj -= (jj / n) * n; j = jj % nb; k = jj / nb; for(rocblas_int i = 0; i < nb; i++) { // zero the jj-th column hB[b][i + j * ldb + k * ldb * nb] = 0; if(k < nblocks - 1) hA[b][i + j * lda + k * lda * nb] = 0; if(k > 0) hC[b][i + j * ldc + (k - 1) * ldc * nb] = 0; } jj = n - 1 + b; jj -= (jj / 
n) * n; j = jj % nb; k = jj / nb; for(rocblas_int i = 0; i < nb; i++) { // zero the jj-th column hB[b][i + j * ldb + k * ldb * nb] = 0; if(k < nblocks - 1) hA[b][i + j * lda + k * lda * nb] = 0; if(k > 0) hC[b][i + j * ldc + (k - 1) * ldc * nb] = 0; } } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void geblttrf_npvt_getError(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hBRes, Th& hC, Th& hCRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { int n = nb * nblocks; std::vector L(n * n); std::vector U(n * n); std::vector M(n * n); std::vector MRes(n * n); // input data initialization geblttrf_npvt_initData(handle, nb, nblocks, dA, lda, dB, ldb, dC, ldc, bc, hA, hB, hC, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); CHECK_HIP_ERROR(hCRes.transfer_from(dC)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { if(hInfoRes[b][0] <= 0) err++; } else { if(hInfoRes[b][0] != 0) err++; } } *max_err += err; for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { // compute diagonal blocks and store in full matrix L for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { if(i <= j) L[i + j * n + k * (n + 1) * nb] = hBRes[b][i + j * ldb + k * ldb * nb]; else L[i + 
j * n + k * (n + 1) * nb] = 0; } } cpu_trmm(rocblas_side_left, rocblas_fill_lower, rocblas_operation_none, rocblas_diagonal_unit, nb, nb, T(1), hBRes[b] + k * ldb * nb, ldb, L.data() + k * (n + 1) * nb, n); } // move blocks A, updated C, and I into full matrices L and U for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { if(k < nblocks - 1) { for(rocblas_int j = 0; j < nb; j++) { U[i + (j + nb) * n + k * (n + 1) * nb] = hCRes[b][i + j * ldc + k * ldc * nb]; L[(i + nb) + j * n + k * (n + 1) * nb] = hA[b][i + j * lda + k * lda * nb]; } } U[i + i * n + k * (n + 1) * nb] = 1; } } // reconstruct input matrix from factors and store it in MRes cpu_gemm(rocblas_operation_none, rocblas_operation_none, n, n, n, T(1), L.data(), n, U.data(), n, T(0), MRes.data(), n); // form original matrix from original blocks for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { M[i + j * n + k * (n + 1) * nb] = hB[b][i + j * ldb + k * ldb * nb]; if(k < nblocks - 1) { M[(i + nb) + j * n + k * (n + 1) * nb] = hA[b][i + j * lda + k * lda * nb]; M[i + (j + nb) * n + k * (n + 1) * nb] = hC[b][i + j * ldc + k * ldc * nb]; } } } } // error is ||M - MRes|| / ||M|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm err = norm_error('F', n, n, n, M.data(), MRes.data()); *max_err = err > *max_err ? 
err : *max_err; } } } template void geblttrf_npvt_getPerfData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { // there is no direct CPU/LAPACK equivalent for this function, therefore // we return an invalid CPU time *cpu_time_used = nan(""); } geblttrf_npvt_initData(handle, nb, nblocks, dA, lda, dB, ldb, dC, ldc, bc, hA, hB, hC, singular); // cold calls for(int iter = 0; iter < 2; iter++) { geblttrf_npvt_initData(handle, nb, nblocks, dA, lda, dB, ldb, dC, ldc, bc, hA, hB, hC, singular); CHECK_ROCBLAS_ERROR(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { geblttrf_npvt_initData(handle, nb, nblocks, dA, lda, dB, ldb, dC, ldc, bc, hA, hB, hC, singular); start = get_time_us_sync(stream); rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_geblttrf_npvt(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int nb = argus.get("nb"); rocblas_int 
nblocks = argus.get("nblocks"); rocblas_int lda = argus.get("lda", nb); rocblas_int ldb = argus.get("ldb", nb); rocblas_int ldc = argus.get("ldc", nb); rocblas_stride stA = argus.get("strideA", lda * nb * nblocks); rocblas_stride stB = argus.get("strideB", ldb * nb * nblocks); rocblas_stride stC = argus.get("strideC", ldc * nb * nblocks); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; rocblas_stride stCRes = (argus.unit_check || argus.norm_check) ? stC : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * nb * nblocks; size_t size_B = size_t(ldb) * nb * nblocks; size_t size_C = size_t(ldc) * nb * nblocks; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_B : 0; size_t size_CRes = (argus.unit_check || argus.norm_check) ? size_C : 0; // check invalid sizes bool invalid_size = (nb < 0 || nblocks < 0 || lda < nb || ldb < nb || ldc < nb || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_geblttrf_npvt( STRIDED, handle, nb, nblocks, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); else 
CHECK_ALLOC_QUERY(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hC(size_C, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_batch_vector hCRes(size_CRes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_batch_vector dC(size_C, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(nb == 0 || nblocks == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geblttrf_npvt_getError(handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc, hA, hB, hBRes, hC, hCRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) geblttrf_npvt_getPerfData( handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc, hA, hB, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 
1, stB, bc); host_strided_batch_vector hC(size_C, 1, stC, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hCRes(size_CRes, 1, stCRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dC(size_C, 1, stC, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(nb == 0 || nblocks == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geblttrf_npvt_getError(handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc, hA, hB, hBRes, hC, hCRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) geblttrf_npvt_getPerfData( handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc, hA, hB, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using nb * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, nb); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("nb", "nblocks", "lda", "ldb", "ldc", "batch_c"); rocsolver_bench_output(nb, nblocks, lda, ldb, ldc, bc); } else if(STRIDED) { rocsolver_bench_output("nb", "nblocks", "lda", "strideA", "ldb", "strideB", "ldc", "strideC", "batch_c"); 
rocsolver_bench_output(nb, nblocks, lda, stA, ldb, stB, ldc, stC, bc); } else { rocsolver_bench_output("nb", "nblocks", "lda", "ldb", "ldc"); rocsolver_bench_output(nb, nblocks, lda, ldb, ldc); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GEBLTTRF_NPVT(...) \ extern template void testing_geblttrf_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEBLTTRF_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_geblttrs_npvt.hpp000066400000000000000000000644341436600607200244440ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" /* geblttrs_npvt_checkBadArgs: calls rocsolver_geblttrs_npvt with a null handle, with batch count -1 (only when STRIDED), with each of dA, dB, dC and dX null in turn, and with zero nb, zero nblocks, zero nrhs or zero batch count, asserting the status expected for each case. NOTE(review): the template parameter list after 'template' appears to be missing in this copy; the body uses STRIDED and T, so presumably both are template parameters - confirm. */ template void geblttrs_npvt_checkBadArgs(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, T dC, const rocblas_int ldc, const rocblas_stride stC, T dX, const rocblas_int ldx, const rocblas_stride stX, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, nullptr, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, (T) nullptr, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA, lda, stA, (T) nullptr, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, (T) nullptr, ldc, stC, dX, ldx, stX, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, (T) nullptr, ldx, stX, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, 0, nblocks, nrhs, (T) nullptr,
lda, stA, (T) nullptr, ldb, stB, (T) nullptr, ldc, stC, (T) nullptr, ldx, stX, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, 0, nrhs, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (T) nullptr, ldc, stC, (T) nullptr, ldx, stX, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, 0, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, (T) nullptr, ldx, stX, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, 0), rocblas_status_success); } /* testing_geblttrs_npvt_bad_arg: sets up fixed arguments (nb, nrhs and all leading dimensions 1, nblocks 2, strides 2, batch count 1), allocates device_batch_vector buffers when BATCHED is set and device_strided_batch_vector buffers otherwise, verifies the allocations with memcheck() and forwards everything to geblttrs_npvt_checkBadArgs. */ template void testing_geblttrs_npvt_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int nb = 1; rocblas_int nblocks = 2; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldc = 1; rocblas_int ldx = 1; rocblas_stride stA = 2; rocblas_stride stB = 2; rocblas_stride stC = 2; rocblas_stride stX = 2; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_batch_vector dC(1, 1, 1); device_batch_vector dX(1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); // check bad arguments geblttrs_npvt_checkBadArgs(handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dX(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); // check bad arguments geblttrs_npvt_checkBadArgs(handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB,
dC.data(), ldc, stC, dX.data(), ldx, stX, bc); } } /* geblttrs_npvt_initData: when CPU is set, for every batch instance it assembles the dense n x n matrix M (n = nb * nblocks, leading dimension n) from the blocks stored in hB (diagonal), hA (sub-diagonal) and hC (super-diagonal), computes the right-hand sides M * X into hRHS with cpu_gemm, and then overwrites hA, hB and hC with a block factorization of M built from cpu_getrf, cpu_getrs and cpu_gemm. When GPU is set, it copies hA, hB, hC and hRHS into dA, dB, dC and dX. NOTE(review): the pivots written to ipiv are never stored in the outputs, so this presumably relies on the +400 diagonal shift making the row interchanges trivial - confirm. */ template void geblttrs_npvt_initData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, Td& dA, const rocblas_int lda, Td& dB, const rocblas_int ldb, Td& dC, const rocblas_int ldc, Td& dX, const rocblas_int ldx, const rocblas_int bc, Th& hA, Th& hB, Th& hC, Th& hX, Th& hRHS) { if(CPU) { int info; int n = nb * nblocks; std::vector M(n * n); std::vector XX(n * nrhs); std::vector XB(n * nrhs); std::vector ipiv(nb); // initialize blocks of the original matrix rocblas_init(hA, true); rocblas_init(hB, false); rocblas_init(hC, false); // initialize solution vectors rocblas_init(hX, false); for(rocblas_int b = 0; b < bc; ++b) { // form original matrix M and scale to avoid singularities /* k * (n + 1) * nb is the offset of the k-th nb x nb diagonal block inside M; adding nb to the row index addresses the block below it, adding nb to the column index the block to its right */ for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { if(i == j) M[i + j * n + k * (n + 1) * nb] = hB[b][i + j * ldb + k * ldb * nb] + 400; else M[i + j * n + k * (n + 1) * nb] = hB[b][i + j * ldb + k * ldb * nb] - 4; if(k < nblocks - 1) { M[(i + nb) + j * n + k * (n + 1) * nb] = hA[b][i + j * lda + k * lda * nb] - 4; M[i + (j + nb) * n + k * (n + 1) * nb] = hC[b][i + j * ldc + k * ldc * nb] - 4; } } } } // move blocks of X to full matrix XX for(rocblas_int k = 0; k < nblocks; k++) for(rocblas_int i = 0; i < nb; i++) for(rocblas_int j = 0; j < nrhs; j++) XX[i + j * n + k * nb] = hX[b][i + j * ldx + k * ldx * nrhs]; // generate the full matrix of right-hand-side vectors XB by computing M * XX cpu_gemm(rocblas_operation_none, rocblas_operation_none, n, nrhs, n, T(1), M.data(), n, XX.data(), n, T(0), XB.data(), n); // move XB to block format in hRHS for(rocblas_int k = 0; k < nblocks; k++) for(rocblas_int i = 0; i < nb; i++) for(rocblas_int j = 0; j < nrhs; j++) hRHS[b][i + j * ldx + k * ldx * nrhs] = XB[i + j * n + k * nb]; // factorize M /* first diagonal block is factorized here; each loop iteration then solves for the block to its right, updates the next diagonal block with the product of the sub-diagonal block and that solution, and factorizes it */ cpu_getrf(nb, nb, M.data(), n, ipiv.data(), &info); for(rocblas_int k = 0; k < nblocks - 1;
k++) { cpu_getrs(rocblas_operation_none, nb, nb, M.data() + k * (n + 1) * nb, n, ipiv.data(), M.data() + nb * n + k * (n + 1) * nb, n); cpu_gemm(rocblas_operation_none, rocblas_operation_none, nb, nb, nb, T(-1), M.data() + nb + k * (n + 1) * nb, n, M.data() + nb * n + k * (n + 1) * nb, n, T(1), M.data() + (k + 1) * (n + 1) * nb, n); cpu_getrf(nb, nb, M.data() + (k + 1) * (n + 1) * nb, n, ipiv.data(), &info); } // move factorized blocks from M into hA, hB, and hC for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { hB[b][i + j * ldb + k * ldb * nb] = M[i + j * n + k * (n + 1) * nb]; if(k < nblocks - 1) { hA[b][i + j * lda + k * lda * nb] = M[(i + nb) + j * n + k * (n + 1) * nb]; hC[b][i + j * ldc + k * ldc * nb] = M[i + (j + nb) * n + k * (n + 1) * nb]; } } } } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); CHECK_HIP_ERROR(dC.transfer_from(hC)); CHECK_HIP_ERROR(dX.transfer_from(hRHS)); } } /* geblttrs_npvt_getError: initializes the inputs (the last host argument, hXRes, receives the generated right-hand sides), runs rocsolver_geblttrs_npvt on the device, copies dX back into hXRes, and stores in *max_err the largest norm_error('F', ...) between the generated solution hX and the device result hXRes over all batch instances. */ template void geblttrs_npvt_getError(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Td& dX, const rocblas_int ldx, const rocblas_stride stX, const rocblas_int bc, Th& hA, Th& hB, Th& hC, Th& hX, Th& hXRes, double* max_err) { // input data initialization geblttrs_npvt_initData(handle, nb, nblocks, nrhs, dA, lda, dB, ldb, dC, ldc, dX, ldx, bc, hA, hB, hC, hX, hXRes); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc)); CHECK_HIP_ERROR(hXRes.transfer_from(dX)); double err = 0; *max_err = 0; // error is ||hX - hXRes|| / ||hX|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL
REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', nb, nrhs * nblocks, ldx, hX[b], hXRes[b]); *max_err = err > *max_err ? err : *max_err; } } /* geblttrs_npvt_getPerfData: sets *cpu_time_used to NaN when perf is false, performs two untimed cold calls, optionally enables rocSOLVER profile logging, then adds to *gpu_time_used the time of hot_calls calls to rocsolver_geblttrs_npvt (inputs are re-initialized before each call, outside the timed region) and divides the total by hot_calls. NOTE(review): *gpu_time_used is only accumulated into, so the caller must pass it in as 0, and hot_calls == 0 makes the final division a division by zero - confirm callers never pass 0. */ template void geblttrs_npvt_getPerfData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Td& dX, const rocblas_int ldx, const rocblas_stride stX, const rocblas_int bc, Th& hA, Th& hB, Th& hC, Th& hX, Th& hXRes, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { // there is no direct CPU/LAPACK equivalent for this function, therefore // we return an invalid CPU time *cpu_time_used = nan(""); } geblttrs_npvt_initData(handle, nb, nblocks, nrhs, dA, lda, dB, ldb, dC, ldc, dX, ldx, bc, hA, hB, hC, hX, hXRes); // cold calls for(int iter = 0; iter < 2; iter++) { geblttrs_npvt_initData(handle, nb, nblocks, nrhs, dA, lda, dB, ldb, dC, ldc, dX, ldx, bc, hA, hB, hC, hX, hXRes); CHECK_ROCBLAS_ERROR(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { geblttrs_npvt_initData(handle, nb, nblocks, nrhs, dA, lda, dB, ldb, dC, ldc, dX, ldx, bc, hA, hB, hC, hX, hXRes); start = get_time_us_sync(stream);
rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } /* testing_geblttrs_npvt: test and benchmark driver. Reads nb, nblocks, nrhs, the leading dimensions and the strides from argus (leading dimensions default to nb), expects rocblas_status_invalid_size for invalid sizes, performs the device memory size query when requested or required, allocates host and device buffers in the batched or strided layout, checks the quick-return case, then runs the error check and/or the timing and prints the rocsolver-bench output. */ template void testing_geblttrs_npvt(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int nb = argus.get("nb"); rocblas_int nblocks = argus.get("nblocks"); rocblas_int nrhs = argus.get("nrhs"); rocblas_int lda = argus.get("lda", nb); rocblas_int ldb = argus.get("ldb", nb); rocblas_int ldc = argus.get("ldc", nb); rocblas_int ldx = argus.get("ldx", nb); rocblas_stride stA = argus.get("strideA", lda * nb * nblocks); rocblas_stride stB = argus.get("strideB", ldb * nb * nblocks); rocblas_stride stC = argus.get("strideC", ldc * nb * nblocks); rocblas_stride stX = argus.get("strideX", ldx * nrhs * nblocks); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stXRes = stX; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * nb * nblocks; size_t size_B = size_t(ldb) * nb * nblocks; size_t size_C = size_t(ldc) * nb * nblocks; size_t size_X = size_t(ldx) * nrhs * nblocks; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_XRes = size_X; // check invalid sizes bool invalid_size = (nb < 0 || nblocks < 0 || nrhs < 0 || lda < nb || ldb < nb || ldc < nb || ldx < nb || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldc, stC, (T* const*)nullptr, ldx, stX, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldc, stC, (T*)nullptr, ldx, stX, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size
query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_geblttrs_npvt( STRIDED, handle, nb, nblocks, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldc, stC, (T* const*)nullptr, ldx, stX, bc)); else CHECK_ALLOC_QUERY(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldc, stC, (T*)nullptr, ldx, stX, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hC(size_C, 1, bc); host_batch_vector hX(size_X, 1, bc); host_batch_vector hXRes(size_XRes, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_batch_vector dC(size_C, 1, bc); device_batch_vector dX(size_X, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); // check quick return if(nb == 0 || nblocks == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geblttrs_npvt_getError(handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc, hA, hB, hC, hX, hXRes, &max_error); // collect performance data if(argus.timing) geblttrs_npvt_getPerfData(handle, nb, nblocks, nrhs, dA, lda, stA,
dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc, hA, hB, hC, hX, hXRes, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hC(size_C, 1, stC, bc); host_strided_batch_vector hX(size_X, 1, stX, bc); host_strided_batch_vector hXRes(size_XRes, 1, stXRes, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dC(size_C, 1, stC, bc); device_strided_batch_vector dX(size_X, 1, stX, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); // check quick return if(nb == 0 || nblocks == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geblttrs_npvt_getError(handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc, hA, hB, hC, hX, hXRes, &max_error); // collect performance data if(argus.timing) geblttrs_npvt_getPerfData(handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc, hA, hB, hC, hX, hXRes, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using nb * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, nb); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("nb", "nblocks", "nrhs", "lda", "ldb",
"ldc", "ldx", "batch_c"); rocsolver_bench_output(nb, nblocks, nrhs, lda, ldb, ldc, ldx, bc); } else if(STRIDED) { rocsolver_bench_output("nb", "nblocks", "nrhs", "lda", "strideA", "ldb", "strideB", "ldc", "strideC", "ldx", "strideX", "batch_c"); rocsolver_bench_output(nb, nblocks, nrhs, lda, stA, ldb, stB, ldc, stC, ldx, stX, bc); } else { rocsolver_bench_output("nb", "nblocks", "nrhs", "lda", "ldb", "ldc", "ldx"); rocsolver_bench_output(nb, nblocks, nrhs, lda, ldb, ldc, ldx); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } /* EXTERN_TESTING_GEBLTTRS_NPVT expands to an extern explicit-instantiation declaration of testing_geblttrs_npvt for one template-argument combination; INSTANTIATE presumably applies it to every data layout and scalar type - confirm against its definition. */ #define EXTERN_TESTING_GEBLTTRS_NPVT(...) \ extern template void testing_geblttrs_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEBLTTRS_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_gelq2_gelqf.hpp000066400000000000000000000412021436600607200237230ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" /* gelq2_gelqf_checkBadArgs: calls rocsolver_gelq2_gelqf with a null handle, with batch count -1 (only when STRIDED), with dA null, with dIpiv null, with m == 0 or n == 0 together with null pointers, and with batch count 0 (only when STRIDED), asserting the status expected for each case. NOTE(review): the template parameter list after 'template' appears to be missing in this copy; the body uses STRIDED, GELQF, T and U, so presumably all four are template parameters - confirm. */ template void gelq2_gelqf_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_gelq2_gelqf(STRIDED, GELQF, nullptr, m, n, dA, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA, lda, stA, dIpiv, stP, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, (T) nullptr, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA, lda, stA, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, 0, n, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, 0, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA, lda, stA, dIpiv, stP, 0), rocblas_status_success); } /* testing_gelq2_gelqf_bad_arg: sets up fixed arguments (m, n, lda, both strides and the batch count all 1), allocates dA as a device_batch_vector when BATCHED is set and as a device_strided_batch_vector otherwise (dIpiv is strided in both cases), verifies the allocations with memcheck() and forwards everything to gelq2_gelqf_checkBadArgs. */ template void testing_gelq2_gelqf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1);
device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments gelq2_gelqf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments gelq2_gelqf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } } /* gelq2_gelqf_initData: when CPU is set, fills hA with rocblas_init and then, for every batch instance, adds 400 to the entries with i == j and subtracts 4 from all other entries of the m x n matrix; when GPU is set, copies hA to dA. The parameters stA, dIpiv, stP and hIpiv are not used in the body. */ template void gelq2_gelqf_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } /* gelq2_gelqf_getError: initializes the data, runs rocsolver_gelq2_gelqf on the device and copies dA back into hARes, runs the host reference (cpu_gelqf when GELQF is set, cpu_gelq2 otherwise) in place on hA with a workspace of m elements, and stores in *max_err the largest norm_error('F', m, n, lda, ...) between hA and hARes over all batch instances. Only A is compared; the contents of dIpiv are not transferred back or checked here. */ template void gelq2_gelqf_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, double* max_err) { std::vector hW(m); // input data initialization gelq2_gelqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { GELQF ? cpu_gelqf(m, n, hA[b], lda, hIpiv[b], hW.data(), m) : cpu_gelq2(m, n, hA[b], lda, hIpiv[b], hW.data()); } // error is ||hA - hARes|| / ||hA|| (ideally ||LQ - Lres Qres|| / ||LQ||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES.
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } /* gelq2_gelqf_getPerfData: when perf is false, times the host reference (cpu_gelqf or cpu_gelq2 over all batch instances) into *cpu_time_used; then performs two untimed cold calls, optionally enables rocSOLVER profile logging, and adds to *gpu_time_used the time of hot_calls calls to rocsolver_gelq2_gelqf (inputs are re-initialized before each call, outside the timed region) before dividing the total by hot_calls. NOTE(review): *gpu_time_used is only accumulated into, so the caller must pass it in as 0, and hot_calls == 0 makes the final division a division by zero - confirm callers never pass 0. */ template void gelq2_gelqf_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(m); if(!perf) { gelq2_gelqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { GELQF ? cpu_gelqf(m, n, hA[b], lda, hIpiv[b], hW.data(), m) : cpu_gelq2(m, n, hA[b], lda, hIpiv[b], hW.data()); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gelq2_gelqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cold calls for(int iter = 0; iter < 2; iter++) { gelq2_gelqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); CHECK_ROCBLAS_ERROR(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gelq2_gelqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); start = get_time_us_sync(stream); rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); *gpu_time_used +=
get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } /* testing_gelq2_gelqf: test and benchmark driver. Reads m, n (default m), lda (default m) and the strides from argus, expects rocblas_status_invalid_size for invalid sizes, performs the device memory size query when requested or required, allocates host and device buffers in the batched or strided layout (hARes gets size 0 unless unit_check or norm_check is set), checks the quick-return case, then runs the error check and/or the timing and prints the rocsolver-bench output. */ template void testing_gelq2_gelqf(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A,
1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gelq2_gelqf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) gelq2_gelqf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gelq2_gelqf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) gelq2_gelqf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results
for rocsolver-test // using n * machine_precision as tolerance // (for possibly singular or ill-conditioned matrices we could use n*min(m,n)) if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } /* EXTERN_TESTING_GELQ2_GELQF expands to an extern explicit-instantiation declaration of testing_gelq2_gelqf for one template-argument combination; INSTANTIATE presumably applies it to every data layout, blocked variant and scalar type - confirm against its definition. */ #define EXTERN_TESTING_GELQ2_GELQF(...) \ extern template void testing_gelq2_gelqf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GELQ2_GELQF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_gels.hpp000066400000000000000000000563351436600607200225020ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void gels_checkBadArgs(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, U dA, const rocblas_int lda, const rocblas_stride stA, U dB, const rocblas_int ldb, const rocblas_stride stB, rocblas_int* info, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_gels(STRIDED, nullptr, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, info, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, rocblas_operation(0), m, n, nrhs, dA, lda, stA, dB, ldb, stB, info, bc), rocblas_status_invalid_value) << "Must report error when operation is invalid"; // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, info, -1), rocblas_status_invalid_size) << "Must report error when batch size is negative"; // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, info, bc), rocblas_status_invalid_pointer) << "Should normally report error when A is null"; EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, (U) nullptr, ldb, stB, info, bc), rocblas_status_invalid_pointer) << "Should normally report error when B is null"; EXPECT_ROCBLAS_STATUS( rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, nullptr, bc), rocblas_status_invalid_pointer) << "Should normally report error when info is null"; // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, 0, n, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, 
info, bc), rocblas_status_success) << "Matrix A may be null when m is 0 (empty matrix)"; EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, 0, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, info, bc), rocblas_status_success) << "Matrix A may be null when n is 0 (empty matrix)"; EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, 0, dA, lda, stA, (U) nullptr, ldb, stB, info, bc), rocblas_status_success) << "Matrix B may be null when nhrs is 0 (empty matrix)"; EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, 0, 0, nrhs, (U) nullptr, lda, stA, (U) nullptr, ldb, stB, info, bc), rocblas_status_success) << "Matrices A and B may be null when m and n are 0 (empty matrix)"; if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, nullptr, 0), rocblas_status_success) << "Info may be null when batch size is 0"; // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, info, 0), rocblas_status_success); } template void testing_gels_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_int bc = 1; rocblas_operation trans = rocblas_operation_none; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gels_checkBadArgs(handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); 
CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gels_checkBadArgs(handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); } } template void gels_initData(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hInfo, const bool singular) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); const rocblas_int max_index = std::max(0, std::min(m, n) - 1); std::uniform_int_distribution sample_index(0, max_index); std::bernoulli_distribution coinflip(0.5); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // add some singularities // always the same elements for debugging purposes if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { do { if(n <= m) { // zero random col rocblas_int j = sample_index(rocblas_rng); for(rocblas_int i = 0; i < m; i++) hA[b][i + j * lda] = 0; } else { // zero random row rocblas_int i = sample_index(rocblas_rng); for(rocblas_int j = 0; j < n; j++) hA[b][i + j * lda] = 0; } } while(coinflip(rocblas_rng)); } } } if(GPU) { // now copy pivoting indices and matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void gels_getError(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hBRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = 
max(1, min(m, n) + max(min(m, n), nrhs)); std::vector hW(sizeW); // input data initialization gels_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_gels(trans, m, n, nrhs, hA[b], lda, hB[b], ldb, hW.data(), sizeW, hInfo[b]); } // error is ||hB - hBRes|| / ||hB|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('I', max(m, n), nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? err : *max_err; } // also check info for singularities err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) err++; *max_err += err; } template void gels_getPerfData(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = max(1, min(m, n) + max(min(m, n), nrhs)); std::vector hW(sizeW); if(!perf) { gels_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_gels(trans, m, n, nrhs, hA[b], lda, hB[b], ldb, hW.data(), sizeW, hInfo[b]); } 
*cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gels_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { gels_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gels_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, singular); start = get_time_us_sync(stream); rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_gels(Arguments& argus) { // get arguments rocblas_local_handle handle; char transC = argus.get("trans"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int nrhs = argus.get("nrhs", n); rocblas_int lda = argus.get("lda", m); rocblas_int ldb = argus.get("ldb", max(m, n)); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? 
stB : 0; // check non-supported values bool invalid_value = ((COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_B : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || nrhs < 0 || lda < m || ldb < m || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); size_t size; 
CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(bc) CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(m == 0 || n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gels_getError(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gels_getPerfData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) 
CHECK_HIP_ERROR(dB.memcheck()); if(bc) CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(m == 0 || n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gels_getError(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gels_getPerfData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using max(m,n) * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, max(m, n)); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb", "batch_c"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb, bc); } else if(STRIDED) { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb", "strideA", "strideB", "batch_c"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb, stA, stB, bc); } else { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else 
rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GELS(...) extern template void testing_gels<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GELS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_gels_outofplace.hpp000066400000000000000000000715061436600607200247200ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void gels_outofplace_checkBadArgs(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, U dA, const rocblas_int lda, const rocblas_stride stA, U dB, const rocblas_int ldb, const rocblas_stride stB, U dX, const rocblas_int ldx, const rocblas_stride stX, rocblas_int* info, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, nullptr, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, info, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, rocblas_operation(0), m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, info, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, info, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, dX, ldx, stX, info, bc), 
rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, (U) nullptr, ldb, stB, dX, ldx, stX, info, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, (U) nullptr, ldx, stX, info, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, rocblas_operation_none, 0, n, nrhs, (U) nullptr, lda, stA, (U) nullptr, ldb, stB, dX, ldx, stX, info, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, 0, n, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, (U) nullptr, ldx, stX, info, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, rocblas_operation_none, m, 0, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, (U) nullptr, ldx, stX, info, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, 0, nrhs, (U) nullptr, lda, stA, (U) nullptr, ldb, stB, dX, ldx, stX, info, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, 0, dA, lda, stA, (U) nullptr, ldb, stB, (U) nullptr, ldx, stX, info, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, 0, 0, nrhs, (U) nullptr, lda, stA, (U) nullptr, ldb, stB, (U) nullptr, ldx, stX, info, bc), rocblas_status_success); if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, nullptr, 0), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) 
EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, info, 0), rocblas_status_success); } template void testing_gels_outofplace_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldx = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stX = 1; rocblas_int bc = 1; rocblas_operation trans = (!rocblas_is_complex ? rocblas_operation_transpose : rocblas_operation_conjugate_transpose); if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_batch_vector dX(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gels_outofplace_checkBadArgs(handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dX(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gels_outofplace_checkBadArgs(handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc); } } template void gels_outofplace_initData(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hX, Uh& hInfo, const bool singular) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, 
true); const rocblas_int max_index = std::max(0, std::min(m, n) - 1); std::uniform_int_distribution sample_index(0, max_index); std::bernoulli_distribution coinflip(0.5); const rocblas_int rowsB = (trans == rocblas_operation_none) ? m : n; const rocblas_int ldx = max(m, n); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // populate hX with values from hB for(rocblas_int i = 0; i < rowsB; i++) for(rocblas_int j = 0; j < nrhs; j++) hX[b][i + j * ldx] = hB[b][i + j * ldb]; // add some singularities // always the same elements for debugging purposes if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { do { if(n <= m) { // zero random col rocblas_int j = sample_index(rocblas_rng); for(rocblas_int i = 0; i < m; i++) hA[b][i + j * lda] = 0; } else { // zero random row rocblas_int i = sample_index(rocblas_rng); for(rocblas_int j = 0; j < n; j++) hA[b][i + j * lda] = 0; } } while(coinflip(rocblas_rng)); } } } if(GPU) { // now copy pivoting indices and matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void gels_outofplace_getError(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dX, const rocblas_int ldx, const rocblas_stride stX, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hBRes, Th& hX, Th& hXRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = max(1, min(m, n) + max(min(m, n), nrhs)); std::vector hW(sizeW); // input data initialization gels_outofplace_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hX, hInfo, singular); // execute computations // GPU lapack 
CHECK_ROCBLAS_ERROR(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); CHECK_HIP_ERROR(hXRes.transfer_from(dX)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_gels(trans, m, n, nrhs, hA[b], lda, hX[b], max(m, n), hW.data(), sizeW, hInfo[b]); } // error is ||hX - hXRes|| / ||hX|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { const rocblas_int rowsB = (trans == rocblas_operation_none) ? m : n; err = norm_error('F', rowsB, nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? err : *max_err; if(hInfo[b][0] == 0) { const rocblas_int rowsX = (trans == rocblas_operation_none) ? n : m; err = norm_error('I', rowsX, nrhs, max(m, n), hX[b], hXRes[b], ldx); *max_err = err > *max_err ? 
err : *max_err; } } // also check info for singularities err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) err++; *max_err += err; } template void gels_outofplace_getPerfData(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dX, const rocblas_int ldx, const rocblas_stride stX, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hX, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = max(1, min(m, n) + max(min(m, n), nrhs)); std::vector hW(sizeW); if(!perf) { gels_outofplace_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hX, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_gels(trans, m, n, nrhs, hA[b], lda, hX[b], max(m, n), hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gels_outofplace_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hX, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { gels_outofplace_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hX, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else 
rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gels_outofplace_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hX, hInfo, singular); start = get_time_us_sync(stream); rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_gels_outofplace(Arguments& argus) { // get arguments rocblas_local_handle handle; char transC = argus.get("trans"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int nrhs = argus.get("nrhs", n); rocblas_int lda = argus.get("lda", m); rocblas_int ldb = argus.get("ldb", transC == 'N' ? m : n); rocblas_int ldx = argus.get("ldx", transC == 'N' ? n : m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_stride stX = argus.get("strideX", ldx * nrhs); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; rocblas_stride stXRes = (argus.unit_check || argus.norm_check) ? 
stX : 0; // check non-supported values bool invalid_value = ((COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; size_t size_X = size_t(ldx) * nrhs; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_B : 0; size_t size_XRes = (argus.unit_check || argus.norm_check) ? 
size_X : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || nrhs < 0 || lda < m || (trans == rocblas_operation_none && (ldb < m || ldx < n)) || (trans != rocblas_operation_none && (ldb < n || ldx < m)) || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gels_outofplace( STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_gels_outofplace( STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_batch_vector hX(max(m, n) * nrhs, 1, bc); host_batch_vector hXRes(size_XRes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); 
device_batch_vector dX(size_X, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); if(bc) CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(m == 0 || n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gels_outofplace_getError(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hB, hBRes, hX, hXRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gels_outofplace_getPerfData( handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hB, hX, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hX(max(m, n) * nrhs, 1, max(m, n) * nrhs, bc); host_strided_batch_vector hXRes(size_XRes, 1, stXRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dX(size_X, 1, stX, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); if(bc) CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(m == 0 || n == 0 || nrhs == 0 || bc == 0) { 
EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gels_outofplace_getError(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hB, hBRes, hX, hXRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gels_outofplace_getPerfData( handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hB, hX, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using max(m,n) * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, max(m, n)); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb", "ldx", "batch_c"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb, ldx, bc); } else if(STRIDED) { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb", "ldx", "strideA", "strideB", "strideX", "batch_c"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb, ldx, stA, stB, stX, bc); } else { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb", "ldx"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb, ldx); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else 
rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } rocSOLVER-rocm-5.5.1/clients/include/testing_geql2_geqlf.hpp000066400000000000000000000412121436600607200237240ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void geql2_geqlf_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_geql2_geqlf(STRIDED, GEQLF, nullptr, m, n, dA, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA, lda, stA, dIpiv, stP, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, (T) nullptr, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA, lda, stA, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, 0, n, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, 0, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( 
rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA, lda, stA, dIpiv, stP, 0), rocblas_status_success); } template void testing_geql2_geqlf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments geql2_geqlf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments geql2_geqlf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } } template void geql2_geqlf_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(m - i == n - j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void geql2_geqlf_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, double* max_err) { std::vector hW(n); // input data initialization geql2_geqlf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_geql2_geqlf(STRIDED, GEQLF, 
handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { GEQLF ? cpu_geqlf(m, n, hA[b], lda, hIpiv[b], hW.data(), n) : cpu_geql2(m, n, hA[b], lda, hIpiv[b], hW.data()); } // error is ||hA - hARes|| / ||hA|| (ideally ||QL - Qres Lres|| / ||QL||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } template void geql2_geqlf_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(n); if(!perf) { geql2_geqlf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { GEQLF ? 
cpu_geqlf(m, n, hA[b], lda, hIpiv[b], hW.data(), n) : cpu_geql2(m, n, hA[b], lda, hIpiv[b], hW.data()); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } geql2_geqlf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cold calls for(int iter = 0; iter < 2; iter++) { geql2_geqlf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); CHECK_ROCBLAS_ERROR(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { geql2_geqlf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); start = get_time_us_sync(stream); rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_geql2_geqlf(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geql2_geqlf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) 
geql2_geqlf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geql2_geqlf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) geql2_geqlf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using m * machine_precision as tolerance // (for possibly singular of ill-conditioned matrices we could use m*min(m,n)) if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, m); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", 
"gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GEQL2_GEQLF(...) \ extern template void testing_geql2_geqlf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEQL2_GEQLF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_geqr2_geqrf.hpp000066400000000000000000000440741436600607200237510ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void geqr2_geqrf_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_geqr2_geqrf(STRIDED, GEQRF, nullptr, m, n, dA, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA, lda, stA, dIpiv, stP, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, (T) nullptr, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( 
rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA, lda, stA, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, 0, n, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, 0, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA, lda, stA, dIpiv, stP, 0), rocblas_status_success); } template void testing_geqr2_geqrf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments geqr2_geqrf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments geqr2_geqrf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } } template void geqr2_geqrf_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU 
CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void geqr2_geqrf_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, double* max_err) { std::vector hW(n); // input data initialization geqr2_geqrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { GEQRF ? cpu_geqrf(m, n, hA[b], lda, hIpiv[b], hW.data(), n) : cpu_geqr2(m, n, hA[b], lda, hIpiv[b], hW.data()); } // error is ||hA - hARes|| / ||hA|| (ideally ||QR - Qres Rres|| / ||QR||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } template void geqr2_geqrf_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(n); if(!perf) { geqr2_geqrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { GEQRF ? 
cpu_geqrf(m, n, hA[b], lda, hIpiv[b], hW.data(), n) : cpu_geqr2(m, n, hA[b], lda, hIpiv[b], hW.data()); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } geqr2_geqrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cold calls for(int iter = 0; iter < 2; iter++) { geqr2_geqrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); CHECK_ROCBLAS_ERROR(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { geqr2_geqrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); start = get_time_us_sync(stream); rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_geqr2_geqrf(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED && STRIDED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geqr2_geqrf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data 
if(argus.timing) geqr2_geqrf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hIpiv(size_P, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dIpiv(size_P, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geqr2_geqrf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) geqr2_geqrf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geqr2_geqrf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // 
collect performance data if(argus.timing) geqr2_geqrf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using m * machine_precision as tolerance // (for possibly singular of ill-conditioned matrices we could use m*min(m,n)) if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, m); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GEQR2_GEQRF(...) \ extern template void testing_geqr2_geqrf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEQR2_GEQRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_gerq2_gerqf.hpp000066400000000000000000000412021436600607200237370ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void gerq2_gerqf_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_gerq2_gerqf(STRIDED, GERQF, nullptr, m, n, dA, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA, lda, stA, dIpiv, stP, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, (T) nullptr, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA, lda, stA, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, 0, n, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, 0, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA, lda, stA, dIpiv, stP, 0), rocblas_status_success); } template void testing_gerq2_gerqf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); 
device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments gerq2_gerqf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments gerq2_gerqf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } } template void gerq2_gerqf_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gerq2_gerqf_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, double* max_err) { std::vector hW(m); // input data initialization gerq2_gerqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { GERQF ? cpu_gerqf(m, n, hA[b], lda, hIpiv[b], hW.data(), m) : cpu_gerq2(m, n, hA[b], lda, hIpiv[b], hW.data()); } // error is ||hA - hARes|| / ||hA|| (ideally ||QR - Qres Rres|| / ||QR||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } template void gerq2_gerqf_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(m); if(!perf) { gerq2_gerqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { GERQF ? cpu_gerqf(m, n, hA[b], lda, hIpiv[b], hW.data(), m) : cpu_gerq2(m, n, hA[b], lda, hIpiv[b], hW.data()); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gerq2_gerqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cold calls for(int iter = 0; iter < 2; iter++) { gerq2_gerqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); CHECK_ROCBLAS_ERROR(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gerq2_gerqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); start = get_time_us_sync(stream); rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); *gpu_time_used += 
get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gerq2_gerqf(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 
1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gerq2_gerqf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) gerq2_gerqf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gerq2_gerqf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) gerq2_gerqf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results 
for rocsolver-test // using m * machine_precision as tolerance // (for possibly singular of ill-conditioned matrices we could use m*min(m,n)) if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, m); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GERQ2_GERQF(...) \ extern template void testing_gerq2_gerqf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GERQ2_GERQF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_gesv.hpp000066400000000000000000000527271436600607200225150ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void gesv_checkBadArgs(const rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, T dB, const rocblas_int ldb, const rocblas_stride stB, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, nullptr, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, (T) nullptr, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA, lda, stA, (U) nullptr, stP, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, 0, nrhs, (T) nullptr, lda, stA, (U) nullptr, stP, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, 0, dA, lda, stA, dIpiv, stP, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable 
if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, 0), rocblas_status_success); } template void testing_gesv_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_stride stB = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gesv_checkBadArgs(handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gesv_checkBadArgs(handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc); } } template void gesv_initData(const rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Uh& hIpiv, Th& hB, const bool singular) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } if(singular && (b == bc / 4 
|| b == bc / 2 || b == bc - 1)) { // When required, add some singularities // (always the same elements for debugging purposes). // The algorithm must detect the first zero element in the // diagonal of those matrices in the batch that are singular rocblas_int j = n / 4 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) hA[b][i + j * lda] = 0; j = n / 2 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) hA[b][i + j * lda] = 0; j = n - 1 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) hA[b][i + j * lda] = 0; } } } if(GPU) { // now copy pivoting indices and matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void gesv_getError(const rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Th& hB, Th& hBRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { // input data initialization gesv_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gesv(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_gesv(n, nrhs, hA[b], lda, hIpiv[b], hB[b], ldb, hInfo[b]); } // error is ||hB - hBRes|| / ||hB|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('I', n, nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? 
err : *max_err; } // also check info for singularities err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) err++; *max_err += err; } template void gesv_getPerfData(const rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Th& hB, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { gesv_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_gesv(n, nrhs, hA[b], lda, hIpiv[b], hB[b], ldb, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gesv_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, singular); // cold calls for(int iter = 0; iter < 2; iter++) { gesv_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, singular); CHECK_ROCBLAS_ERROR(rocsolver_gesv(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesv_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, singular); start = get_time_us_sync(stream); rocsolver_gesv(STRIDED, handle, n, 
nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gesv(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int nrhs = argus.get("nrhs", n); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_B : 0; // check invalid sizes bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gesv(STRIDED, handle, n, nrhs, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); else 
CHECK_ALLOC_QUERY(rocsolver_gesv(STRIDED, handle, n, nrhs, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gesv_getError(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc, hA, hIpiv, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gesv_getPerfData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc, hA, hIpiv, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, 
stBRes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gesv_getError(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc, hA, hIpiv, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gesv_getPerfData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc, hA, hIpiv, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "nrhs", "lda", "ldb", "strideP", "batch_c"); rocsolver_bench_output(n, nrhs, lda, ldb, stP, bc); } else if(STRIDED) { rocsolver_bench_output("n", "nrhs", "lda", "ldb", "strideA", "strideP", "strideB", "batch_c"); rocsolver_bench_output(n, nrhs, lda, ldb, stA, stP, stB, bc); } else { rocsolver_bench_output("n", "nrhs", "lda", "ldb"); rocsolver_bench_output(n, nrhs, lda, ldb); } 
rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GESV(...) extern template void testing_gesv<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GESV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_gesv_outofplace.hpp000066400000000000000000000616361436600607200247350ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void gesv_outofplace_checkBadArgs(const rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, T dB, const rocblas_int ldb, const rocblas_stride stB, T dX, const rocblas_int ldx, const rocblas_stride stX, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, nullptr, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, 
-1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, (T) nullptr, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, (U) nullptr, stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, (T) nullptr, ldb, stB, dX, ldx, stX, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, (T) nullptr, ldx, stX, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, 0, nrhs, (T) nullptr, lda, stA, (U) nullptr, stP, (T) nullptr, ldb, stB, (T) nullptr, ldx, stX, dInfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, 0, dA, lda, stA, dIpiv, stP, (T) nullptr, ldb, stB, (T) nullptr, ldx, stX, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, 0), rocblas_status_success); } template void testing_gesv_outofplace_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldx = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_stride stB = 1; rocblas_stride stX = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); 
device_batch_vector dB(1, 1, 1); device_batch_vector dX(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gesv_outofplace_checkBadArgs(handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dX(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gesv_outofplace_checkBadArgs(handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc); } } template void gesv_outofplace_initData(const rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Uh& hIpiv, Th& hB, const bool singular) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // When required, add some singularities // (always the same elements for debugging purposes). 
// The algorithm must detect the first zero element in the
                // diagonal of those matrices in the batch that are singular
                rocblas_int j = n / 4 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
                j = n / 2 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
                j = n - 1 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
            }
        }
    }

    if(GPU)
    {
        // now copy pivoting indices and matrices to the GPU
        CHECK_HIP_ERROR(dA.transfer_from(hA));
        CHECK_HIP_ERROR(dB.transfer_from(hB));
        CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv));
    }
}

// Runs gesv_outofplace once on the device and once through cpu_gesv on the
// host, and reports how far the two results differ.
//
// Sequence:
//   1. gesv_outofplace_initData fills the host inputs and uploads them.
//   2. rocsolver_gesv_outofplace is executed; dX is copied to hBRes and dInfo
//      to hInfoRes.
//   3. cpu_gesv is called per batch instance on hA/hB/hIpiv/hInfo.
//      (presumably it overwrites hB with the reference solution, as the
//      comparison below uses hB as the reference -- confirm in
//      lapack_host_reference.hpp)
//   4. For every instance whose device info is 0, norm_error('I', ...) between
//      hB (leading dimension ldb) and hBRes (leading dimension ldx) is taken,
//      and *max_err becomes the largest of these values.
//   5. The number of instances whose host and device info values differ is
//      then ADDED to *max_err, so any info mismatch pushes the result to at
//      least 1.
//
// *max_err is overwritten (reset to 0 first), not accumulated across calls.
template void gesv_outofplace_getError(const rocblas_handle handle,
                                       const rocblas_int n,
                                       const rocblas_int nrhs,
                                       Td& dA,
                                       const rocblas_int lda,
                                       const rocblas_stride stA,
                                       Ud& dIpiv,
                                       const rocblas_stride stP,
                                       Td& dB,
                                       const rocblas_int ldb,
                                       const rocblas_stride stB,
                                       Td& dX,
                                       const rocblas_int ldx,
                                       const rocblas_stride stX,
                                       Ud& dInfo,
                                       const rocblas_int bc,
                                       Th& hA,
                                       Uh& hIpiv,
                                       Th& hB,
                                       Th& hBRes,
                                       Uh& hInfo,
                                       Uh& hInfoRes,
                                       double* max_err,
                                       const bool singular)
{
    // input data initialization
    gesv_outofplace_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv,
                             hB, singular);

    // execute computations
    // GPU lapack
    CHECK_ROCBLAS_ERROR(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA.data(), lda, stA,
                                                  dIpiv.data(), stP, dB.data(), ldb, stB, dX.data(),
                                                  ldx, stX, dInfo.data(), bc));
    CHECK_HIP_ERROR(hBRes.transfer_from(dX));
    CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo));

    // CPU lapack
    for(rocblas_int b = 0; b < bc; ++b)
    {
        cpu_gesv(n, nrhs, hA[b], lda, hIpiv[b], hB[b], ldb, hInfo[b]);
    }

    // error is ||hB - hBRes|| / ||hB||
    // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES.
    // IT MIGHT BE REVISITED IN THE FUTURE)
    // using vector-induced infinity norm
    double err;
    *max_err = 0;
    for(rocblas_int b = 0; b < bc; ++b)
    {
        // solutions are only compared for instances the device reported as non-singular
        if(hInfoRes[b][0] == 0)
        {
            err = norm_error('I', n, nrhs, ldb, hB[b], hBRes[b], ldx);
            *max_err = err > *max_err ? err : *max_err;
        }
    }

    // also check info for singularities
    err = 0;
    for(rocblas_int b = 0; b < bc; ++b)
        if(hInfo[b][0] != hInfoRes[b][0])
            err++;
    *max_err += err;
}

// Collects timing data for gesv_outofplace.
//
// Unless `perf` is set, the host reference (cpu_gesv over all batch instances)
// is timed with get_time_us_no_sync and stored in *cpu_time_used; in perf mode
// *cpu_time_used is left untouched. Two untimed "cold" device calls follow,
// then `hot_calls` timed device calls. The inputs are re-initialised before
// every call, outside the timed region.
//
// *gpu_time_used is accumulated with += and finally divided by hot_calls, so
// the caller must pass it in initialised (the visible caller passes 0).
// The status returned by the timed rocsolver_gesv_outofplace call is not
// checked. When profile > 0 the rocSOLVER logging layer is switched to profile
// mode (plus kernel logging if profile_kernels) before the hot loop.
template void gesv_outofplace_getPerfData(const rocblas_handle handle,
                                          const rocblas_int n,
                                          const rocblas_int nrhs,
                                          Td& dA,
                                          const rocblas_int lda,
                                          const rocblas_stride stA,
                                          Ud& dIpiv,
                                          const rocblas_stride stP,
                                          Td& dB,
                                          const rocblas_int ldb,
                                          const rocblas_stride stB,
                                          Td& dX,
                                          const rocblas_int ldx,
                                          const rocblas_stride stX,
                                          Ud& dInfo,
                                          const rocblas_int bc,
                                          Th& hA,
                                          Uh& hIpiv,
                                          Th& hB,
                                          Uh& hInfo,
                                          double* gpu_time_used,
                                          double* cpu_time_used,
                                          const rocblas_int hot_calls,
                                          const int profile,
                                          const bool profile_kernels,
                                          const bool perf,
                                          const bool singular)
{
    if(!perf)
    {
        gesv_outofplace_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA,
                                 hIpiv, hB, singular);

        // cpu-lapack performance (only if not in perf mode)
        *cpu_time_used = get_time_us_no_sync();
        for(rocblas_int b = 0; b < bc; ++b)
        {
            cpu_gesv(n, nrhs, hA[b], lda, hIpiv[b], hB[b], ldb, hInfo[b]);
        }
        *cpu_time_used = get_time_us_no_sync() - *cpu_time_used;
    }

    gesv_outofplace_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv,
                             hB, singular);

    // cold calls
    for(int iter = 0; iter < 2; iter++)
    {
        gesv_outofplace_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA,
                                 hIpiv, hB, singular);

        CHECK_ROCBLAS_ERROR(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA.data(), lda, stA,
                                                      dIpiv.data(), stP, dB.data(), ldb, stB,
                                                      dX.data(), ldx, stX, dInfo.data(), bc));
    }

    // gpu-lapack performance
    hipStream_t stream;
    CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
    double start;

    if(profile > 0)
    {
        if(profile_kernels)
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile
                                         | rocblas_layer_mode_ex_log_kernel);
        else
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile);
        rocsolver_log_set_max_levels(profile);
    }

    for(rocblas_int iter = 0; iter < hot_calls; iter++)
    {
        gesv_outofplace_initData(handle,
n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA,
                                 hIpiv, hB, singular);

        start = get_time_us_sync(stream);
        rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP,
                                  dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc);
        *gpu_time_used += get_time_us_sync(stream) - start;
    }
    *gpu_time_used /= hot_calls;
}

// Main test / benchmark driver for gesv_outofplace.
//
// Flow, in order:
//   1. Read the problem description from `argus`. nrhs, lda, ldb and ldx
//      default to n; the strides default to lda*n, n, ldb*nrhs and ldx*nrhs.
//   2. If any size is invalid (n < 0, nrhs < 0, lda/ldb/ldx < n, bc < 0),
//      expect rocblas_status_invalid_size from a call made with null pointers
//      and return.
//   3. If a memory query was requested, or USE_ROCBLAS_REALLOC_ON_DEMAND is
//      false, query the required device workspace; either report it and
//      return (mem_query) or set it on the handle.
//   4. Allocate host and device buffers (batched or strided containers
//      depending on BATCHED). hBRes is only given a non-zero size when a
//      unit or norm check will actually read it.
//   5. For n == 0, nrhs == 0 or bc == 0, expect a successful quick return and
//      stop.
//   6. Run gesv_outofplace_getError and/or gesv_outofplace_getPerfData as
//      selected by unit_check / norm_check / timing.
//   7. Under unit_check, validate max_error via ROCSOLVER_TEST_CHECK(T,
//      max_error, n); under timing, print the bench output.
//   8. argus.validate_consumed() is called last. The early-return paths above
//      (invalid size, mem_query, quick return) do not reach it.
//
// NOTE(review): BATCHED, STRIDED and T are not declared in the visible text;
// presumably template parameters whose list was stripped -- confirm upstream.
template void testing_gesv_outofplace(Arguments& argus)
{
    // get arguments
    rocblas_local_handle handle;
    rocblas_int n = argus.get("n");
    rocblas_int nrhs = argus.get("nrhs", n);
    rocblas_int lda = argus.get("lda", n);
    rocblas_int ldb = argus.get("ldb", n);
    rocblas_int ldx = argus.get("ldx", n);
    rocblas_stride stA = argus.get("strideA", lda * n);
    rocblas_stride stP = argus.get("strideP", n);
    rocblas_stride stB = argus.get("strideB", ldb * nrhs);
    rocblas_stride stX = argus.get("strideX", ldx * nrhs);

    rocblas_int bc = argus.batch_count;
    rocblas_int hot_calls = argus.iters;

    // the host copy of the result is only needed when results are checked
    rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stX : 0;

    // check non-supported values
    // N/A

    // determine sizes
    size_t size_A = size_t(lda) * n;
    size_t size_B = size_t(ldb) * nrhs;
    size_t size_X = size_t(ldx) * nrhs;
    size_t size_P = size_t(n);
    double max_error = 0, gpu_time_used = 0, cpu_time_used = 0;

    size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_X : 0;

    // check invalid sizes
    bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || ldx < n || bc < 0);
    if(invalid_size)
    {
        if(BATCHED)
            EXPECT_ROCBLAS_STATUS(
                rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, (T* const*)nullptr, lda, stA,
                                          (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldb, stB,
                                          (T* const*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc),
                rocblas_status_invalid_size);
        else
            EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, (T*)nullptr,
                                                            lda, stA, (rocblas_int*)nullptr, stP,
                                                            (T*)nullptr, ldb, stB, (T*)nullptr, ldx,
                                                            stX, (rocblas_int*)nullptr, bc),
                                  rocblas_status_invalid_size);

        if(argus.timing)
            rocsolver_bench_inform(inform_invalid_size);

        return;
    }

    // memory size query is necessary
    if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND)
    {
        CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle));
        if(BATCHED)
            CHECK_ALLOC_QUERY(
                rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, (T* const*)nullptr, lda, stA,
                                          (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldb, stB,
                                          (T* const*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc));
        else
            CHECK_ALLOC_QUERY(rocsolver_gesv_outofplace(
                STRIDED, handle, n, nrhs, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP,
                (T*)nullptr, ldb, stB, (T*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc));

        size_t size;
        CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size));
        if(argus.mem_query)
        {
            rocsolver_bench_inform(inform_mem_query, size);
            return;
        }

        CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size));
    }

    if(BATCHED)
    {
        // memory allocations
        host_batch_vector hA(size_A, 1, bc);
        host_batch_vector hB(size_B, 1, bc);
        host_batch_vector hBRes(size_BRes, 1, bc);
        host_strided_batch_vector hIpiv(size_P, 1, stP, bc);
        host_strided_batch_vector hInfo(1, 1, 1, bc);
        host_strided_batch_vector hInfoRes(1, 1, 1, bc);
        device_batch_vector dA(size_A, 1, bc);
        device_batch_vector dB(size_B, 1, bc);
        device_batch_vector dX(size_X, 1, bc);
        device_strided_batch_vector dIpiv(size_P, 1, stP, bc);
        device_strided_batch_vector dInfo(1, 1, 1, bc);
        if(size_A)
            CHECK_HIP_ERROR(dA.memcheck());
        if(size_B)
            CHECK_HIP_ERROR(dB.memcheck());
        if(size_X)
            CHECK_HIP_ERROR(dX.memcheck());
        if(size_P)
            CHECK_HIP_ERROR(dIpiv.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check quick return
        if(n == 0 || nrhs == 0 || bc == 0)
        {
            EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA.data(), lda,
                                                            stA, dIpiv.data(), stP, dB.data(), ldb,
                                                            stB, dX.data(), ldx, stX, dInfo.data(),
                                                            bc),
                                  rocblas_status_success);
            if(argus.timing)
                rocsolver_bench_inform(inform_quick_return);

            return;
        }

        // check computations
        if(argus.unit_check || argus.norm_check)
            gesv_outofplace_getError(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX,
                                     ldx, stX, dInfo, bc, hA, hIpiv, hB, hBRes, hInfo, hInfoRes,
                                     &max_error, argus.singular);

        // collect performance data
        if(argus.timing)
            gesv_outofplace_getPerfData(
                handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA,
                hIpiv, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile,
                argus.profile_kernels, argus.perf, argus.singular);
    }

    else
    {
        // memory allocations
        host_strided_batch_vector hA(size_A, 1, stA, bc);
        host_strided_batch_vector hB(size_B, 1, stB, bc);
        host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc);
        host_strided_batch_vector hIpiv(size_P, 1, stP, bc);
        host_strided_batch_vector hInfo(1, 1, 1, bc);
        host_strided_batch_vector hInfoRes(1, 1, 1, bc);
        device_strided_batch_vector dA(size_A, 1, stA, bc);
        device_strided_batch_vector dB(size_B, 1, stB, bc);
        device_strided_batch_vector dX(size_X, 1, stX, bc);
        device_strided_batch_vector dIpiv(size_P, 1, stP, bc);
        device_strided_batch_vector dInfo(1, 1, 1, bc);
        if(size_A)
            CHECK_HIP_ERROR(dA.memcheck());
        if(size_B)
            CHECK_HIP_ERROR(dB.memcheck());
        if(size_X)
            CHECK_HIP_ERROR(dX.memcheck());
        if(size_P)
            CHECK_HIP_ERROR(dIpiv.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check quick return
        if(n == 0 || nrhs == 0 || bc == 0)
        {
            EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA.data(), lda,
                                                            stA, dIpiv.data(), stP, dB.data(), ldb,
                                                            stB, dX.data(), ldx, stX, dInfo.data(),
                                                            bc),
                                  rocblas_status_success);
            if(argus.timing)
                rocsolver_bench_inform(inform_quick_return);

            return;
        }

        // check computations
        if(argus.unit_check || argus.norm_check)
            gesv_outofplace_getError(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX,
                                     ldx, stX, dInfo, bc, hA, hIpiv, hB, hBRes, hInfo, hInfoRes,
                                     &max_error, argus.singular);

        // collect performance data
        if(argus.timing)
            gesv_outofplace_getPerfData(
                handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA,
                hIpiv, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile,
                argus.profile_kernels, argus.perf, argus.singular);
    }

    // validate results for rocsolver-test
    // using n * machine_precision as tolerance
    if(argus.unit_check)
        ROCSOLVER_TEST_CHECK(T, max_error, n);

    // output results for rocsolver-bench
    if(argus.timing)
    {
        if(!argus.perf)
        {
            rocsolver_bench_header("Arguments:");
            if(BATCHED)
            {
                rocsolver_bench_output("n", "nrhs", "lda", "ldb", "ldx", "strideP", "batch_c");
                rocsolver_bench_output(n, nrhs, lda, ldb, ldx, stP, bc);
            }
            else if(STRIDED)
            {
                rocsolver_bench_output("n", "nrhs", "lda", "ldb", "ldx", "strideA", "strideP",
                                       "strideB", "strideX", "batch_c");
                rocsolver_bench_output(n, nrhs, lda, ldb, ldx, stA, stP, stB, stX, bc);
            }
            else
            {
                rocsolver_bench_output("n", "nrhs", "lda", "ldb", "ldx");
                rocsolver_bench_output(n, nrhs, lda, ldb, ldx);
            }
            rocsolver_bench_header("Results:");
            if(argus.norm_check)
            {
                rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error");
                rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error);
            }
            else
            {
                rocsolver_bench_output("cpu_time_us", "gpu_time_us");
                rocsolver_bench_output(cpu_time_used, gpu_time_used);
            }
            rocsolver_bench_endl();
        }
        else
        {
            if(argus.norm_check)
                rocsolver_bench_output(gpu_time_used, max_error);
            else
                rocsolver_bench_output(gpu_time_used);
        }
    }

    // ensure all arguments were consumed
    argus.validate_consumed();
}
rocSOLVER-rocm-5.5.1/clients/include/testing_gesvd.hpp000066400000000000000000001075041436600607200226530ustar00rootroot00000000000000
/* ************************************************************************
 * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
 * ************************************************************************ */

#pragma once

#include "client_util.hpp"
#include "clientcommon.hpp"
#include "lapack_host_reference.hpp"
#include "norm.hpp"
#include "rocsolver.hpp"
#include "rocsolver_arguments.hpp"
#include "rocsolver_test.hpp"

// Exercises the argument validation of rocsolver_gesvd.
//
// Each EXPECT_ROCBLAS_STATUS call perturbs exactly one argument of an
// otherwise valid call and states the status that must come back:
//   - null handle                                   -> invalid_handle
//   - rocblas_svect(0) for either side, or
//     overwrite requested for both sides            -> invalid_value
//   - batch count of -1 (only when STRIDED)         -> invalid_size
//   - a null pointer for dA, dS, dU, dV, dE, dinfo  -> invalid_pointer
//   - m == 0 or n == 0 with null data pointers      -> success (quick return)
//   - batch count of 0 with null dinfo (STRIDED)    -> success
//
// NOTE(review): STRIDED is not declared in the visible text; presumably a
// template parameter whose list was stripped -- confirm upstream.
template void gesvd_checkBadArgs(const rocblas_handle handle,
                                 const rocblas_svect left_svect,
                                 const rocblas_svect right_svect,
                                 const rocblas_int m,
                                 const rocblas_int n,
                                 W dA,
                                 const rocblas_int lda,
                                 const rocblas_stride stA,
                                 TT dS,
                                 const rocblas_stride stS,
                                 T dU,
                                 const rocblas_int ldu,
                                 const rocblas_stride stU,
                                 T dV,
                                 const rocblas_int ldv,
                                 const rocblas_stride stV,
                                 TT dE,
                                 const rocblas_stride stE,
                                 const rocblas_workmode fa,
                                 U dinfo,
                                 const rocblas_int bc)
{
    // handle
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, nullptr, left_svect, right_svect, m, n, dA, lda,
                                          stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa,
                                          dinfo, bc),
                          rocblas_status_invalid_handle);

    // values
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, rocblas_svect(0), right_svect, m, n, dA,
                                          lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE,
                                          fa, dinfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, rocblas_svect(0), m, n, dA,
                                          lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE,
                                          fa, dinfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, rocblas_svect_overwrite,
                                          rocblas_svect_overwrite, m, n, dA, lda, stA, dS, stS, dU,
                                          ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc),
                          rocblas_status_invalid_value);

    // sizes (only check batch_count if applicable)
    if(STRIDED)
EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, (W) nullptr, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, (TT) nullptr, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, (T) nullptr, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, (T) nullptr, ldv, stV, dE, stE, fa, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, (TT) nullptr, stE, fa, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, 0, n, (W) nullptr, lda, stA, (TT) nullptr, stS, (T) nullptr, ldu, stU, dV, ldv, stV, (TT) nullptr, stE, fa, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, 0, (W) nullptr, lda, stA, (TT) nullptr, stS, dU, ldu, stU, (T) nullptr, ldv, stV, (TT) nullptr, stE, fa, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) 
EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda,
                                              stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa,
                                              (U) nullptr, 0),
                              rocblas_status_success);
}

// Bad-argument test driver for gesvd.
//
// Builds a set of "safe" arguments (all singular vectors requested on both
// sides, m = n = 2, every leading dimension and stride equal to 2, one batch
// instance, out-of-place work mode), allocates device buffers, verifies them
// with memcheck(), and forwards everything to gesvd_checkBadArgs.
//
// Only dA differs between the two branches: device_batch_vector when BATCHED,
// device_strided_batch_vector otherwise. Every buffer is created with a
// single element even though m = n = 2.
// (presumably safe because gesvd_checkBadArgs only expects argument-check and
// quick-return statuses -- confirm that no call there reaches a computation)
//
// The alias S is declared but not referenced in the visible body.
// (presumably it appeared in a stripped <...> list -- confirm upstream)
template void testing_gesvd_bad_arg()
{
    using S = decltype(std::real(T{}));

    // safe arguments
    rocblas_local_handle handle;
    rocblas_svect left_svect = rocblas_svect_all;
    rocblas_svect right_svect = rocblas_svect_all;
    rocblas_int m = 2;
    rocblas_int n = 2;
    rocblas_int lda = 2;
    rocblas_int ldu = 2;
    rocblas_int ldv = 2;
    rocblas_stride stA = 2;
    rocblas_stride stS = 2;
    rocblas_stride stU = 2;
    rocblas_stride stV = 2;
    rocblas_stride stE = 2;
    rocblas_int bc = 1;
    rocblas_workmode fa = rocblas_outofplace;

    if(BATCHED)
    {
        // memory allocations
        device_batch_vector dA(1, 1, 1);
        device_strided_batch_vector dS(1, 1, 1, 1);
        device_strided_batch_vector dU(1, 1, 1, 1);
        device_strided_batch_vector dV(1, 1, 1, 1);
        device_strided_batch_vector dE(1, 1, 1, 1);
        device_strided_batch_vector dinfo(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dS.memcheck());
        CHECK_HIP_ERROR(dU.memcheck());
        CHECK_HIP_ERROR(dV.memcheck());
        CHECK_HIP_ERROR(dE.memcheck());
        CHECK_HIP_ERROR(dinfo.memcheck());

        // check bad arguments
        gesvd_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(),
                           stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa,
                           dinfo.data(), bc);
    }
    else
    {
        // memory allocations
        device_strided_batch_vector dA(1, 1, 1, 1);
        device_strided_batch_vector dS(1, 1, 1, 1);
        device_strided_batch_vector dU(1, 1, 1, 1);
        device_strided_batch_vector dV(1, 1, 1, 1);
        device_strided_batch_vector dE(1, 1, 1, 1);
        device_strided_batch_vector dinfo(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dS.memcheck());
        CHECK_HIP_ERROR(dU.memcheck());
        CHECK_HIP_ERROR(dV.memcheck());
        CHECK_HIP_ERROR(dE.memcheck());
        CHECK_HIP_ERROR(dinfo.memcheck());

        // check bad arguments
        gesvd_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(),
                           stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa,
                           dinfo.data(), bc);
    }
}

// Prepares the input matrix for one gesvd run.
//
// When CPU is set: fills hA through rocblas_init, then for every batch
// instance adds 400 to the entries with i == j and subtracts 4 from all other
// entries of the m-by-n matrix ("scale A to avoid singularities"). If `test`
// is true and singular vectors are requested on at least one side, the
// prepared matrix is also saved into the flat vector A, instance b starting
// at offset b * lda * n, so that the vectors can later be verified against
// the unmodified input.
//
// When GPU is set: copies hA to dA.
//
// A is only written under the `test` condition above; callers that pass
// test = 0 may therefore hand in an empty vector (as the visible
// gesvd_getPerfData does). `handle` is accepted but not read in this body.
//
// NOTE(review): CPU and GPU are not declared in the visible text; presumably
// boolean template parameters -- confirm against the upstream header.
template void gesvd_initData(const rocblas_handle handle,
                             const rocblas_svect left_svect,
                             const rocblas_svect right_svect,
                             const rocblas_int m,
                             const rocblas_int n,
                             Td& dA,
                             const rocblas_int lda,
                             const rocblas_int bc,
                             Th& hA,
                             std::vector& A,
                             bool test = true)
{
    if(CPU)
    {
        rocblas_init(hA, true);

        for(rocblas_int b = 0; b < bc; ++b)
        {
            // scale A to avoid singularities
            for(rocblas_int i = 0; i < m; i++)
            {
                for(rocblas_int j = 0; j < n; j++)
                {
                    if(i == j)
                        hA[b][i + j * lda] += 400;
                    else
                        hA[b][i + j * lda] -= 4;
                }
            }

            // make copy of original data to test vectors if required
            if(test && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none))
            {
                for(rocblas_int i = 0; i < m; i++)
                {
                    for(rocblas_int j = 0; j < n; j++)
                        A[b * lda * n + i + j * lda] = hA[b][i + j * lda];
                }
            }
        }
    }

    if(GPU)
    {
        // now copy to the GPU
        CHECK_HIP_ERROR(dA.transfer_from(hA));
    }
}

// Runs gesvd on the device and on the host and reports two error measures.
//
// Outputs:
//   *max_err  - first set to the number of batch instances whose host and
//               device info values differ, then raised to the largest
//               per-instance norm_error('F', ...) between hS and hSres
//               whenever that is larger (maximum, not sum).
//   *max_errv - largest per-instance residual of the implicit vector check
//               A*v_k = s_k*u_k, divided by the Frobenius norm of A; only
//               evaluated for instances with hinfo == 0 and when vectors are
//               requested on at least one side.
//
// Because the vector check needs both U and V, a complementary in-place call
// with the *T arguments (left_svectT, right_svectT, mT, nT, dUT, dVT, ...) is
// issued first to obtain the side the main call does not compute; its result
// is copied into Ures or Vres. The inputs are then re-initialised before the
// host reference and the main device call are run. The host reference is
// called with rocblas_svect_none on both sides, i.e. only singular values are
// compared directly against it.
template void gesvd_getError(const rocblas_handle handle,
                             const rocblas_svect left_svect,
                             const rocblas_svect right_svect,
                             const rocblas_int m,
                             const rocblas_int n,
                             Wd& dA,
                             const rocblas_int lda,
                             const rocblas_stride stA,
                             Td& dS,
                             const rocblas_stride stS,
                             Ud& dU,
                             const rocblas_int ldu,
                             const rocblas_stride stU,
                             Ud& dV,
                             const rocblas_int ldv,
                             const rocblas_stride stV,
                             Td& dE,
                             const rocblas_stride stE,
                             const rocblas_workmode fa,
                             Id& dinfo,
                             const rocblas_int bc,
                             const rocblas_svect left_svectT,
                             const rocblas_svect right_svectT,
                             const rocblas_int mT,
                             const rocblas_int nT,
                             Ud& dUT,
                             const rocblas_int lduT,
                             const rocblas_stride stUT,
                             Ud& dVT,
                             const rocblas_int ldvT,
                             const rocblas_stride stVT,
                             Wh& hA,
                             Th& hS,
                             Th& hSres,
                             Uh& hU,
                             Uh& Ures,
                             const rocblas_int ldures,
                             Uh& hV,
                             Uh& Vres,
                             const rocblas_int ldvres,
                             Ih& hinfo,
                             Ih& hinfoRes,
                             double* max_err,
                             double* max_errv)
{
    using W = decltype(std::real(T{}));

    rocblas_int lwork = 5 * max(m, n);
    rocblas_int lrwork = (rocblas_is_complex ?
5 * min(m, n) : 0); std::vector work(lwork); std::vector rwork(lrwork); std::vector A(lda * n * bc); // input data initialization gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // execute computations: // complementary execution to compute all singular vectors if needed (always in-place to ensure // we don't combine results computed by gemm_batched with results computed by gemm_strided_batched) CHECK_ROCBLAS_ERROR(rocsolver_gesvd(STRIDED, handle, left_svectT, right_svectT, mT, nT, dA.data(), lda, stA, dS.data(), stS, dUT.data(), lduT, stUT, dVT.data(), ldvT, stVT, dE.data(), stE, rocblas_inplace, dinfo.data(), bc)); if(left_svect == rocblas_svect_none && right_svect != rocblas_svect_none) CHECK_HIP_ERROR(Ures.transfer_from(dUT)); if(right_svect == rocblas_svect_none && left_svect != rocblas_svect_none) CHECK_HIP_ERROR(Vres.transfer_from(dVT)); gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(rocblas_svect_none, rocblas_svect_none, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa, dinfo.data(), bc)); CHECK_HIP_ERROR(hSres.transfer_from(dS)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(left_svect == rocblas_svect_singular || left_svect == rocblas_svect_all) CHECK_HIP_ERROR(Ures.transfer_from(dU)); if(right_svect == rocblas_svect_singular || right_svect == rocblas_svect_all) CHECK_HIP_ERROR(Vres.transfer_from(dV)); if(left_svect == rocblas_svect_overwrite) { CHECK_HIP_ERROR(hA.transfer_from(dA)); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < min(m, n); j++) Ures[b][i + j * ldures] = hA[b][i + j * lda]; } } } if(right_svect == rocblas_svect_overwrite) { 
CHECK_HIP_ERROR(hA.transfer_from(dA)); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < min(m, n); i++) { for(rocblas_int j = 0; j < n; j++) Vres[b][i + j * ldvres] = hA[b][i + j * lda]; } } } // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err; *max_errv = 0; for(rocblas_int b = 0; b < bc; ++b) { // error is ||hS - hSres|| err = norm_error('F', 1, min(m, n), 1, hS[b], hSres[b]); *max_err = err > *max_err ? err : *max_err; // Check the singular vectors if required if(hinfo[b][0] == 0 && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { err = 0; // check singular vectors implicitly (A*v_k = s_k*u_k) for(rocblas_int k = 0; k < min(m, n); ++k) { for(rocblas_int i = 0; i < m; ++i) { T tmp = 0; for(rocblas_int j = 0; j < n; ++j) tmp += A[b * lda * n + i + j * lda] * sconj(Vres[b][k + j * ldvres]); tmp -= hSres[b][k] * Ures[b][i + k * ldures]; err += std::abs(tmp) * std::abs(tmp); } } err = std::sqrt(err) / double(snorm('F', m, n, A.data() + b * lda * n, lda)); *max_errv = err > *max_errv ? 
err : *max_errv;
        }
    }
}

// Collects timing data for gesvd.
//
// Unless `perf` is set, the host reference (cpu_gesvd over all batch
// instances, with the requested left/right vector modes) is timed with
// get_time_us_no_sync and stored in *cpu_time_used; in perf mode
// *cpu_time_used is left untouched. Two untimed "cold" device calls follow,
// then `hot_calls` timed device calls. The input matrix is re-initialised
// before every call, outside the timed region.
//
// gesvd_initData is always invoked with test = 0 here, so the local vector A
// stays empty and is never written.
//
// *gpu_time_used is accumulated with += and finally divided by hot_calls, so
// the caller must pass it in initialised. The status returned by the timed
// rocsolver_gesvd call is not checked. When profile > 0 the rocSOLVER logging
// layer is switched to profile mode (plus kernel logging if profile_kernels)
// before the hot loop.
template void gesvd_getPerfData(const rocblas_handle handle,
                                const rocblas_svect left_svect,
                                const rocblas_svect right_svect,
                                const rocblas_int m,
                                const rocblas_int n,
                                Wd& dA,
                                const rocblas_int lda,
                                const rocblas_stride stA,
                                Td& dS,
                                const rocblas_stride stS,
                                Ud& dU,
                                const rocblas_int ldu,
                                const rocblas_stride stU,
                                Ud& dV,
                                const rocblas_int ldv,
                                const rocblas_stride stV,
                                Td& dE,
                                const rocblas_stride stE,
                                const rocblas_workmode fa,
                                Id& dinfo,
                                const rocblas_int bc,
                                Wh& hA,
                                Th& hS,
                                Uh& hU,
                                Uh& hV,
                                Ih& hinfo,
                                double* gpu_time_used,
                                double* cpu_time_used,
                                const rocblas_int hot_calls,
                                const int profile,
                                const bool profile_kernels,
                                const bool perf)
{
    using W = decltype(std::real(T{}));

    // host workspace for cpu_gesvd; rwork is only sized for complex types
    rocblas_int lwork = 5 * max(m, n);
    rocblas_int lrwork = (rocblas_is_complex ? 5 * min(m, n) : 0);
    std::vector work(lwork);
    std::vector rwork(lrwork);
    std::vector A;

    if(!perf)
    {
        gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0);

        // cpu-lapack performance (only if not in perf mode)
        *cpu_time_used = get_time_us_no_sync();
        for(rocblas_int b = 0; b < bc; ++b)
            cpu_gesvd(left_svect, right_svect, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv,
                      work.data(), lwork, rwork.data(), hinfo[b]);
        *cpu_time_used = get_time_us_no_sync() - *cpu_time_used;
    }

    gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0);

    // cold calls
    for(int iter = 0; iter < 2; iter++)
    {
        gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0);

        CHECK_ROCBLAS_ERROR(rocsolver_gesvd(
            STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS,
            dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa, dinfo.data(), bc));
    }

    // gpu-lapack performance
    hipStream_t stream;
    CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
    double start;

    if(profile > 0)
    {
        if(profile_kernels)
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile
                                         | rocblas_layer_mode_ex_log_kernel);
        else
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile);
        rocsolver_log_set_max_levels(profile);
    }

    for(rocblas_int iter = 0; iter < hot_calls; iter++)
    {
        gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0);

        start = get_time_us_sync(stream);
        rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA,
                        dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa,
                        dinfo.data(), bc);
        *gpu_time_used += get_time_us_sync(stream) - start;
    }
    *gpu_time_used /= hot_calls;
}

template void testing_gesvd(Arguments& argus)
{
    using S = decltype(std::real(T{}));

    // get arguments
    rocblas_local_handle handle;
    char leftvC = argus.get("left_svect");
    char rightvC = argus.get("right_svect");
    rocblas_int m = argus.get("m");
    rocblas_int n = argus.get("n", m);
    rocblas_int lda = argus.get("lda", m);
    rocblas_int ldu = argus.get("ldu", m);
    rocblas_int ldv = argus.get("ldv", (rightvC == 'A' ? n : min(m, n)));
    rocblas_stride stA = argus.get("strideA", lda * n);
    rocblas_stride stS = argus.get("strideS", min(m, n));
    rocblas_stride stU = argus.get("strideU", ldu * m);
    rocblas_stride stV = argus.get("strideV", ldv * n);
    rocblas_stride stE = argus.get("strideE", min(m, n) - 1);
    char faC = argus.get("fast_alg");

    rocblas_svect leftv = char2rocblas_svect(leftvC);
    rocblas_svect rightv = char2rocblas_svect(rightvC);
    rocblas_workmode fa = char2rocblas_workmode(faC);
    rocblas_int bc = argus.batch_count;
    rocblas_int hot_calls = argus.iters;

    // check non-supported values
    if(rightv == rocblas_svect_overwrite && leftv == rocblas_svect_overwrite)
    {
        if(BATCHED)
            EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n,
                                                  (T* const*)nullptr, lda, stA, (S*)nullptr, stS,
                                                  (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV,
                                                  (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc),
                                  rocblas_status_invalid_value);
        else
            EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr,
                                                  lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu,
stU, (T*)nullptr, ldv, stV, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } /** TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY, NOT EXPLICITLY COMPARING WITH LAPACK. SO, WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF THE RIGHT AND LEFT VECTORS. WHILE DOING THIS, IF MORE VECTORS THAN THE SPECIFIED IN THE MAIN CALL NEED TO BE COMPUTED, WE DO SO WITH AN EXTRA CALL **/ rocblas_svect leftvT = rocblas_svect_none; rocblas_svect rightvT = rocblas_svect_none; rocblas_int ldvT = 1; rocblas_int lduT = 1; rocblas_int mT = 0; rocblas_int nT = 0; bool svects = (leftv != rocblas_svect_none || rightv != rocblas_svect_none); if(svects) { if(leftv == rocblas_svect_none) { leftvT = rocblas_svect_all; lduT = m; mT = m; nT = n; if((n > m && fa == rocblas_outofplace) || (n > m && rightv == rocblas_svect_overwrite)) rightvT = rocblas_svect_overwrite; } if(rightv == rocblas_svect_none) { rightvT = rocblas_svect_all; ldvT = n; mT = m; nT = n; if((m >= n && fa == rocblas_outofplace) || (m >= n && leftv == rocblas_svect_overwrite)) leftvT = rocblas_svect_overwrite; } } // determine sizes rocblas_int ldures = 1; rocblas_int ldvres = 1; size_t size_Sres = 0; size_t size_Ures = 0; size_t size_Vres = 0; size_t size_UT = 0; size_t size_VT = 0; size_t size_A = size_t(lda) * n; size_t size_S = size_t(min(m, n)); size_t size_E = size_t(min(m, n) - 1); size_t size_V = size_t(ldv) * n; size_t size_U = size_t(ldu) * m; if(argus.unit_check || argus.norm_check) { size_VT = size_t(ldvT) * nT; size_UT = size_t(lduT) * mT; size_Sres = size_S; if(svects) { if(leftv == rocblas_svect_none) { size_Ures = size_UT; ldures = lduT; } else if(leftv == rocblas_svect_singular || leftv == rocblas_svect_all) { size_Ures = size_U; ldures = ldu; } else { size_Ures = m * m; ldures = m; } if(rightv == rocblas_svect_none) { size_Vres = size_VT; ldvres = ldvT; } else if(rightv == rocblas_svect_singular || rightv 
== rocblas_svect_all) { size_Vres = size_V; ldvres = ldv; } else { size_Vres = n * n; ldvres = n; } } } rocblas_stride stUT = size_UT; rocblas_stride stVT = size_VT; rocblas_stride stUres = size_Ures; rocblas_stride stVres = size_Vres; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || m < 0 || lda < m || ldu < 1 || ldv < 1 || bc < 0) || ((leftv == rocblas_svect_all || leftv == rocblas_svect_singular) && ldu < m) || ((rightv == rocblas_svect_all && ldv < n) || (rightv == rocblas_svect_singular && ldv < min(m, n))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) { CHECK_ALLOC_QUERY(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvd(STRIDED, handle, leftvT, rightvT, mT, nT, (T* const*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc)); } else { CHECK_ALLOC_QUERY(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (S*)nullptr, 
stE, fa, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvd(STRIDED, handle, leftvT, rightvT, mT, nT, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc)); } size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hS(size_S, 1, stS, bc); host_strided_batch_vector hV(size_V, 1, stV, bc); host_strided_batch_vector hU(size_U, 1, stU, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hSres(size_Sres, 1, stS, bc); host_strided_batch_vector Vres(size_Vres, 1, stVres, bc); host_strided_batch_vector Ures(size_Ures, 1, stUres, bc); // device device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dS(size_S, 1, stS, bc); device_strided_batch_vector dV(size_V, 1, stV, bc); device_strided_batch_vector dU(size_U, 1, stU, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); device_strided_batch_vector dVT(size_VT, 1, stVT, bc); device_strided_batch_vector dUT(size_UT, 1, stUT, bc); if(size_VT) CHECK_HIP_ERROR(dVT.memcheck()); if(size_UT) CHECK_HIP_ERROR(dUT.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, 
dE.data(), stE, fa, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvd_getError(handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvd_getPerfData(handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvd_getError(handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvd_getPerfData(handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * min(m, n) * machine_precision as 
tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * min(m, n)); if(svects) ROCSOLVER_TEST_CHECK(T, max_errorv, 2 * min(m, n)); } // output results for rocsolver-bench if(argus.timing) { if(svects) max_error = (max_error >= max_errorv) ? max_error : max_errorv; if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "strideS", "ldu", "strideU", "ldv", "strideV", "strideE", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, stS, ldu, stU, ldv, stV, stE, bc); } else if(STRIDED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "strideA", "strideS", "ldu", "strideU", "ldv", "strideV", "strideE", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, stA, stS, ldu, stU, ldv, stV, stE, bc); } else { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "ldu", "ldv"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, ldu, ldv); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GESVD(...) extern template void testing_gesvd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GESVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_gesvdj.hpp000066400000000000000000001121651436600607200230240ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void gesvdj_checkBadArgs(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, S dResidual, const rocblas_int max_sweeps, I dSweeps, S dS, const rocblas_stride stS, U dU, const rocblas_int ldu, const rocblas_stride stU, U dV, const rocblas_int ldv, const rocblas_stride stV, I dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, nullptr, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, rocblas_svect_overwrite, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, rocblas_svect_overwrite, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); 
EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, (I) nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, (U) nullptr, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, (U) nullptr, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, (I) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, 0, n, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, (U) nullptr, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, 0, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, dU, ldu, stU, (U) nullptr, ldv, stV, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) 
EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, (I) nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, (I) nullptr, 0), rocblas_status_success); } template void testing_gesvdj_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_svect left_svect = rocblas_svect_singular; rocblas_svect right_svect = rocblas_svect_singular; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; rocblas_int ldu = 2; rocblas_int ldv = 2; rocblas_stride stA = 2; rocblas_stride stS = 2; rocblas_stride stU = 2; rocblas_stride stV = 2; rocblas_int bc = 1; S abstol = 0; rocblas_int max_sweeps = 100; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesvdj_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); 
CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesvdj_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); } } template void gesvdj_initData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gesvdj_getError(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, Td& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Id& dinfo, const rocblas_int bc, const rocblas_svect left_svectT, const rocblas_svect right_svectT, const rocblas_int mT, const rocblas_int nT, Ud& dUT, const rocblas_int lduT, const rocblas_stride stUT, Ud& dVT, const rocblas_int ldvT, const rocblas_stride stVT, Wh& hA, Th& hResidualRes, Ih& hSweepsRes, Th& hS, Th& 
hSres, Uh& hU, Uh& Ures, const rocblas_int ldures, Uh& hV, Uh& Vres, const rocblas_int ldvres, Ih& hinfo, Ih& hinfoRes, double* max_err, double* max_errv) { rocblas_int lwork = 5 * max(m, n); rocblas_int lrwork = (rocblas_is_complex ? 5 * min(m, n) : 0); std::vector work(lwork); std::vector rwork(lrwork); std::vector A(lda * n * bc); // input data initialization gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // execute computations: // complementary execution to compute all singular vectors if needed CHECK_ROCBLAS_ERROR(rocsolver_gesvdj(STRIDED, handle, left_svectT, right_svectT, mT, nT, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dUT.data(), lduT, stUT, dVT.data(), ldvT, stVT, dinfo.data(), bc)); if(left_svect == rocblas_svect_none && right_svect != rocblas_svect_none) CHECK_HIP_ERROR(Ures.transfer_from(dUT)); if(right_svect == rocblas_svect_none && left_svect != rocblas_svect_none) CHECK_HIP_ERROR(Vres.transfer_from(dVT)); gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(rocblas_svect_none, rocblas_svect_none, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc)); CHECK_HIP_ERROR(hResidualRes.transfer_from(dResidual)); CHECK_HIP_ERROR(hSweepsRes.transfer_from(dSweeps)); CHECK_HIP_ERROR(hSres.transfer_from(dS)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(left_svect == rocblas_svect_singular || left_svect == rocblas_svect_all) CHECK_HIP_ERROR(Ures.transfer_from(dU)); if(right_svect == rocblas_svect_singular || right_svect == rocblas_svect_all) CHECK_HIP_ERROR(Vres.transfer_from(dV)); // Check info for 
non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; // Also check validity of residual for(rocblas_int b = 0; b < bc; ++b) if(hResidualRes[b][0] < 0) *max_err += 1; // Also check validity of sweeps for(rocblas_int b = 0; b < bc; ++b) if(hSweepsRes[b][0] < 0 || hSweepsRes[b][0] > max_sweeps) *max_err += 1; // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err; *max_errv = 0; for(rocblas_int b = 0; b < bc; ++b) { // error is ||hS - hSres|| err = norm_error('F', 1, min(m, n), 1, hS[b], hSres[b]); *max_err = err > *max_err ? err : *max_err; // Check the singular vectors if required if(hinfo[b][0] == 0 && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { err = 0; // check singular vectors implicitly (A*v_k = s_k*u_k) for(rocblas_int k = 0; k < min(m, n); ++k) { for(rocblas_int i = 0; i < m; ++i) { T tmp = 0; for(rocblas_int j = 0; j < n; ++j) tmp += A[b * lda * n + i + j * lda] * sconj(Vres[b][k + j * ldvres]); tmp -= hSres[b][k] * Ures[b][i + k * ldures]; err += std::abs(tmp) * std::abs(tmp); } } err = std::sqrt(err) / double(snorm('F', m, n, A.data() + b * lda * n, lda)); *max_errv = err > *max_errv ? 
err : *max_errv; } } } template void gesvdj_getPerfData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, Td& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Id& dinfo, const rocblas_int bc, Wh& hA, Th& hS, Uh& hU, Uh& hV, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { rocblas_int lwork = 5 * max(m, n); rocblas_int lrwork = 5 * min(m, n); std::vector work(lwork); std::vector rwork(lrwork); std::vector A; if(!perf) { gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(left_svect, right_svect, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else 
rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gesvdj(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char leftvC = argus.get("left_svect"); char rightvC = argus.get("right_svect"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_int ldu = argus.get("ldu", m); rocblas_int ldv = argus.get("ldv", (rightvC == 'A' ? n : min(m, n))); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stS = argus.get("strideS", min(m, n)); rocblas_stride stU = argus.get("strideU", (leftvC == 'A' ? 
ldu * m : ldu * min(m, n))); rocblas_stride stV = argus.get("strideV", ldv * n); S abstol = S(argus.get("abstol", 0)); rocblas_int max_sweeps = argus.get("max_sweeps", 100); rocblas_svect leftv = char2rocblas_svect(leftvC); rocblas_svect rightv = char2rocblas_svect(rightvC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if((rightv != rocblas_svect_none && rightv != rocblas_svect_singular && rightv != rocblas_svect_all) || (leftv != rocblas_svect_none && leftv != rocblas_svect_singular && leftv != rocblas_svect_all)) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } /** TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY, NOT EXPLICITLY COMPARING WITH LAPACK. SO, WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF THE RIGHT AND LEFT VECTORS. 
WHILE DOING THIS, IF MORE VECTORS THAN THE SPECIFIED IN THE MAIN CALL NEED TO BE COMPUTED, WE DO SO WITH AN EXTRA CALL **/ rocblas_svect leftvT = rocblas_svect_none; rocblas_svect rightvT = rocblas_svect_none; rocblas_int ldvT = 1; rocblas_int lduT = 1; rocblas_int mT = 0; rocblas_int nT = 0; bool svects = (leftv != rocblas_svect_none || rightv != rocblas_svect_none); if(svects) { if(leftv == rocblas_svect_none) { leftvT = rocblas_svect_singular; lduT = m; mT = m; nT = n; } if(rightv == rocblas_svect_none) { rightvT = rocblas_svect_singular; ldvT = min(m, n); mT = m; nT = n; } } // determine sizes rocblas_int ldures = 1; rocblas_int ldvres = 1; size_t size_Sres = 0; size_t size_Ures = 0; size_t size_Vres = 0; size_t size_UT = 0; size_t size_VT = 0; size_t size_A = size_t(lda) * n; size_t size_S = size_t(min(m, n)); size_t size_U = (leftvC == 'A' ? size_t(ldu) * m : size_t(ldu) * min(m, n)); size_t size_V = size_t(ldv) * n; if(argus.unit_check || argus.norm_check) { size_Sres = size_S; if(svects) { if(leftv == rocblas_svect_none) { size_UT = size_t(lduT) * min(mT, nT); size_Ures = size_UT; ldures = lduT; } else { size_Ures = size_U; ldures = ldu; } if(rightv == rocblas_svect_none) { size_VT = size_t(ldvT) * nT; size_Vres = size_VT; ldvres = ldvT; } else { size_Vres = size_V; ldvres = ldv; } } } rocblas_stride stUT = size_UT; rocblas_stride stVT = size_VT; rocblas_stride stUres = size_Ures; rocblas_stride stVres = size_Vres; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || m < 0 || lda < m || ldu < 1 || ldv < 1 || bc < 0) || ((leftv == rocblas_svect_all || leftv == rocblas_svect_singular) && ldu < m) || ((rightv == rocblas_svect_all && ldv < n) || (rightv == rocblas_svect_singular && ldv < min(m, n))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, 
(rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) { CHECK_ALLOC_QUERY(rocsolver_gesvdj( STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdj( STRIDED, handle, leftvT, rightvT, mT, nT, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, bc)); } else { CHECK_ALLOC_QUERY(rocsolver_gesvdj( STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdj( STRIDED, handle, leftvT, rightvT, mT, nT, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, bc)); } size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host 
host_strided_batch_vector hResidualRes(1, 1, 1, bc); host_strided_batch_vector hSweepsRes(1, 1, 1, bc); host_strided_batch_vector hS(size_S, 1, stS, bc); host_strided_batch_vector hV(size_V, 1, stV, bc); host_strided_batch_vector hU(size_U, 1, stU, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hSres(size_Sres, 1, stS, bc); host_strided_batch_vector Vres(size_Vres, 1, stVres, bc); host_strided_batch_vector Ures(size_Ures, 1, stUres, bc); // device device_strided_batch_vector dResidual(1, 1, 1, bc); device_strided_batch_vector dSweeps(1, 1, 1, bc); device_strided_batch_vector dS(size_S, 1, stS, bc); device_strided_batch_vector dV(size_V, 1, stV, bc); device_strided_batch_vector dU(size_U, 1, stU, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); device_strided_batch_vector dVT(size_VT, 1, stVT, bc); device_strided_batch_vector dUT(size_UT, 1, stUT, bc); if(size_VT) CHECK_HIP_ERROR(dVT.memcheck()); if(size_UT) CHECK_HIP_ERROR(dUT.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdj_getError( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, 
stU, dV, ldv, stV, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hResidualRes, hSweepsRes, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdj_getPerfData(handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdj_getError( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hResidualRes, hSweepsRes, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdj_getPerfData(handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * min(m, n) * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * min(m, n)); if(svects) 
ROCSOLVER_TEST_CHECK(T, max_errorv, 2 * min(m, n)); } // output results for rocsolver-bench if(argus.timing) { if(svects) max_error = (max_error >= max_errorv) ? max_error : max_errorv; if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "abstol", "max_sweeps", "strideS", "ldu", "strideU", "ldv", "strideV", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, abstol, max_sweeps, stS, ldu, stU, ldv, stV, bc); } else if(STRIDED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "strideA", "abstol", "max_sweeps", "strideS", "ldu", "strideU", "ldv", "strideV", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, stA, abstol, max_sweeps, stS, ldu, stU, ldv, stV, bc); } else { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "abstol", "max_sweeps", "ldu", "ldv"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, abstol, max_sweeps, ldu, ldv); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GESVDJ(...) extern template void testing_gesvdj<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GESVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_gesvdj_notransv.hpp000066400000000000000000001153311436600607200247540ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void gesvdj_notransv_checkBadArgs(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, S dResidual, const rocblas_int max_sweeps, I dSweeps, S dS, const rocblas_stride stS, U dU, const rocblas_int ldu, const rocblas_stride stU, U dV, const rocblas_int ldv, const rocblas_stride stV, I dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, nullptr, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, rocblas_svect_overwrite, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, rocblas_svect_overwrite, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, 
ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, (I) nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, (U) nullptr, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, (U) nullptr, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, (I) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, 0, n, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, (U) nullptr, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, 0, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, dU, ldu, stU, (U) nullptr, ldv, stV, dinfo, 
bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, (I) nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, (I) nullptr, 0), rocblas_status_success); } template void testing_gesvdj_notransv_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_svect left_svect = rocblas_svect_singular; rocblas_svect right_svect = rocblas_svect_singular; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; rocblas_int ldu = 2; rocblas_int ldv = 2; rocblas_stride stA = 2; rocblas_stride stS = 2; rocblas_stride stU = 2; rocblas_stride stV = 2; rocblas_int bc = 1; S abstol = 0; rocblas_int max_sweeps = 100; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesvdj_notransv_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector 
dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesvdj_notransv_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); } } template void gesvdj_notransv_initData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gesvdj_notransv_getError(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, Td& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Id& dinfo, const rocblas_int bc, const rocblas_svect left_svectT, const rocblas_svect right_svectT, const rocblas_int mT, const rocblas_int nT, Ud& dUT, const 
rocblas_int lduT, const rocblas_stride stUT, Ud& dVT, const rocblas_int ldvT, const rocblas_stride stVT, Wh& hA, Th& hResidualRes, Ih& hSweepsRes, Th& hS, Th& hSres, Uh& hU, Uh& Ures, const rocblas_int ldures, Uh& hV, Uh& Vres, const rocblas_int ldvres, Ih& hinfo, Ih& hinfoRes, double* max_err, double* max_errv) { rocblas_int lwork = 5 * max(m, n); rocblas_int lrwork = (rocblas_is_complex ? 5 * min(m, n) : 0); std::vector work(lwork); std::vector rwork(lrwork); std::vector A(lda * n * bc); // input data initialization gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // execute computations: // complementary execution to compute all singular vectors if needed CHECK_ROCBLAS_ERROR(rocsolver_gesvdj_notransv( STRIDED, handle, left_svectT, right_svectT, mT, nT, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dUT.data(), lduT, stUT, dVT.data(), ldvT, stVT, dinfo.data(), bc)); if(left_svect == rocblas_svect_none && right_svect != rocblas_svect_none) CHECK_HIP_ERROR(Ures.transfer_from(dUT)); if(right_svect == rocblas_svect_none && left_svect != rocblas_svect_none) CHECK_HIP_ERROR(Vres.transfer_from(dVT)); gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(rocblas_svect_none, rocblas_svect_none, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gesvdj_notransv( STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc)); CHECK_HIP_ERROR(hResidualRes.transfer_from(dResidual)); CHECK_HIP_ERROR(hSweepsRes.transfer_from(dSweeps)); CHECK_HIP_ERROR(hSres.transfer_from(dS)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(left_svect == rocblas_svect_singular || left_svect == 
rocblas_svect_all) CHECK_HIP_ERROR(Ures.transfer_from(dU)); if(right_svect == rocblas_svect_singular || right_svect == rocblas_svect_all) CHECK_HIP_ERROR(Vres.transfer_from(dV)); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; // Also check validity of residual for(rocblas_int b = 0; b < bc; ++b) if(hResidualRes[b][0] < 0) *max_err += 1; // Also check validity of sweeps for(rocblas_int b = 0; b < bc; ++b) if(hSweepsRes[b][0] < 0 || hSweepsRes[b][0] > max_sweeps) *max_err += 1; // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err; *max_errv = 0; for(rocblas_int b = 0; b < bc; ++b) { // error is ||hS - hSres|| err = norm_error('F', 1, min(m, n), 1, hS[b], hSres[b]); *max_err = err > *max_err ? err : *max_err; // Check the singular vectors if required if(hinfo[b][0] == 0 && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { err = 0; // check singular vectors implicitly (A*v_k = s_k*u_k) for(rocblas_int k = 0; k < min(m, n); ++k) { for(rocblas_int i = 0; i < m; ++i) { T tmp = 0; for(rocblas_int j = 0; j < n; ++j) tmp += A[b * lda * n + i + j * lda] * Vres[b][j + k * ldvres]; tmp -= hSres[b][k] * Ures[b][i + k * ldures]; err += std::abs(tmp) * std::abs(tmp); } } err = std::sqrt(err) / double(snorm('F', m, n, A.data() + b * lda * n, lda)); *max_errv = err > *max_errv ? 
err : *max_errv; } } } template void gesvdj_notransv_getPerfData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, Td& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Id& dinfo, const rocblas_int bc, Wh& hA, Th& hS, Uh& hU, Uh& hV, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { rocblas_int lwork = 5 * max(m, n); rocblas_int lrwork = 5 * min(m, n); std::vector work(lwork); std::vector rwork(lrwork); std::vector A; if(!perf) { gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(left_svect, right_svect, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_gesvdj_notransv( STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else 
rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gesvdj_notransv(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char leftvC = argus.get("left_svect"); char rightvC = argus.get("right_svect"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_int ldu = argus.get("ldu", m); rocblas_int ldv = argus.get("ldv", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stS = argus.get("strideS", min(m, n)); rocblas_stride stU = argus.get("strideU", (leftvC == 'A' ? ldu * m : ldu * min(m, n))); rocblas_stride stV = argus.get("strideV", (rightvC == 'A' ? 
ldv * n : ldv * min(m, n))); S abstol = S(argus.get("abstol", 0)); rocblas_int max_sweeps = argus.get("max_sweeps", 100); rocblas_svect leftv = char2rocblas_svect(leftvC); rocblas_svect rightv = char2rocblas_svect(rightvC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if((rightv != rocblas_svect_none && rightv != rocblas_svect_singular && rightv != rocblas_svect_all) || (leftv != rocblas_svect_none && leftv != rocblas_svect_singular && leftv != rocblas_svect_all)) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } /** TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY, NOT EXPLICITLY COMPARING WITH LAPACK. SO, WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF THE RIGHT AND LEFT VECTORS. 
WHILE DOING THIS, IF MORE VECTORS THAN THE SPECIFIED IN THE MAIN CALL NEED TO BE COMPUTED, WE DO SO WITH AN EXTRA CALL **/ rocblas_svect leftvT = rocblas_svect_none; rocblas_svect rightvT = rocblas_svect_none; rocblas_int ldvT = 1; rocblas_int lduT = 1; rocblas_int mT = 0; rocblas_int nT = 0; bool svects = (leftv != rocblas_svect_none || rightv != rocblas_svect_none); if(svects) { if(leftv == rocblas_svect_none) { leftvT = rocblas_svect_singular; lduT = m; mT = m; nT = n; } if(rightv == rocblas_svect_none) { rightvT = rocblas_svect_singular; ldvT = n; mT = m; nT = n; } } // determine sizes rocblas_int ldures = 1; rocblas_int ldvres = 1; size_t size_Sres = 0; size_t size_Ures = 0; size_t size_Vres = 0; size_t size_UT = 0; size_t size_VT = 0; size_t size_A = size_t(lda) * n; size_t size_S = size_t(min(m, n)); size_t size_U = (leftvC == 'A' ? size_t(ldu) * m : size_t(ldu) * min(m, n)); size_t size_V = (rightvC == 'A' ? size_t(ldv) * n : size_t(ldv) * min(m, n)); if(argus.unit_check || argus.norm_check) { size_Sres = size_S; if(svects) { if(leftv == rocblas_svect_none) { size_UT = size_t(lduT) * min(mT, nT); size_Ures = size_UT; ldures = lduT; } else { size_Ures = size_U; ldures = ldu; } if(rightv == rocblas_svect_none) { size_VT = size_t(ldvT) * min(mT, nT); size_Vres = size_VT; ldvres = ldvT; } else { size_Vres = size_V; ldvres = ldv; } } } rocblas_stride stUT = size_UT; rocblas_stride stVT = size_VT; rocblas_stride stUres = size_Ures; rocblas_stride stVres = size_Vres; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || m < 0 || lda < m || ldu < 1 || ldv < 1 || bc < 0) || ((leftv == rocblas_svect_all || leftv == rocblas_svect_singular) && ldu < m) || ((rightv == rocblas_svect_all || rightv == rocblas_svect_singular) && ldv < n); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, 
(S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) { CHECK_ALLOC_QUERY(rocsolver_gesvdj_notransv( STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdj_notransv( STRIDED, handle, leftvT, rightvT, mT, nT, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, bc)); } else { CHECK_ALLOC_QUERY(rocsolver_gesvdj_notransv( STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdj_notransv( STRIDED, handle, leftvT, rightvT, mT, nT, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, bc)); } size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, 
size)); } // memory allocations (all cases) // host host_strided_batch_vector hResidualRes(1, 1, 1, bc); host_strided_batch_vector hSweepsRes(1, 1, 1, bc); host_strided_batch_vector hS(size_S, 1, stS, bc); host_strided_batch_vector hV(size_V, 1, stV, bc); host_strided_batch_vector hU(size_U, 1, stU, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hSres(size_Sres, 1, stS, bc); host_strided_batch_vector Vres(size_Vres, 1, stVres, bc); host_strided_batch_vector Ures(size_Ures, 1, stUres, bc); // device device_strided_batch_vector dResidual(1, 1, 1, bc); device_strided_batch_vector dSweeps(1, 1, 1, bc); device_strided_batch_vector dS(size_S, 1, stS, bc); device_strided_batch_vector dV(size_V, 1, stV, bc); device_strided_batch_vector dU(size_U, 1, stU, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); device_strided_batch_vector dVT(size_VT, 1, stVT, bc); device_strided_batch_vector dUT(size_UT, 1, stUT, bc); if(size_VT) CHECK_HIP_ERROR(dVT.memcheck()); if(size_UT) CHECK_HIP_ERROR(dUT.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdj_notransv_getError( handle, leftv, rightv, m, n, dA, 
lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hResidualRes, hSweepsRes, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdj_notransv_getPerfData( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdj_notransv_getError( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hResidualRes, hSweepsRes, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdj_notransv_getPerfData( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * min(m, n) * machine_precision as tolerance 
if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * min(m, n)); if(svects) ROCSOLVER_TEST_CHECK(T, max_errorv, 2 * min(m, n)); } // output results for rocsolver-bench if(argus.timing) { if(svects) max_error = (max_error >= max_errorv) ? max_error : max_errorv; if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "abstol", "max_sweeps", "strideS", "ldu", "strideU", "ldv", "strideV", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, abstol, max_sweeps, stS, ldu, stU, ldv, stV, bc); } else if(STRIDED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "strideA", "abstol", "max_sweeps", "strideS", "ldu", "strideU", "ldv", "strideV", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, stA, abstol, max_sweeps, stS, ldu, stU, ldv, stV, bc); } else { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "abstol", "max_sweeps", "ldu", "ldv"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, abstol, max_sweeps, ldu, ldv); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } rocSOLVER-rocm-5.5.1/clients/include/testing_gesvdx.hpp000066400000000000000000001255221436600607200230430ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void gesvdx_checkBadArgs(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_srange srange, const rocblas_int m, const rocblas_int n, W dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, rocblas_int* dNsv, S* dS, const rocblas_stride stS, T dU, const rocblas_int ldu, const rocblas_stride stU, T dV, const rocblas_int ldv, const rocblas_stride stV, rocblas_int* difail, const rocblas_stride stF, rocblas_int* dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, nullptr, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, rocblas_svect_all, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, rocblas_svect_all, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, rocblas_srange(0), m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, 
dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, (W) nullptr, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, (S*)nullptr, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, (T) nullptr, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, (T) nullptr, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, (rocblas_int*)nullptr, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, 0, n, (W) nullptr, lda, stA, vl, vu, il, iu, dNsv, (S*)nullptr, stS, (T) nullptr, ldu, stU, dV, 
ldv, stV, (rocblas_int*)nullptr, stF, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, 0, (W) nullptr, lda, stA, vl, vu, il, iu, dNsv, (S*)nullptr, stS, (T) nullptr, ldu, stU, (T) nullptr, ldv, stV, (rocblas_int*)nullptr, stF, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void testing_gesvdx_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_svect left_svect = rocblas_svect_singular; rocblas_svect right_svect = rocblas_svect_singular; rocblas_srange srange = rocblas_srange_all; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; rocblas_int ldu = 2; rocblas_int ldv = 2; rocblas_stride stA = 2; rocblas_stride stS = 2; rocblas_stride stU = 2; rocblas_stride stV = 2; rocblas_stride stF = 2; rocblas_int bc = 1; S vl = 0; S vu = 0; rocblas_int il = 0; rocblas_int iu = 0; // memory allocations (all cases) device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dNsv(1, 1, 1, 1); device_strided_batch_vector difail(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dNsv.memcheck()); CHECK_HIP_ERROR(difail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments gesvdx_checkBadArgs(handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv, dS.data(), stS, 
dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments gesvdx_checkBadArgs(handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc); } } template void gesvdx_initData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); rocblas_int nn = std::min(m, n); // construct non singular matrix A such that all singular values are in (0, 20] for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { if(i == nn / 4 || i == nn / 2 || i == nn - 1 || i == nn / 7 || i == nn / 5 || i == nn / 3) hA[b][i + i * lda] = 0; for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = 2 * std::real(hA[b][i + j * lda]) - 21; else { if(m >= n) { if(j == i + 1) hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; else hA[b][i + j * lda] = 0; } else { if(i == j + 1) hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; else hA[b][i + j * lda] = 0; } } } } // make copy of original data to test vectors if required if(test && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gesvdx_getError(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_srange srange, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const 
rocblas_int il, const rocblas_int iu, Id& dNsv, Ud& dS, const rocblas_stride stS, Td& dU, const rocblas_int ldu, const rocblas_stride stU, Td& dV, const rocblas_int ldv, const rocblas_stride stV, Id& difail, const rocblas_stride stF, Id& dinfo, const rocblas_int bc, const rocblas_svect left_svectT, const rocblas_svect right_svectT, const rocblas_int mT, const rocblas_int nT, Td& dUT, const rocblas_int lduT, const rocblas_stride stUT, Td& dVT, const rocblas_int ldvT, const rocblas_stride stVT, Wh& hA, Ih& hNsv, Ih& hNsvRes, Uh& hS, Uh& hSres, Th& hU, Th& hUres, const rocblas_int ldures, Th& hV, Th& hVres, const rocblas_int ldvres, Ih& hifail, Ih& hifailRes, Ih& hinfo, Ih& hinfoRes, double* max_err, double* max_errv) { /** As per lapack's documentation, the following workspace size should work: rocblas_int minn = std::min(m,n); rocblas_int maxn = std::max(m,n); rocblas_int lwork = minn * minn + 6 * minn + maxn; rocblas_int lrwork = 17 * minn * minn; std::vector work(lwork); std::vector rwork(lrwork); HOWEVER, gesvdx_ GIVES ILLEGAL VALUE FOR ARGUMENT lwork. Making the memory query to get the correct workspace dimension: std::vector query(1); cpu_gesvdx(left_svect, right_svect, srange, m, n, hA[0], lda, vl, vu, il, iu, hNsv[0], hS[0], hU[0], ldu, hV[0], ldv, query.data(), -1, rwork.data(), hifail[0], hinfo[0]); rocblas_int lwork = int(std::real(query[0])); std::vector work(lwork); AND NOW gesvdx_ FAILS WITH seg fault ERROR. **/ // (TODO: Need to confirm problem with gesvdx_ and report it) /** WORKAROUND: for now, we will call gesvd_ to get all the singular values on the CPU side and offset the result array according to srange, vl, vu, il, and iu. This approach has 2 disadvantages: 1. singular values are not computed to the same accuracy by gesvd_ (QR iteration) and gesvdx_ (inverse iteration). So, comparison maybe more sensitive. 2. info and ifail cannot be tested as they have different meaning in gesvd_ 3. 
we cannot provide timing for CPU execution using gesvd_ when testing gesvdx_ **/ // (TODO: We may revisit the entire approach in the future: change to another solution, // or wait for problems with gesvdx_ to be fixed) std::vector offset(bc); rocblas_int lwork = 5 * max(m, n); rocblas_int lrwork = (rocblas_is_complex ? 5 * min(m, n) : 0); std::vector work(lwork); std::vector rwork(lrwork); rocblas_int minn = std::min(m, n); // input data initialization std::vector A(lda * n * bc); gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // execute computations: // complementary execution to compute all singular vectors if needed if(mT * nT > 0) { CHECK_ROCBLAS_ERROR(rocsolver_gesvdx(STRIDED, handle, left_svectT, right_svectT, srange, mT, nT, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dUT.data(), lduT, stUT, dVT.data(), ldvT, stVT, difail.data(), stF, dinfo.data(), bc)); if(left_svect == rocblas_svect_none && right_svect != rocblas_svect_none) CHECK_HIP_ERROR(hUres.transfer_from(dUT)); if(right_svect == rocblas_svect_none && left_svect != rocblas_svect_none) CHECK_HIP_ERROR(hVres.transfer_from(dVT)); } gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { //cpu_gesvdx(rocblas_svect_none, rocblas_svect_none, srange, m, n, hA[b], lda, vl, vu, il, iu, hNsv[b], hS[b], hU[b], ldu, hV[b], ldv, // work.data(), lwork, rwork.data(), hifail[b], hinfo[b]); /*** WORKAROUND: ***/ cpu_gesvd(rocblas_svect_none, rocblas_svect_none, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); hNsv[b][0] = 0; offset[b] = -1; if(srange == rocblas_srange_index) { offset[b] = il - 1; hNsv[b][0] = iu - il + 1; } else if(srange == rocblas_srange_value) { for(int j = 0; j < minn; ++j) { if(hS[b][j] < vu && hS[b][j] >= vl) { if(offset[b] == -1) offset[b] = j; hNsv[b][0]++; } } } else { offset[b] = 0; hNsv[b][0] = minn; } /*******************/ } 
// GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc)); CHECK_HIP_ERROR(hSres.transfer_from(dS)); CHECK_HIP_ERROR(hNsvRes.transfer_from(dNsv)); CHECK_HIP_ERROR(hifailRes.transfer_from(difail)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(left_svect == rocblas_svect_singular) CHECK_HIP_ERROR(hUres.transfer_from(dU)); if(right_svect == rocblas_svect_singular) CHECK_HIP_ERROR(hVres.transfer_from(dV)); *max_err = 0; *max_errv = 0; // Check info and ifail for non-convergence // (NOTE: With the workaround in place, info and ifail cannot be tested as they have different // meaning in gesvd_, however, We expect the used input matrices to always converge) /*for(rocblas_int b = 0; b < bc; ++b) { if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; for(int j = 0; j < hNsv[b][0]; ++j) { if(hifail[b][j] != hifailRes[b][j]) *max_err += 1; } }*/ // Check number of returned singular values double err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(hNsv[b][0] != hNsvRes[b][0]) err++; } *max_err = err > *max_err ? err : *max_err; for(rocblas_int b = 0; b < bc; ++b) { // error is ||hS - hSres|| err = norm_error('F', 1, hNsv[b][0], 1, hS[b] + offset[b], hSres[b]); //WORKAROUND *max_err = err > *max_err ? err : *max_err; // Check the singular vectors if required if(hinfo[b][0] == 0 && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { err = 0; // check singular vectors implicitly (A*v_k = s_k*u_k) for(rocblas_int k = 0; k < hNsv[b][0]; ++k) { T tmp = 0; double tmp2 = 0; // (Comparing absolute values to deal with the fact that the pair of singular vectors (u,-v) or (-u,v) are // both ok and we could get either one with the complementary or main executions when only // one side set of vectors is required. May be revisited in the future.) 
for(rocblas_int i = 0; i < m; ++i) { tmp = 0; for(rocblas_int j = 0; j < n; ++j) tmp += A[b * lda * n + i + j * lda] * sconj(hVres[b][k + j * ldvres]); tmp2 = std::abs(tmp) - std::abs(hSres[b][k] * hUres[b][i + k * ldures]); err += tmp2 * tmp2; } } err = std::sqrt(err) / double(snorm('F', m, n, A.data() + b * lda * n, lda)); *max_errv = err > *max_errv ? err : *max_errv; } } } template void gesvdx_getPerfData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_srange srange, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Id& dNsv, Ud& dS, const rocblas_stride stS, Td& dU, const rocblas_int ldu, const rocblas_stride stU, Td& dV, const rocblas_int ldv, const rocblas_stride stV, Id& difail, const rocblas_stride stF, Id& dinfo, const rocblas_int bc, Wh& hA, Ih& hNsv, Uh& hS, Th& hU, Th& hV, Ih& hifail, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { /** As per lapack's documentation, the following workspace size should work: rocblas_int minn = std::min(m,n); rocblas_int maxn = std::max(m,n); rocblas_int lwork = minn * minn + 6 * minn + maxn; rocblas_int lrwork = 17 * minn * minn; std::vector work(lwork); std::vector rwork(lrwork); HOWEVER, gesvdx_ GIVES ILLEGAL VALUE FOR ARGUMENT lwork. Making the memory query to get the correct workspace dimension: std::vector query(1); cpu_gesvdx(left_svect, right_svect, srange, m, n, hA[0], lda, vl, vu, il, iu, hNsv[0], hS[0], hU[0], ldu, hV[0], ldv, query.data(), -1, rwork.data(), hifail[0], hinfo[0]); rocblas_int lwork = int(std::real(query[0])); std::vector work(lwork); AND NOW gesvdx_ FAILS WITH seg fault ERROR. 
**/ // (TODO: Need to confirm problem with gesvdx_ and report it) // For now we cannot report cpu time std::vector A(lda * n * bc); if(!perf) { //gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) //*cpu_time_used = get_time_us_no_sync(); //for(rocblas_int b = 0; b < bc; ++b) // cpu_gesvdx(left_svect, right_svect, srange, m, n, hA[b], lda, vl, vu, il, iu, hNsv[b], hS[b], hU[b], ldu, hV[b], ldv, // work.data(), lwork, rwork.data(), hifail[b], hinfo[b]); //*cpu_time_used = get_time_us_no_sync() - *cpu_time_used; *cpu_time_used = nan(""); } gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gesvdx(Arguments& argus) { using S = decltype(std::real(T{})); // 
get arguments rocblas_local_handle handle; char leftvC = argus.get("left_svect"); char rightvC = argus.get("right_svect"); char srangeC = argus.get("srange"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int nn = std::min(m, n); rocblas_int lda = argus.get("lda", m); rocblas_int ldu = argus.get("ldu", m); rocblas_int ldv = argus.get("ldv", nn); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stS = argus.get("strideS", nn); rocblas_stride stF = argus.get("strideF", nn); rocblas_stride stU = argus.get("strideU", ldu * nn); rocblas_stride stV = argus.get("strideV", ldv * n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", srangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", srangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", srangeC == 'I' ? 1 : 0); rocblas_svect leftv = char2rocblas_svect(leftvC); rocblas_svect rightv = char2rocblas_svect(rightvC); rocblas_srange srange = char2rocblas_srange(srangeC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(rightv == rocblas_svect_overwrite || leftv == rocblas_svect_overwrite || rightv == rocblas_svect_all || leftv == rocblas_svect_all) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } /** TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY, NOT EXPLICITLY COMPARING WITH 
LAPACK. SO, WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF THE RIGHT AND LEFT VECTORS. WHILE DOING THIS, IF MORE VECTORS THAN THE SPECIFIED IN THE MAIN CALL NEED TO BE COMPUTED, WE DO SO WITH AN EXTRA CALL **/ rocblas_svect leftvT = rocblas_svect_none; rocblas_svect rightvT = rocblas_svect_none; rocblas_int ldvT = 1; rocblas_int lduT = 1; rocblas_int mT = 0; rocblas_int nT = 0; bool svects = (leftv != rocblas_svect_none || rightv != rocblas_svect_none); if(svects) { if(leftv == rocblas_svect_none) { leftvT = rocblas_svect_singular; lduT = m; mT = m; nT = n; } if(rightv == rocblas_svect_none) { rightvT = rocblas_svect_singular; ldvT = nn; mT = m; nT = n; } } // determine sizes rocblas_int ldures = 1; rocblas_int ldvres = 1; size_t size_hSres = 0; size_t size_hUres = 0; size_t size_hVres = 0; size_t size_hifailRes = 0; size_t size_UT = 0; size_t size_VT = 0; size_t size_A = size_t(lda) * n; size_t size_S = size_t(nn); size_t size_V = size_t(ldv) * n; size_t size_U = size_t(ldu) * nn; size_t size_ifail = nn; if(argus.unit_check || argus.norm_check) { size_hifailRes = nn; size_VT = size_t(ldvT) * n; size_UT = size_t(lduT) * nn; size_hSres = nn; if(svects) { if(leftv == rocblas_svect_none) { size_hUres = size_UT; ldures = lduT; } else { size_hUres = size_U; ldures = ldu; } if(rightv == rocblas_svect_none) { size_hVres = size_VT; ldvres = ldvT; } else { size_hVres = size_V; ldvres = ldv; } } } rocblas_stride stUT = size_UT; rocblas_stride stVT = size_VT; rocblas_stride stUres = size_hUres; rocblas_stride stVres = size_hVres; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || m < 0 || lda < m || ldu < 1 || ldv < 1 || bc < 0) || (leftv == rocblas_svect_singular && ldu < m) || (rightv == rocblas_svect_singular && ldv < nn) || (srange == rocblas_srange_value && (vl < 0 || vl >= vu)) || (srange == rocblas_srange_index && (il < 1 || iu < 0)) || (srange == rocblas_srange_index && (iu > nn || 
(nn > 0 && il > iu))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) { CHECK_ALLOC_QUERY(rocsolver_gesvdx( STRIDED, handle, leftv, rightv, srange, m, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdx( STRIDED, handle, leftvT, rightvT, srange, mT, nT, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); } else { CHECK_ALLOC_QUERY(rocsolver_gesvdx( STRIDED, handle, leftv, rightv, srange, m, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdx( STRIDED, handle, leftvT, rightvT, srange, mT, nT, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, 
bc)); } size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hS(size_S, 1, stS, bc); host_strided_batch_vector hV(size_V, 1, stV, bc); host_strided_batch_vector hU(size_U, 1, stU, bc); host_strided_batch_vector hNsv(1, 1, 1, bc); host_strided_batch_vector hNsvRes(1, 1, 1, bc); host_strided_batch_vector hifail(12 * nn, 1, stF, bc); host_strided_batch_vector hifailRes(size_hifailRes, 1, stF, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hSres(size_hSres, 1, stS, bc); host_strided_batch_vector hVres(size_hVres, 1, stVres, bc); host_strided_batch_vector hUres(size_hUres, 1, stUres, bc); // device device_strided_batch_vector dS(size_S, 1, stS, bc); device_strided_batch_vector dV(size_V, 1, stV, bc); device_strided_batch_vector dU(size_U, 1, stU, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); device_strided_batch_vector dNsv(1, 1, 1, bc); device_strided_batch_vector difail(size_ifail, 1, stF, bc); device_strided_batch_vector dVT(size_VT, 1, stVT, bc); device_strided_batch_vector dUT(size_UT, 1, stUT, bc); if(size_VT) CHECK_HIP_ERROR(dVT.memcheck()); if(size_UT) CHECK_HIP_ERROR(dUT.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); if(size_ifail) CHECK_HIP_ERROR(difail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); CHECK_HIP_ERROR(dNsv.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, dA.data(), 
lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdx_getError( handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hNsv, hNsvRes, hS, hSres, hU, hUres, ldures, hV, hVres, ldvres, hifail, hifailRes, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdx_getPerfData(handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, hA, hNsv, hS, hU, hV, hifail, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdx_getError( handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hNsv, hNsvRes, hS, hSres, hU, hUres, ldures, hV, hVres, ldvres, hifail, hifailRes, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { 
gesvdx_getPerfData(handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, hA, hNsv, hS, hU, hV, hifail, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * min(m, n) * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * min(m, n)); if(svects) ROCSOLVER_TEST_CHECK(T, max_errorv, 2 * min(m, n)); } // output results for rocsolver-bench if(argus.timing) { if(svects) max_error = (max_error >= max_errorv) ? max_error : max_errorv; if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("left_svect", "right_svect", "srange", "m", "n", "lda", "vl", "vu", "il", "iu", "strideS", "ldu", "strideU", "ldv", "strideV", "strideF", "batch_c"); rocsolver_bench_output(leftvC, rightvC, srangeC, m, n, lda, vl, vu, il, iu, stS, ldu, stU, ldv, stV, stF, bc); } else if(STRIDED) { rocsolver_bench_output("left_svect", "right_svect", "srange", "m", "n", "lda", "strideA", "vl", "vu", "il", "iu", "strideS", "ldu", "strideU", "ldv", "strideV", "strideF", "batch_c"); rocsolver_bench_output(leftvC, rightvC, srangeC, m, n, lda, stA, vl, vu, il, iu, stS, ldu, stU, ldv, stV, stF, bc); } else { rocsolver_bench_output("left_svect", "right_svect", "srange", "m", "n", "lda", "vl", "vu", "il", "iu", "ldu", "ldv"); rocsolver_bench_output(leftvC, rightvC, srangeC, m, n, lda, vl, vu, il, iu, ldu, ldv); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); 
} } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GESVDX(...) extern template void testing_gesvdx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GESVDX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_getf2_getrf.hpp000066400000000000000000000530361436600607200237410ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void getf2_getrf_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf(STRIDED, GETRF, nullptr, m, n, dA, lda, stA, dIpiv, stP, dinfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA, lda, stA, dIpiv, stP, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, (T) nullptr, lda, stA, dIpiv, stP, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA, lda, stA, (U) nullptr, stP, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA, lda, stA, dIpiv, stP, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, 
GETRF, handle, 0, n, (T) nullptr, lda, stA, (U) nullptr, stP, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, 0, (T) nullptr, lda, stA, (U) nullptr, stP, dinfo, bc), rocblas_status_success); if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA, lda, stA, dIpiv, stP, (U) nullptr, 0), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA, lda, stA, dIpiv, stP, dinfo, 0), rocblas_status_success); } template void testing_getf2_getrf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getf2_getrf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getf2_getrf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } } template void getf2_getrf_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid 
singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // shuffle rows to test pivoting // always the same permuation for debugging purposes for(rocblas_int i = 0; i < m / 2; i++) { for(rocblas_int j = 0; j < n; j++) { tmp = hA[b][i + j * lda]; hA[b][i + j * lda] = hA[b][m - 1 - i + j * lda]; hA[b][m - 1 - i + j * lda] = tmp; } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // When required, add some singularities // (always the same elements for debugging purposes). // The algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int j = n / 4 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < m; i++) hA[b][i + j * lda] = 0; j = n / 2 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < m; i++) hA[b][i + j * lda] = 0; j = n - 1 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < m; i++) hA[b][i + j * lda] = 0; } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void getf2_getrf_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hIpivRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { // input data initialization getf2_getrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hIpivRes.transfer_from(dIpiv)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { GETRF ? 
cpu_getrf(m, n, hA[b], lda, hIpiv[b], hInfo[b]) : cpu_getf2(m, n, hA[b], lda, hIpiv[b], hInfo[b]); } // expecting original matrix to be non-singular // error is ||hA - hARes|| / ||hA|| (ideally ||LU - Lres Ures|| / ||LU||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; // also check pivoting (count the number of incorrect pivots) err = 0; for(rocblas_int i = 0; i < min(m, n); ++i) if(hIpiv[b][i] != hIpivRes[b][i]) err++; *max_err = err > *max_err ? err : *max_err; } // also check info for singularities err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) err++; *max_err += err; } template void getf2_getrf_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { getf2_getrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { GETRF ? 
cpu_getrf(m, n, hA[b], lda, hIpiv[b], hInfo[b]) : cpu_getf2(m, n, hA[b], lda, hIpiv[b], hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getf2_getrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getf2_getrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { getf2_getrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_getf2_getrf(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stPRes = (argus.unit_check || argus.norm_check) ? 
stP : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_PRes = (argus.unit_check || argus.norm_check) ? size_P : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hIpivRes(size_PRes, 1, stPRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); 
device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getf2_getrf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hIpivRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getf2_getrf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hIpivRes(size_PRes, 1, stPRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getf2_getrf_getError(handle, m, n, dA, lda, stA, 
dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hIpivRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getf2_getrf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using min(m,n) * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, min(m, n)); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETF2_GETRF(...) 
\ extern template void testing_getf2_getrf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETF2_GETRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_getf2_getrf_npvt.hpp000066400000000000000000000454761436600607200250210ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void getf2_getrf_npvt_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf_npvt(STRIDED, GETRF, nullptr, m, n, dA, lda, stA, dinfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA, lda, stA, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, (T) nullptr, lda, stA, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA, lda, stA, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, 0, n, (T) nullptr, lda, stA, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, 0, (T) nullptr, lda, stA, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if 
applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA, lda, stA, dinfo, 0), rocblas_status_success); } template void testing_getf2_getrf_npvt_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments getf2_getrf_npvt_checkBadArgs(handle, m, n, dA.data(), lda, stA, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments getf2_getrf_npvt_checkBadArgs(handle, m, n, dA.data(), lda, stA, dinfo.data(), bc); } } template void getf2_getrf_npvt_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dinfo, const rocblas_int bc, Th& hA, Uh& hinfo, const bool singular) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities // leaving matrix as diagonal dominant so that pivoting is not required for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // When required, add some singularities // (always the same elements for debugging purposes). 
// The algorithm must detect the first zero element in the // diagonal of those matrices in the batch that are singular rocblas_int j = n / 4 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < m; i++) hA[b][i + j * lda] = 0; j = n / 2 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < m; i++) hA[b][i + j * lda] = 0; j = n - 1 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < m; i++) hA[b][i + j * lda] = 0; } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void getf2_getrf_npvt_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dinfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hinfo, Uh& hInfoRes, double* max_err, const bool singular) { // input data initialization getf2_getrf_npvt_initData(handle, m, n, dA, lda, stA, dinfo, bc, hA, hinfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dinfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dinfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { GETRF ? cpu_getrf(m, n, hA[b], lda, hIpiv[b], hinfo[b]) : cpu_getf2(m, n, hA[b], lda, hIpiv[b], hinfo[b]); } // expecting original matrix to be non-singular // error is ||hA - hARes|| / ||hA|| (ideally ||LU - Lres Ures|| / ||LU||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? 
err : *max_err; } // also check info for singularities err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hinfo[b][0] != hInfoRes[b][0]) err++; *max_err += err; } template void getf2_getrf_npvt_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dinfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { getf2_getrf_npvt_initData(handle, m, n, dA, lda, stA, dinfo, bc, hA, hinfo, singular); // cpu-lapack performance (only if no perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { GETRF ? cpu_getrf(m, n, hA[b], lda, hIpiv[b], hinfo[b]) : cpu_getf2(m, n, hA[b], lda, hIpiv[b], hinfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getf2_getrf_npvt_initData(handle, m, n, dA, lda, stA, dinfo, bc, hA, hinfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getf2_getrf_npvt_initData(handle, m, n, dA, lda, stA, dinfo, bc, hA, hinfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { getf2_getrf_npvt_initData(handle, m, n, dA, lda, stA, dinfo, bc, hA, hinfo, singular); start = get_time_us_sync(stream); rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used 
/= hot_calls; } template void testing_getf2_getrf_npvt(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); 
host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getf2_getrf_npvt_getError(handle, m, n, dA, lda, stA, dinfo, bc, hA, hARes, hIpiv, hinfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getf2_getrf_npvt_getPerfData( handle, m, n, dA, lda, stA, dinfo, bc, hA, hIpiv, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getf2_getrf_npvt_getError(handle, m, n, dA, lda, stA, dinfo, bc, hA, hARes, hIpiv, hinfo, hInfoRes, &max_error, argus.singular); // 
collect performance data if(argus.timing) getf2_getrf_npvt_getPerfData( handle, m, n, dA, lda, stA, dinfo, bc, hA, hIpiv, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using min(m,n) * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, min(m, n)); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "batch_c"); rocsolver_bench_output(m, n, lda, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "batch_c"); rocsolver_bench_output(m, n, lda, stA, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETF2_GETRF_NPVT(...) \ extern template void testing_getf2_getrf_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETF2_GETRF_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_getri.hpp000066400000000000000000000446501436600607200226570ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void getri_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, nullptr, n, dA, lda, stA, dIpiv, stP, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_getri(STRIDED, handle, n, (T) nullptr, lda, stA, dIpiv, stP, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_getri(STRIDED, handle, n, dA, lda, stA, (U) nullptr, stP, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_getri(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_getri(STRIDED, handle, 0, (T) nullptr, lda, stA, (U) nullptr, stP, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_getri(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, (U) nullptr, 0), rocblas_status_success); } template void testing_getri_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); 
CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_checkBadArgs(handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_checkBadArgs(handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } } template void getri_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, Ud& dIpiv, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = hA[b][i + j * lda] / 10.0 + 10; //+= 400; else hA[b][i + j * lda] = (hA[b][i + j * lda] - 4) / 10.0; // -= 4; } } // shuffle rows to test pivoting // always the same permuation for debugging purposes for(rocblas_int i = 0; i < n / 2; i++) { for(rocblas_int j = 0; j < n; j++) { tmp = hA[b][i + j * lda]; hA[b][i + j * lda] = hA[b][n - 1 - i + j * lda]; hA[b][n - 1 - i + j * lda] = tmp; } } // do the LU decomposition of matrix A w/ the reference LAPACK routine cpu_getrf(n, n, hA[b], lda, hIpiv[b], hInfo[b]); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } // now copy data to the GPU if(GPU) { 
CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void getri_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); // input data initialization getri_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getri(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? 
err : *max_err; } } } template void getri_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); if(!perf) { getri_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getri_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getri_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_getri(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { getri_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_getri(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_getri(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", 
n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", n); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getri(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getri(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector 
dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_getError(handle, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_getPerfData(handle, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_getError(handle, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) 
getri_getPerfData(handle, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "lda", "strideP", "batch_c"); rocsolver_bench_output(n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(n, lda, stA, stP, bc); } else { rocsolver_bench_output("n", "lda"); rocsolver_bench_output(n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETRI(...) extern template void testing_getri<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_getri_npvt.hpp000066400000000000000000000413201436600607200237150ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void getri_npvt_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, nullptr, n, dA, lda, stA, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, dA, lda, stA, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, (T) nullptr, lda, stA, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, dA, lda, stA, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, 0, (T) nullptr, lda, stA, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, dA, lda, stA, (U) nullptr, 0), rocblas_status_success); } template void testing_getri_npvt_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_npvt_checkBadArgs(handle, n, dA.data(), lda, stA, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); 
CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_npvt_checkBadArgs(handle, n, dA.data(), lda, stA, dInfo.data(), bc); } } template void getri_npvt_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities // leaving matrix as diagonal dominant so that pivoting is not required for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // do the LU decomposition of matrix A w/ the reference LAPACK routine cpu_getrf(n, n, hA[b], lda, hIpiv[b], hInfo[b]); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void getri_npvt_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); // input data initialization getri_npvt_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_getri_npvt(STRIDED, handle, n, dA.data(), lda, stA, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int 
b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } template void getri_npvt_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); if(!perf) { getri_npvt_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getri_npvt_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getri_npvt_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR( rocsolver_getri_npvt(STRIDED, handle, n, dA.data(), lda, stA, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } 
for(rocblas_int iter = 0; iter < hot_calls; iter++) { getri_npvt_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_getri_npvt(STRIDED, handle, n, dA.data(), lda, stA, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_getri_npvt(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", n); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getri_npvt(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getri_npvt(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { 
rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_getri_npvt(STRIDED, handle, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_npvt_getError(handle, n, dA, lda, stA, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_npvt_getPerfData(handle, n, dA, lda, stA, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_getri_npvt(STRIDED, handle, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) 
getri_npvt_getError(handle, n, dA, lda, stA, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_npvt_getPerfData(handle, n, dA, lda, stA, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "lda", "batch_c"); rocsolver_bench_output(n, lda, bc); } else if(STRIDED) { rocsolver_bench_output("n", "lda", "strideA", "batch_c"); rocsolver_bench_output(n, lda, stA, bc); } else { rocsolver_bench_output("n", "lda"); rocsolver_bench_output(n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETRI_NPVT(...) \ extern template void testing_getri_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETRI_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_getri_npvt_outofplace.hpp000066400000000000000000000511071436600607200261420ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void getri_npvt_outofplace_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dC, const rocblas_int ldc, const rocblas_stride stC, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_getri_npvt_outofplace(STRIDED, nullptr, n, dA, lda, stA, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA, lda, stA, dC, ldc, stC, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, (T) nullptr, lda, stA, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA, lda, stA, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA, lda, stA, dC, ldc, stC, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, 0, (T) nullptr, lda, stA, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA, lda, stA, dC, ldc, stC, (U) nullptr, 0), rocblas_status_success); } template void testing_getri_npvt_outofplace_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; rocblas_stride stA = 1; rocblas_stride 
stC = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dC(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_npvt_outofplace_checkBadArgs(handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_npvt_outofplace_checkBadArgs(handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc); } } template void getri_npvt_outofplace_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities // leaving matrix as diagonal dominant so that pivoting is not required for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // do the LU decomposition of matrix A w/ the reference LAPACK routine cpu_getrf(n, n, hA[b], lda, hIpiv[b], hInfo[b]); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void 
getri_npvt_outofplace_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); // input data initialization getri_npvt_outofplace_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dC)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b], ldc); *max_err = err > *max_err ? 
err : *max_err; } } } template void getri_npvt_outofplace_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); if(!perf) { getri_npvt_outofplace_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getri_npvt_outofplace_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getri_npvt_outofplace_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { getri_npvt_outofplace_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_getri_npvt_outofplace(Arguments& 
argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldc = argus.get("ldc", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stC = argus.get("strideC", ldc * n); rocblas_stride stP = argus.get("strideP", n); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stC : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_C = size_t(ldc) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_C : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldc < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace( STRIDED, handle, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, (T*)nullptr, lda, stA, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, (T*)nullptr, lda, stA, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } 
CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dC(size_C, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_npvt_outofplace_getError(handle, n, dA, lda, stA, dC, ldc, stC, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_npvt_outofplace_getPerfData( handle, n, dA, lda, stA, dC, ldc, stC, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dC(size_C, 1, stC, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, 
handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_npvt_outofplace_getError(handle, n, dA, lda, stA, dC, ldc, stC, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_npvt_outofplace_getPerfData( handle, n, dA, lda, stA, dC, ldc, stC, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "lda", "ldc", "batch_c"); rocsolver_bench_output(n, lda, ldc, bc); } else if(STRIDED) { rocsolver_bench_output("n", "lda", "strideA", "ldc", "strideC", "batch_c"); rocsolver_bench_output(n, lda, stA, ldc, stC, bc); } else { rocsolver_bench_output("n", "lda", "ldc"); rocsolver_bench_output(n, lda, ldc); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETRI_NPVT_OUTOFPLACE(...) 
\ extern template void testing_getri_npvt_outofplace<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETRI_NPVT_OUTOFPLACE, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_getri_outofplace.hpp000066400000000000000000000543231436600607200250760ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void getri_outofplace_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, T dC, const rocblas_int ldc, const rocblas_stride stC, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, nullptr, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, (T) nullptr, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA, lda, stA, (U) nullptr, stP, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA, lda, 
stA, dIpiv, stP, dC, ldc, stC, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, 0, (T) nullptr, lda, stA, (U) nullptr, stP, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, (U) nullptr, 0), rocblas_status_success); } template void testing_getri_outofplace_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; rocblas_stride stA = 1; rocblas_stride stC = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dC(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_outofplace_checkBadArgs(handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_outofplace_checkBadArgs(handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc); } } template void getri_outofplace_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, Ud& dIpiv, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b 
< bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // shuffle rows to test pivoting // always the same permuation for debugging purposes for(rocblas_int i = 0; i < n / 2; i++) { for(rocblas_int j = 0; j < n; j++) { tmp = hA[b][i + j * lda]; hA[b][i + j * lda] = hA[b][n - 1 - i + j * lda]; hA[b][n - 1 - i + j * lda] = tmp; } } // do the LU decomposition of matrix A w/ the reference LAPACK routine cpu_getrf(n, n, hA[b], lda, hIpiv[b], hInfo[b]); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void getri_outofplace_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); // input data initialization getri_outofplace_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getri_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dC)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; 
++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b], ldc); *max_err = err > *max_err ? err : *max_err; } } } template void getri_outofplace_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); if(!perf) { getri_outofplace_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getri_outofplace_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getri_outofplace_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_getri_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) 
rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { getri_outofplace_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_getri_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_getri_outofplace(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldc = argus.get("ldc", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stC = argus.get("strideC", ldc * n); rocblas_stride stP = argus.get("strideP", n); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stC : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_C = size_t(ldc) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldc < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getri_outofplace( STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getri_outofplace(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dC(size_C, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // 
check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_outofplace_getError(handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_outofplace_getPerfData( handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dC(size_C, 1, stC, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_outofplace_getError(handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) 
getri_outofplace_getPerfData( handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "lda", "strideP", "ldc", "batch_c"); rocsolver_bench_output(n, lda, stP, ldc, bc); } else if(STRIDED) { rocsolver_bench_output("n", "lda", "strideA", "strideP", "ldc", "strideC", "batch_c"); rocsolver_bench_output(n, lda, stA, stP, ldc, stC, bc); } else { rocsolver_bench_output("n", "lda", "ldc"); rocsolver_bench_output(n, lda, ldc); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETRI_OUTOFPLACE(...) \ extern template void testing_getri_outofplace<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETRI_OUTOFPLACE, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_getrs.hpp000066400000000000000000000471651436600607200226750ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void getrs_checkBadArgs(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, T dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, nullptr, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, rocblas_operation(0), n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, (T) nullptr, lda, stA, dIpiv, stP, dB, ldb, stB, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA, lda, stA, (U) nullptr, stP, dB, ldb, stB, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, (T) nullptr, ldb, stB, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, 0, nrhs, (T) nullptr, lda, stA, (U) nullptr, stP, (T) nullptr, ldb, stB, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, 0, dA, lda, stA, dIpiv, stP, (T) nullptr, ldb, stB, bc), rocblas_status_success); // quick return with zero 
batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, 0), rocblas_status_success); } template void testing_getrs_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_stride stB = 1; rocblas_int bc = 1; rocblas_operation trans = rocblas_operation_none; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments getrs_checkBadArgs(handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments getrs_checkBadArgs(handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc); } } template void getrs_initData(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Uh& hIpiv, Th& hB) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } // do the LU decomposition of matrix A w/ the reference LAPACK routine for(rocblas_int b = 0; b < bc; ++b) { int 
info; cpu_getrf(n, n, hA[b], lda, hIpiv[b], &info); } } if(GPU) { // now copy pivoting indices and matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void getrs_getError(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Uh& hIpiv, Th& hB, Th& hBRes, double* max_err) { // input data initialization getrs_initData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_getrs(trans, n, nrhs, hA[b], lda, hIpiv[b], hB[b], ldb); } // error is ||hB - hBRes|| / ||hB|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('I', n, nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? 
err : *max_err; } } template void getrs_getPerfData(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Uh& hIpiv, Th& hB, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { getrs_initData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_getrs(trans, n, nrhs, hA[b], lda, hIpiv[b], hB[b], ldb); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getrs_initData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB); // cold calls for(int iter = 0; iter < 2; iter++) { getrs_initData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB); CHECK_ROCBLAS_ERROR(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { getrs_initData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB); start = get_time_us_sync(stream); rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } 
template void testing_getrs(Arguments& argus) { // get arguments rocblas_local_handle handle; char transC = argus.get("trans"); rocblas_int n = argus.get("n"); rocblas_int nrhs = argus.get("nrhs", n); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_B : 0; // check invalid sizes bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_getrs(STRIDED, handle, trans, n, nrhs, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldb, stB, bc)); else CHECK_ALLOC_QUERY(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldb, stB, bc)); size_t size; 
CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getrs_getError(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, hBRes, &max_error); // collect performance data if(argus.timing) getrs_getPerfData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) 
{ EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getrs_getError(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, hBRes, &max_error); // collect performance data if(argus.timing) getrs_getPerfData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using m * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("trans", "n", "nrhs", "lda", "ldb", "strideP", "batch_c"); rocsolver_bench_output(transC, n, nrhs, lda, ldb, stP, bc); } else if(STRIDED) { rocsolver_bench_output("trans", "n", "nrhs", "lda", "ldb", "strideA", "strideP", "strideB", "batch_c"); rocsolver_bench_output(transC, n, nrhs, lda, ldb, stA, stP, stB, bc); } else { rocsolver_bench_output("trans", "n", "nrhs", "lda", "ldb"); rocsolver_bench_output(transC, n, nrhs, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETRS(...) 
extern template void testing_getrs<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETRS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_labrd.hpp000066400000000000000000000425111436600607200226230ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void labrd_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, T dA, const rocblas_int lda, S dD, S dE, U dTauq, U dTaup, T dX, const rocblas_int ldx, T dY, const rocblas_int ldy) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_labrd(nullptr, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, (T) nullptr, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, (S) nullptr, dE, dTauq, dTaup, dX, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, dD, (S) nullptr, dTauq, dTaup, dX, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, dD, dE, (U) nullptr, dTaup, dX, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, dD, dE, dTauq, (U) nullptr, dX, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, (T) nullptr, ldx, dY, ldy), 
rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, (T) nullptr, ldy), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, 0, n, 0, (T) nullptr, lda, dD, dE, dTauq, dTaup, (T) nullptr, ldx, dY, ldy), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, m, 0, 0, (T) nullptr, lda, dD, dE, dTauq, dTaup, dX, ldx, (T) nullptr, ldy), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, m, n, 0, dA, lda, (S) nullptr, (S) nullptr, (U) nullptr, (U) nullptr, (T) nullptr, ldx, (T) nullptr, ldy), rocblas_status_success); } template void testing_labrd_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int nb = 1; rocblas_int lda = 1; rocblas_int ldx = 1; rocblas_int ldy = 1; // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dTauq(1, 1, 1, 1); device_strided_batch_vector dTaup(1, 1, 1, 1); device_strided_batch_vector dX(1, 1, 1, 1); device_strided_batch_vector dY(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dTauq.memcheck()); CHECK_HIP_ERROR(dTaup.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); CHECK_HIP_ERROR(dY.memcheck()); // check bad arguments labrd_checkBadArgs(handle, m, n, nb, dA.data(), lda, dD.data(), dE.data(), dTauq.data(), dTaup.data(), dX.data(), ldx, dY.data(), ldy); } template void labrd_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, Td& dA, const rocblas_int lda, Sd& dD, Sd& dE, Ud& dTauq, Ud& dTaup, Td& dX, const rocblas_int ldx, Td& dY, const rocblas_int ldy, Th& hA, Sh& hD, Sh& hE, Uh& hTauq, Uh& hTaup, Th& hX, Th& hY) { if(CPU) { 
rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j || (m >= n && j == i + 1) || (m < n && i == j + 1)) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void labrd_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, Td& dA, const rocblas_int lda, Sd& dD, Sd& dE, Ud& dTauq, Ud& dTaup, Td& dX, const rocblas_int ldx, Td& dY, const rocblas_int ldy, Th& hA, Th& hARes, Sh& hD, Sh& hE, Uh& hTauq, Uh& hTaup, Th& hX, Th& hXRes, Th& hY, Th& hYRes, double* max_err) { // input data initialization labrd_initData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_labrd(handle, m, n, nb, dA.data(), lda, dD.data(), dE.data(), dTauq.data(), dTaup.data(), dX.data(), ldx, dY.data(), ldy)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hXRes.transfer_from(dX)); CHECK_HIP_ERROR(hYRes.transfer_from(dY)); // CPU lapack cpu_labrd(m, n, nb, hA[0], lda, hD[0], hE[0], hTauq[0], hTaup[0], hX[0], ldx, hY[0], ldy); // error is max(||hA - hARes|| / ||hA||, ||hX - hXRes|| / ||hX||, ||hY - // hYRes|| / ||hY||) (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY // ISSUES. IT MIGHT BE REVISITED IN THE FUTURE) using frobenius norm double err; *max_err = 0; err = norm_error('F', m, n, lda, hA[0], hARes[0]); *max_err = err > *max_err ? err : *max_err; err = norm_error('F', m - nb, nb, ldx, hX[0] + nb, hXRes[0] + nb); *max_err = err > *max_err ? err : *max_err; err = norm_error('F', n - nb, nb, ldy, hY[0] + nb, hYRes[0] + nb); *max_err = err > *max_err ? 
err : *max_err; } template void labrd_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, Td& dA, const rocblas_int lda, Sd& dD, Sd& dE, Ud& dTauq, Ud& dTaup, Td& dX, const rocblas_int ldx, Td& dY, const rocblas_int ldy, Th& hA, Sh& hD, Sh& hE, Uh& hTauq, Uh& hTaup, Th& hX, Th& hY, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { labrd_initData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY); // cpu-lapack performance *cpu_time_used = get_time_us_no_sync(); memset(hX[0], 0, ldx * nb * sizeof(T)); memset(hY[0], 0, ldy * nb * sizeof(T)); cpu_labrd(m, n, nb, hA[0], lda, hD[0], hE[0], hTauq[0], hTaup[0], hX[0], ldx, hY[0], ldy); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } labrd_initData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY); // cold calls for(int iter = 0; iter < 2; iter++) { labrd_initData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY); CHECK_ROCBLAS_ERROR(rocsolver_labrd(handle, m, n, nb, dA.data(), lda, dD.data(), dE.data(), dTauq.data(), dTaup.data(), dX.data(), ldx, dY.data(), ldy)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { labrd_initData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY); start = get_time_us_sync(stream); rocsolver_labrd(handle, m, n, nb, dA.data(), lda, dD.data(), dE.data(), dTauq.data(), 
dTaup.data(), dX.data(), ldx, dY.data(), ldy); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_labrd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int nb = argus.get("k", min(m, n)); rocblas_int lda = argus.get("lda", m); rocblas_int ldx = argus.get("ldx", m); rocblas_int ldy = argus.get("ldy", n); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = lda * n; size_t size_D = nb; size_t size_E = nb; size_t size_Q = nb; size_t size_P = nb; size_t size_X = ldx * nb; size_t size_Y = ldy * nb; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_XRes = (argus.unit_check || argus.norm_check) ? size_X : 0; size_t size_YRes = (argus.unit_check || argus.norm_check) ? 
size_Y : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || nb < 0 || nb > min(m, n) || lda < m || ldx < m || ldy < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, m, n, nb, (T*)nullptr, lda, (S*)nullptr, (S*)nullptr, (T*)nullptr, (T*)nullptr, (T*)nullptr, ldx, (T*)nullptr, ldy), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_labrd(handle, m, n, nb, (T*)nullptr, lda, (S*)nullptr, (S*)nullptr, (T*)nullptr, (T*)nullptr, (T*)nullptr, ldx, (T*)nullptr, ldy)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hARes(size_ARes, 1, size_ARes, 1); host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hTauq(size_Q, 1, size_Q, 1); host_strided_batch_vector hTaup(size_P, 1, size_P, 1); host_strided_batch_vector hX(size_X, 1, size_X, 1); host_strided_batch_vector hXRes(size_XRes, 1, size_XRes, 1); host_strided_batch_vector hY(size_Y, 1, size_Y, 1); host_strided_batch_vector hYRes(size_YRes, 1, size_YRes, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dTauq(size_Q, 1, size_Q, 1); device_strided_batch_vector dTaup(size_P, 1, size_P, 1); device_strided_batch_vector dX(size_X, 1, size_X, 1); device_strided_batch_vector dY(size_Y, 1, size_Y, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_D) 
CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_Q) CHECK_HIP_ERROR(dTauq.memcheck()); if(size_P) CHECK_HIP_ERROR(dTaup.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); if(size_Y) CHECK_HIP_ERROR(dY.memcheck()); // check quick return if(m == 0 || n == 0 || nb == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, m, n, nb, dA.data(), lda, dD.data(), dE.data(), dTauq.data(), dTaup.data(), dX.data(), ldx, dY.data(), ldy), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) labrd_getError(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hARes, hD, hE, hTauq, hTaup, hX, hXRes, hY, hYRes, &max_error); // collect performance data if(argus.timing) labrd_getPerfData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using nb * max(m,n) * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, nb * max(m, n)); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("m", "n", "nb", "lda", "ldx", "ldy"); rocsolver_bench_output(m, n, nb, lda, ldx, ldy); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LABRD(...) 
extern template void testing_labrd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LABRD, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_lacgv.hpp000066400000000000000000000177341436600607200226440ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void lacgv_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int inc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_lacgv(nullptr, n, dA, inc), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_lacgv(handle, n, (T) nullptr, inc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_lacgv(handle, 0, (T) nullptr, inc), rocblas_status_success); } template void testing_lacgv_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int inc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments lacgv_checkBadArgs(handle, n, dA.data(), inc); } template void lacgv_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int inc, Th& hA) { if(CPU) { rocblas_init(hA, true); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void lacgv_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int inc, Th& hA, Th& hAr, double* max_err) { // initialize data lacgv_initData(handle, n, dA, inc, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_lacgv(handle, n, 
dA.data(), inc)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_lacgv(n, hA[0], inc); // error |hA - hAr| (elements must be identical) *max_err = 0; double diff; for(int j = 0; j < n; j++) { diff = std::abs(hAr[0][j * abs(inc)] - hA[0][j * abs(inc)]); *max_err = diff > *max_err ? diff : *max_err; } } template void lacgv_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int inc, Th& hA, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { lacgv_initData(handle, n, dA, inc, hA); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_lacgv(n, hA[0], inc); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } lacgv_initData(handle, n, dA, inc, hA); // cold calls for(int iter = 0; iter < 2; iter++) { lacgv_initData(handle, n, dA, inc, hA); CHECK_ROCBLAS_ERROR(rocsolver_lacgv(handle, n, dA.data(), inc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { lacgv_initData(handle, n, dA, inc, hA); start = get_time_us_sync(stream); rocsolver_lacgv(handle, n, dA.data(), inc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_lacgv(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int inc = argus.get("incx"); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = size_t(n) * abs(inc); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = 
(argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || !inc); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_lacgv(handle, n, (T*)nullptr, inc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_lacgv(handle, n, (T*)nullptr, inc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_lacgv(handle, n, dA.data(), inc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) lacgv_getError(handle, n, dA, inc, hA, hAr, &max_error); // collect performance data if(argus.timing) lacgv_getPerfData(handle, n, dA, inc, hA, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // no tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 0); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("n", "inc"); rocsolver_bench_output(n, inc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { 
rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LACGV(...) extern template void testing_lacgv<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LACGV, FOREACH_COMPLEX_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_larf.hpp000066400000000000000000000310111436600607200224540ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void larf_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_int m, const rocblas_int n, T dx, const rocblas_int inc, T dt, T dA, const rocblas_int lda) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_larf(nullptr, side, m, n, dx, inc, dt, dA, lda), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, rocblas_side_both, m, n, dx, inc, dt, dA, lda), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, side, m, n, (T) nullptr, inc, dt, dA, lda), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, side, m, n, dx, inc, (T) nullptr, dA, lda), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, side, m, n, dx, inc, dt, (T) nullptr, lda), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, rocblas_side_left, 0, n, (T) nullptr, 
inc, (T) nullptr, (T) nullptr, lda), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, rocblas_side_right, m, 0, (T) nullptr, inc, (T) nullptr, (T) nullptr, lda), rocblas_status_success); } template void testing_larf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_int m = 1; rocblas_int n = 1; rocblas_int inc = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dx(1, 1, 1, 1); device_strided_batch_vector dt(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dx.memcheck()); CHECK_HIP_ERROR(dt.memcheck()); // check bad arguments larf_checkBadArgs(handle, side, m, n, dx.data(), inc, dt.data(), dA.data(), lda); } template void larf_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_int m, const rocblas_int n, Td& dx, const rocblas_int inc, Td& dt, Td& dA, const rocblas_int lda, Th& xx, Th& hx, Th& ht, Th& hA) { if(CPU) { rocblas_int order = xx.n(); rocblas_init(hA, true); rocblas_init(xx, true); // compute householder reflector cpu_larfg(order, xx[0], xx[0] + abs(inc), abs(inc), ht[0]); xx[0][0] = 1; for(rocblas_int i = 0; i < order; i++) { if(inc < 0) hx[0][i * abs(inc)] = xx[0][(order - 1 - i) * abs(inc)]; else hx[0][i * inc] = xx[0][i * inc]; } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dx.transfer_from(hx)); CHECK_HIP_ERROR(dt.transfer_from(ht)); } } template void larf_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_int m, const rocblas_int n, Td& dx, const rocblas_int inc, Td& dt, Td& dA, const rocblas_int lda, Th& xx, Th& hx, Th& ht, Th& hA, Th& hAr, double* max_err) { size_t size_w = (side == rocblas_side_left) ? 
size_t(n) : size_t(m); std::vector hw(size_w); // initialize data larf_initData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_larf(handle, side, m, n, dx.data(), inc, dt.data(), dA.data(), lda)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_larf(side, m, n, hx[0], inc, ht[0], hA[0], lda, hw.data()); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void larf_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_int m, const rocblas_int n, Td& dx, const rocblas_int inc, Td& dt, Td& dA, const rocblas_int lda, Th& xx, Th& hx, Th& ht, Th& hA, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_w = (side == rocblas_side_left) ? 
size_t(n) : size_t(m); std::vector hw(size_w); if(!perf) { larf_initData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_larf(side, m, n, hx[0], inc, ht[0], hA[0], lda, hw.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } larf_initData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA); // cold calls for(int iter = 0; iter < 2; iter++) { larf_initData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA); CHECK_ROCBLAS_ERROR( rocsolver_larf(handle, side, m, n, dx.data(), inc, dt.data(), dA.data(), lda)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { larf_initData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA); start = get_time_us_sync(stream); rocsolver_larf(handle, side, m, n, dx.data(), inc, dt.data(), dA.data(), lda); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_larf(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int inc = argus.get("incx"); rocblas_int lda = argus.get("lda", m); rocblas_side side = char2rocblas_side(sideC); rocblas_int hot_calls = argus.iters; // check non-supported values if(side != rocblas_side_left && side != rocblas_side_right) { EXPECT_ROCBLAS_STATUS( rocsolver_larf(handle, side, m, n, (T*)nullptr, inc, (T*)nullptr, (T*)nullptr, lda), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes 
bool left = (side == rocblas_side_left); size_t size_A = size_t(lda) * n; size_t size_x = left ? size_t(m) : size_t(n); size_t stx = size_x * abs(inc); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || !inc || lda < m); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_larf(handle, side, m, n, (T*)nullptr, inc, (T*)nullptr, (T*)nullptr, lda), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY( rocsolver_larf(handle, side, m, n, (T*)nullptr, inc, (T*)nullptr, (T*)nullptr, lda)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hx(size_x, abs(inc), stx, 1); host_strided_batch_vector xx(size_x, abs(inc), stx, 1); host_strided_batch_vector ht(1, 1, 1, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dx(size_x, abs(inc), stx, 1); device_strided_batch_vector dt(1, 1, 1, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_x) CHECK_HIP_ERROR(dx.memcheck()); CHECK_HIP_ERROR(dt.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_larf(handle, side, m, n, dx.data(), inc, dt.data(), dA.data(), lda), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) larf_getError(handle, side, m, n, dx, inc, 
dt, dA, lda, xx, hx, ht, hA, hAr, &max_error); // collect performance data if(argus.timing) larf_getPerfData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using size_x * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, size_x); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "m", "n", "inc", "lda"); rocsolver_bench_output(sideC, m, n, inc, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LARF(...) extern template void testing_larf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LARF, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_larfb.hpp000066400000000000000000000463041436600607200226310ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void larfb_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dV, const rocblas_int ldv, T dT, const rocblas_int ldt, T dA, const rocblas_int lda) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_larfb(nullptr, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, rocblas_side(0), trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, rocblas_operation(0), direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, rocblas_direct(0), storev, m, n, k, dV, ldv, dT, ldt, dA, lda), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, rocblas_storev(0), m, n, k, dV, ldv, dT, ldt, dA, lda), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, (T) nullptr, ldv, dT, ldt, dA, lda), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV, ldv, (T) nullptr, ldt, dA, lda), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, (T) nullptr, lda), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, rocblas_side_left, trans, direct, 
storev, 0, n, k, (T) nullptr, ldv, dT, ldt, (T) nullptr, lda), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, rocblas_side_right, trans, direct, storev, m, 0, k, (T) nullptr, ldv, dT, ldt, (T) nullptr, lda), rocblas_status_success); } template void testing_larfb_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_operation trans = rocblas_operation_none; rocblas_direct direct = rocblas_forward_direction; rocblas_storev storev = rocblas_column_wise; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int ldv = 1; rocblas_int ldt = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dT(1, 1, 1, 1); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dT.memcheck()); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments larfb_checkBadArgs(handle, side, trans, direct, storev, m, n, k, dV.data(), ldv, dT.data(), ldt, dA.data(), lda); } template void larfb_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dT, const rocblas_int ldt, Td& dA, const rocblas_int lda, Th& hV, Th& hT, Th& hA, std::vector& hW, size_t sizeW) { if(CPU) { bool left = (side == rocblas_side_left); bool forward = (direct == rocblas_forward_direction); bool column = (storev == rocblas_column_wise); std::vector htau(k); rocblas_init(hV, true); rocblas_init(hA, true); rocblas_init(hT, true); // scale to avoid singularities // create householder reflectors and triangular factor if(left) { if(column) { for(int i = 0; i < m; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(forward) cpu_geqrf(m, k, hV[0], ldv, htau.data(), hW.data(), sizeW); else cpu_geqlf(m, 
k, hV[0], ldv, htau.data(), hW.data(), sizeW); } else { for(int i = 0; i < k; ++i) { for(int j = 0; j < m; ++j) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(forward) cpu_gelqf(k, m, hV[0], ldv, htau.data(), hW.data(), sizeW); else cpu_gerqf(k, m, hV[0], ldv, htau.data(), hW.data(), sizeW); } cpu_larft(direct, storev, m, k, hV[0], ldv, htau.data(), hT[0], ldt); } else { if(column) { for(int i = 0; i < n; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(forward) cpu_geqrf(n, k, hV[0], ldv, htau.data(), hW.data(), sizeW); else cpu_geqlf(n, k, hV[0], ldv, htau.data(), hW.data(), sizeW); } else { for(int i = 0; i < k; ++i) { for(int j = 0; j < n; ++j) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(forward) cpu_gelqf(k, n, hV[0], ldv, htau.data(), hW.data(), sizeW); else cpu_gerqf(k, n, hV[0], ldv, htau.data(), hW.data(), sizeW); } cpu_larft(direct, storev, n, k, hV[0], ldv, htau.data(), hT[0], ldt); } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dV.transfer_from(hV)); CHECK_HIP_ERROR(dT.transfer_from(hT)); CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void larfb_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dT, const rocblas_int ldt, Td& dA, const rocblas_int lda, Th& hV, Th& hT, Th& hA, Th& hAr, double* max_err) { bool left = (side == rocblas_side_left); rocblas_int ldw = left ? 
n : m; size_t sizeW = size_t(ldw) * k; std::vector hW(sizeW); // initialize data larfb_initData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hW, sizeW); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV.data(), ldv, dT.data(), ldt, dA.data(), lda)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_larfb(side, trans, direct, storev, m, n, k, hV[0], ldv, hT[0], ldt, hA[0], lda, hW.data(), ldw); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void larfb_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dT, const rocblas_int ldt, Td& dA, const rocblas_int lda, Th& hV, Th& hT, Th& hA, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { bool left = (side == rocblas_side_left); rocblas_int ldw = left ? 
n : m; size_t sizeW = size_t(ldw) * k; std::vector hW(sizeW); if(!perf) { larfb_initData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hW, sizeW); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_larfb(side, trans, direct, storev, m, n, k, hV[0], ldv, hT[0], ldt, hA[0], lda, hW.data(), ldw); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } larfb_initData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hW, sizeW); // cold calls for(int iter = 0; iter < 2; iter++) { larfb_initData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hW, sizeW); CHECK_ROCBLAS_ERROR(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV.data(), ldv, dT.data(), ldt, dA.data(), lda)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { larfb_initData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hW, sizeW); start = get_time_us_sync(stream); rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV.data(), ldv, dT.data(), ldt, dA.data(), lda); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_larfb(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char transC = argus.get("trans"); char directC = argus.get("direct"); char storevC = argus.get("storev"); rocblas_int k = argus.get("k"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int ldv = argus.get("ldv", storevC == 'R' ? 
k : (sideC == 'L' ? m : n)); rocblas_int lda = argus.get("lda", m); rocblas_int ldt = argus.get("ldt", k); rocblas_side side = char2rocblas_side(sideC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_direct direct = char2rocblas_direct(directC); rocblas_storev storev = char2rocblas_storev(storevC); rocblas_int hot_calls = argus.iters; // check non-supported values if(side != rocblas_side_left && side != rocblas_side_right) { EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, (T*)nullptr, ldv, (T*)nullptr, ldt, (T*)nullptr, lda), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool row = (storev == rocblas_row_wise); bool left = (side == rocblas_side_left); size_t size_V = size_t(ldv) * k; if(row) size_V = left ? size_t(ldv) * m : size_t(ldv) * n; size_t size_T = size_t(ldt) * k; size_t size_A = size_t(lda) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || k < 1 || ldt < k || lda < m || (row && ldv < k) || (!row && !left && ldv < n) || (!row && left && ldv < m)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, (T*)nullptr, ldv, (T*)nullptr, ldt, (T*)nullptr, lda), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, (T*)nullptr, ldv, (T*)nullptr, ldt, (T*)nullptr, lda)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hT(size_T, 1, size_T, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hV(size_V, 1, size_V, 1); device_strided_batch_vector dT(size_T, 1, size_T, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dV(size_V, 1, size_V, 1); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_T) CHECK_HIP_ERROR(dT.memcheck()); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV.data(), ldv, dT.data(), ldt, dA.data(), lda), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) larfb_getError(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hAr, &max_error); // collect performance data if(argus.timing) 
larfb_getPerfData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "trans", "direct", "storev", "m", "n", "k", "ldv", "ldt", "lda"); rocsolver_bench_output(sideC, transC, directC, storevC, m, n, k, ldv, ldt, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LARFB(...) extern template void testing_larfb<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LARFB, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_larfg.hpp000066400000000000000000000237721436600607200226420ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void larfg_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T da, T dx, const rocblas_int inc, T dt) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_larfg(nullptr, n, da, dx, inc, dt), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, n, (T) nullptr, dx, inc, dt), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, n, da, (T) nullptr, inc, dt), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, n, da, dx, inc, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, 0, (T) nullptr, (T) nullptr, inc, (T) nullptr), rocblas_status_success); } template void testing_larfg_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 2; rocblas_int inc = 1; // memory allocation device_strided_batch_vector da(1, 1, 1, 1); device_strided_batch_vector dx(1, 1, 1, 1); device_strided_batch_vector dt(1, 1, 1, 1); CHECK_HIP_ERROR(da.memcheck()); CHECK_HIP_ERROR(dx.memcheck()); CHECK_HIP_ERROR(dt.memcheck()); // check bad arguments larfg_checkBadArgs(handle, n, da.data(), dx.data(), inc, dt.data()); } template void larfg_initData(const rocblas_handle handle, const rocblas_int n, Td& da, Td& dx, const rocblas_int inc, Td& dt, Th& ha, Th& hx, Th& ht) { if(CPU) { rocblas_init(ha, true); rocblas_init(hx, true); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(da.transfer_from(ha)); CHECK_HIP_ERROR(dx.transfer_from(hx)); } } template void larfg_getError(const rocblas_handle handle, const rocblas_int n, Td& da, Td& dx, const rocblas_int inc, Td& dt, Th& ha, Th& hx, Th& 
hxr, Th& ht, double* max_err) { // initialize data larfg_initData(handle, n, da, dx, inc, dt, ha, hx, ht); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_larfg(handle, n, da.data(), dx.data(), inc, dt.data())); CHECK_HIP_ERROR(hxr.transfer_from(dx)); // CPU lapack cpu_larfg(n, ha[0], hx[0], inc, ht[0]); // error is ||hx - hxr|| (not necessary to check tau, for now) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using norm-1 which is infinity norm for this data setup *max_err = norm_error('O', 1, n - 1, inc, hx[0], hxr[0]); } template void larfg_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& da, Td& dx, const rocblas_int inc, Td& dt, Th& ha, Th& hx, Th& ht, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { larfg_initData(handle, n, da, dx, inc, dt, ha, hx, ht); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_larfg(n, ha[0], hx[0], inc, ht[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } larfg_initData(handle, n, da, dx, inc, dt, ha, hx, ht); // cold calls for(int iter = 0; iter < 2; iter++) { larfg_initData(handle, n, da, dx, inc, dt, ha, hx, ht); CHECK_ROCBLAS_ERROR(rocsolver_larfg(handle, n, da.data(), dx.data(), inc, dt.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { larfg_initData(handle, n, da, dx, inc, dt, ha, hx, ht); start = get_time_us_sync(stream); rocsolver_larfg(handle, n, da.data(), dx.data(), inc, dt.data()); *gpu_time_used += 
get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_larfg(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int inc = argus.get("incx"); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes // size_x could be zero in test cases that are not quick-return or invalid // cases setting it to one to avoid possible memory access errors in the rest // of the unit test size_t size_x = n > 1 ? size_t(n - 1) : 1; size_t stx = size_x * inc; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_xr = (argus.unit_check || argus.norm_check) ? size_x : 0; size_t stxr = (argus.unit_check || argus.norm_check) ? stx : 0; // check invalid sizes bool invalid_size = (n < 0 || inc < 1); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, n, (T*)nullptr, (T*)nullptr, inc, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_larfg(handle, n, (T*)nullptr, (T*)nullptr, inc, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hx(size_x, inc, stx, 1); host_strided_batch_vector hxr(size_xr, inc, stxr, 1); host_strided_batch_vector ha(1, 1, 1, 1); host_strided_batch_vector ht(1, 1, 1, 1); device_strided_batch_vector dx(size_x, inc, stx, 1); device_strided_batch_vector da(1, 1, 1, 1); device_strided_batch_vector dt(1, 1, 1, 1); CHECK_HIP_ERROR(da.memcheck()); if(size_x) CHECK_HIP_ERROR(dx.memcheck()); CHECK_HIP_ERROR(dt.memcheck()); // check quick 
return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, n, da.data(), dx.data(), inc, dt.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) larfg_getError(handle, n, da, dx, inc, dt, ha, hx, hxr, ht, &max_error); // collect performance data if(argus.timing) larfg_getPerfData(handle, n, da, dx, inc, dt, ha, hx, ht, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("n", "inc"); rocsolver_bench_output(n, inc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LARFG(...) extern template void testing_larfg<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LARFG, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_larft.hpp000066400000000000000000000335641436600607200226570ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void larft_checkBadArgs(const rocblas_handle handle, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int n, const rocblas_int k, T dV, const rocblas_int ldv, T dt, T dT, const rocblas_int ldt) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_larft(nullptr, direct, storev, n, k, dV, ldv, dt, dT, ldt), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS( rocsolver_larft(handle, rocblas_direct(0), storev, n, k, dV, ldv, dt, dT, ldt), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS( rocsolver_larft(handle, direct, rocblas_storev(0), n, k, dV, ldv, dt, dT, ldt), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, n, k, (T) nullptr, ldv, dt, dT, ldt), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, n, k, dV, ldv, (T) nullptr, dT, ldt), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, n, k, dV, ldv, dt, (T) nullptr, ldt), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, 0, k, (T) nullptr, ldv, dt, dT, ldt), rocblas_status_success); } template void testing_larft_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_direct direct = rocblas_forward_direction; rocblas_storev storev = rocblas_column_wise; rocblas_int k = 1; rocblas_int n = 1; rocblas_int ldv = 1; rocblas_int ldt = 1; // memory allocation device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dt(1, 1, 1, 1); device_strided_batch_vector dT(1, 1, 1, 1); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dT.memcheck()); 
CHECK_HIP_ERROR(dt.memcheck()); // check bad arguments larft_checkBadArgs(handle, direct, storev, n, k, dV.data(), ldv, dt.data(), dT.data(), ldt); } template void larft_initData(const rocblas_handle handle, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dt, Td& dT, const rocblas_int ldt, Th& hV, Th& ht, Th& hT, std::vector& hw, size_t size_w) { if(CPU) { rocblas_init(hV, true); // scale to avoid singularities // and create householder reflectors if(storev == rocblas_column_wise) { for(int j = 0; j < k; ++j) { for(int i = 0; i < n; ++i) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(direct == rocblas_forward_direction) cpu_geqrf(n, k, hV[0], ldv, ht[0], hw.data(), k); else cpu_geqlf(n, k, hV[0], ldv, ht[0], hw.data(), k); } else { for(int j = 0; j < n; ++j) { for(int i = 0; i < k; ++i) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(direct == rocblas_forward_direction) cpu_gelqf(k, n, hV[0], ldv, ht[0], hw.data(), k); else cpu_gerqf(k, n, hV[0], ldv, ht[0], hw.data(), k); } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dV.transfer_from(hV)); CHECK_HIP_ERROR(dt.transfer_from(ht)); } } template void larft_getError(const rocblas_handle handle, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dt, Td& dT, const rocblas_int ldt, Th& hV, Th& ht, Th& hT, Th& hTr, double* max_err) { size_t size_w = size_t(k); std::vector hw(size_w); // initialize data larft_initData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hw, size_w); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_larft(handle, direct, storev, n, k, dV.data(), ldv, dt.data(), dT.data(), ldt)); CHECK_HIP_ERROR(hTr.transfer_from(dT)); // CPU lapack cpu_larft(direct, storev, n, k, hV[0], ldv, ht[0], hT[0], ldt); // error is ||hT - hTr|| / 
||hT|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = (direct == rocblas_forward_direction) ? norm_error_upperTr('F', k, k, ldt, hT[0], hTr[0]) : norm_error_lowerTr('F', k, k, ldt, hT[0], hTr[0]); } template void larft_getPerfData(const rocblas_handle handle, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dt, Td& dT, const rocblas_int ldt, Th& hV, Th& ht, Th& hT, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_w = size_t(k); std::vector hw(size_w); if(!perf) { larft_initData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hw, size_w); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_larft(direct, storev, n, k, hV[0], ldv, ht[0], hT[0], ldt); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } larft_initData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hw, size_w); // cold calls for(int iter = 0; iter < 2; iter++) { larft_initData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hw, size_w); CHECK_ROCBLAS_ERROR(rocsolver_larft(handle, direct, storev, n, k, dV.data(), ldv, dt.data(), dT.data(), ldt)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { larft_initData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hw, size_w); start = get_time_us_sync(stream); rocsolver_larft(handle, direct, storev, n, k, dV.data(), 
ldv, dt.data(), dT.data(), ldt); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_larft(Arguments& argus) { // get arguments rocblas_local_handle handle; char directC = argus.get("direct"); char storevC = argus.get("storev"); rocblas_int k = argus.get("k"); rocblas_int n = argus.get("n"); rocblas_int ldv = argus.get("ldv", storevC == 'C' ? n : k); rocblas_int ldt = argus.get("ldt", k); rocblas_direct direct = char2rocblas_direct(directC); rocblas_storev storev = char2rocblas_storev(storevC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes bool row = (storev == rocblas_row_wise); size_t size_T = size_t(ldt) * k; size_t size_tau = size_t(k); size_t size_V = row ? size_t(ldv) * n : size_t(ldv) * k; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Tr = (argus.unit_check || argus.norm_check) ? size_T : 0; // check invalid sizes bool invalid_size = (n < 0 || k < 1 || ldt < k || (row && ldv < k) || (!row && ldv < n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, n, k, (T*)nullptr, ldv, (T*)nullptr, (T*)nullptr, ldt), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_larft(handle, direct, storev, n, k, (T*)nullptr, ldv, (T*)nullptr, (T*)nullptr, ldt)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hT(size_T, 1, size_T, 1); host_strided_batch_vector hTr(size_Tr, 1, size_Tr, 1); host_strided_batch_vector ht(size_tau, 1, size_tau, 1); 
host_strided_batch_vector hV(size_V, 1, size_V, 1); device_strided_batch_vector dT(size_T, 1, size_T, 1); device_strided_batch_vector dt(size_tau, 1, size_tau, 1); device_strided_batch_vector dV(size_V, 1, size_V, 1); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_T) CHECK_HIP_ERROR(dT.memcheck()); if(size_tau) CHECK_HIP_ERROR(dt.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, n, k, dV.data(), ldv, dt.data(), dT.data(), ldt), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) larft_getError(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hTr, &max_error); // collect performance data if(argus.timing) larft_getPerfData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("direct", "storev", "n", "k", "ldv", "ldt"); rocsolver_bench_output(directC, storevC, n, k, ldv, ldt); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LARFT(...) 
extern template void testing_larft<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LARFT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_laswp.hpp000066400000000000000000000253271436600607200226730ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void laswp_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_int k1, const rocblas_int k2, U dIpiv, const rocblas_int inc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_laswp(nullptr, n, dA, lda, k1, k2, dIpiv, inc), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_laswp(handle, n, (T) nullptr, lda, k1, k2, dIpiv, inc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_laswp(handle, n, dA, lda, k1, k2, (U) nullptr, inc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_laswp(handle, 0, (T) nullptr, lda, k1, k2, dIpiv, inc), rocblas_status_success); } template void testing_laswp_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int k1 = 1; rocblas_int k2 = 2; rocblas_int inc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments laswp_checkBadArgs(handle, n, dA.data(), lda, k1, k2, dIpiv.data(), inc); } template void laswp_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, 
const rocblas_int k1, const rocblas_int k2, Ud& dIpiv, const rocblas_int inc, Th& hA, Uh& hIpiv) { if(CPU) { // for simplicity consider number of rows m = lda rocblas_init(hA, true); rocblas_init(hIpiv, true); // put indices in range [1, x] // for simplicity, consider x = lda as this is the number of rows for(rocblas_int i = 0; i < hIpiv.n(); ++i) hIpiv[0][i] = hIpiv[0][i] * lda < 10 ? 1 : hIpiv[0][i] * lda / 10; } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void laswp_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int k1, const rocblas_int k2, Ud& dIpiv, const rocblas_int inc, Th& hA, Th& hAr, Uh& hIpiv, double* max_err) { // initialize data laswp_initData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_laswp(handle, n, dA.data(), lda, k1, k2, dIpiv.data(), inc)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_laswp(n, hA[0], lda, k1, k2, hIpiv[0], inc); // error |hA - hAr| (elements must be identical) *max_err = 0; double diff; for(int i = 0; i < lda; i++) { for(int j = 0; j < n; j++) { diff = std::abs(hAr[0][i + j * lda] - hA[0][i + j * lda]); *max_err = diff > *max_err ? 
diff : *max_err; } } } template void laswp_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int k1, const rocblas_int k2, Ud& dIpiv, const rocblas_int inc, Th& hA, Uh& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { laswp_initData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_laswp(n, hA[0], lda, k1, k2, hIpiv[0], inc); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } laswp_initData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv); // cold calls for(int iter = 0; iter < 2; iter++) { laswp_initData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv); CHECK_ROCBLAS_ERROR(rocsolver_laswp(handle, n, dA.data(), lda, k1, k2, dIpiv.data(), inc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { laswp_initData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv); start = get_time_us_sync(stream); rocsolver_laswp(handle, n, dA.data(), lda, k1, k2, dIpiv.data(), inc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_laswp(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int k1 = argus.get("k1"); rocblas_int k2 = argus.get("k2"); rocblas_int lda = argus.get("lda", k2); rocblas_int inc = argus.get("incx"); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; 
size_t size_P = k1 + size_t(k2 - k1) * abs(inc); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < 1 || !inc || k1 < 1 || k2 < 1 || k2 < k1); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_laswp(handle, n, (T*)nullptr, lda, k1, k2, (rocblas_int*)nullptr, inc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY( rocsolver_laswp(handle, n, (T*)nullptr, lda, k1, k2, (rocblas_int*)nullptr, inc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_laswp(handle, n, dA.data(), lda, k1, k2, dIpiv.data(), inc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) laswp_getError(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hAr, hIpiv, &max_error); // collect performance data if(argus.timing) laswp_getPerfData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for 
rocsolver-test // no tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 0); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("n", "lda", "k1", "k2", "inc"); rocsolver_bench_output(n, lda, k1, k2, inc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LASWP(...) extern template void testing_laswp<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LASWP, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_lasyf.hpp000066400000000000000000000364131436600607200226610ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void lasyf_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nb, rocblas_int* kb, T dA, const rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(nullptr, uplo, n, nb, kb, dA, lda, ipiv, info), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(handle, rocblas_fill_full, n, nb, kb, dA, lda, ipiv, info), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_lasyf(handle, uplo, n, nb, (rocblas_int*)nullptr, dA, lda, ipiv, info), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(handle, uplo, n, nb, kb, (T) nullptr, lda, ipiv, info), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_lasyf(handle, uplo, n, nb, kb, dA, lda, (rocblas_int*)nullptr, info), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_lasyf(handle, uplo, n, nb, kb, dA, lda, ipiv, (rocblas_int*)nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_lasyf(handle, uplo, 0, 0, kb, (T) nullptr, lda, (rocblas_int*)nullptr, info), rocblas_status_success); } template void testing_lasyf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int nb = 1; rocblas_int lda = 1; // memory allocations device_strided_batch_vector dKB(1, 1, 1, 1); device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dKB.memcheck()); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); 
CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments lasyf_checkBadArgs(handle, uplo, n, nb, dKB.data(), dA.data(), lda, dIpiv.data(), dInfo.data()); } template void lasyf_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // shuffle rows to test pivoting // always the same permuation for debugging purposes for(rocblas_int i = 0; i < n / 2; i++) { for(rocblas_int j = 0; j < n; j++) { tmp = hA[0][i + j * lda]; hA[0][i + j * lda] = hA[0][n - 1 - i + j * lda]; hA[0][n - 1 - i + j * lda] = tmp; } } if(singular) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int j = n / 4; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[0][i + j * lda] = 0; hA[0][j + i * lda] = 0; } j = n / 2; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[0][i + j * lda] = 0; hA[0][j + i * lda] = 0; } j = n - 1; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[0][i + j * lda] = 0; hA[0][j + i * lda] = 0; } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void lasyf_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nb, Ud& dKB, Td& dA, const rocblas_int lda, Ud& dIpiv, Ud& dInfo, Uh& hKB, Uh& hKBRes, Th& hA, Th& hARes, Uh& hIpiv, Uh& hIpivRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { int ldw = n; int lwork = ldw * nb; std::vector work(lwork); // input data initialization lasyf_initData(handle, n, dA, lda, hA, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_lasyf(handle, uplo, n, nb, dKB.data(), 
dA.data(), lda, dIpiv.data(), dInfo.data())); CHECK_HIP_ERROR(hKBRes.transfer_from(dKB)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hIpivRes.transfer_from(dIpiv)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack cpu_lasyf(uplo, n, nb, hKB[0], hA[0], lda, hIpiv[0], work.data(), ldw, hInfo[0]); // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY // ISSUES. IT MIGHT BE REVISITED IN THE FUTURE) using frobenius norm double err; *max_err = 0; err = norm_error('F', n, n, lda, hA[0], hARes[0]); *max_err = err > *max_err ? err : *max_err; // also check pivoting (count the number of incorrect pivots) err = 0; if(uplo == rocblas_fill_upper) { for(rocblas_int i = n - hKBRes[0][0]; i < n; ++i) if(hIpiv[0][i] != hIpivRes[0][i]) err++; } else { for(rocblas_int i = 0; i < hKBRes[0][0]; ++i) if(hIpiv[0][i] != hIpivRes[0][i]) err++; } *max_err = err > *max_err ? err : *max_err; // also check kb err = 0; if(hKB[0][0] != hKBRes[0][0]) err++; *max_err += err; // also check info err = 0; if(hInfo[0][0] != hInfoRes[0][0]) err++; *max_err += err; } template void lasyf_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nb, Ud& dKB, Td& dA, const rocblas_int lda, Ud& dIpiv, Ud& dInfo, Uh& hKB, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { int ldw = n; int lwork = ldw * nb; std::vector work(lwork); if(!perf) { lasyf_initData(handle, n, dA, lda, hA, singular); // cpu-lapack performance *cpu_time_used = get_time_us_no_sync(); cpu_lasyf(uplo, n, nb, hKB[0], hA[0], lda, hIpiv[0], work.data(), ldw, hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } lasyf_initData(handle, n, dA, lda, hA, singular); // cold calls for(int iter = 0; iter < 2; iter++) { lasyf_initData(handle, n, dA, lda, hA, singular); 
CHECK_ROCBLAS_ERROR(rocsolver_lasyf(handle, uplo, n, nb, dKB.data(), dA.data(), lda, dIpiv.data(), dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { lasyf_initData(handle, n, dA, lda, hA, singular); start = get_time_us_sync(stream); rocsolver_lasyf(handle, uplo, n, nb, dKB.data(), dA.data(), lda, dIpiv.data(), dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_lasyf(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int nb = argus.get("nb", n); rocblas_int lda = argus.get("lda", n); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(handle, uplo, n, nb, (rocblas_int*)nullptr, (T*)nullptr, lda, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = lda * n; size_t size_Ipiv = n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_IpivRes = (argus.unit_check || argus.norm_check) ? 
size_Ipiv : 0; // check invalid sizes bool invalid_size = (n < 0 || nb < 0 || nb > n || lda < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(handle, uplo, n, nb, (rocblas_int*)nullptr, (T*)nullptr, lda, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_lasyf(handle, uplo, n, nb, (rocblas_int*)nullptr, (T*)nullptr, lda, (rocblas_int*)nullptr, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hKB(1, 1, 1, 1); host_strided_batch_vector hKBRes(1, 1, 1, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hARes(size_ARes, 1, size_ARes, 1); host_strided_batch_vector hIpiv(size_Ipiv, 1, size_Ipiv, 1); host_strided_batch_vector hIpivRes(size_IpivRes, 1, size_IpivRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); device_strided_batch_vector dKB(1, 1, 1, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_Ipiv, 1, size_Ipiv, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_Ipiv) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dKB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(nb == 0 || n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(handle, uplo, n, nb, dKB.data(), dA.data(), lda, dIpiv.data(), dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check 
computations if(argus.unit_check || argus.norm_check) lasyf_getError(handle, uplo, n, nb, dKB, dA, lda, dIpiv, dInfo, hKB, hKBRes, hA, hARes, hIpiv, hIpivRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) lasyf_getPerfData(handle, uplo, n, nb, dKB, dA, lda, dIpiv, dInfo, hKB, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "n", "nb", "lda"); rocsolver_bench_output(uploC, n, nb, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LASYF(...) extern template void testing_lasyf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LASYF, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_latrd.hpp000066400000000000000000000343561436600607200226550ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void latrd_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, S dE, T dTau, T dW, const rocblas_int ldw) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_latrd(nullptr, uplo, n, k, dA, lda, dE, dTau, dW, ldw), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, rocblas_fill_full, n, k, dA, lda, dE, dTau, dW, ldw), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, (T) nullptr, lda, dE, dTau, dW, ldw), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, dA, lda, (S) nullptr, dTau, dW, ldw), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, dA, lda, dE, (T) nullptr, dW, ldw), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, dA, lda, dE, dTau, (T) nullptr, ldw), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, 0, dA, lda, dE, dTau, (T) nullptr, ldw), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, 0, 0, (T) nullptr, lda, (S) nullptr, (T) nullptr, (T) nullptr, ldw), rocblas_status_success); } template void testing_latrd_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int k = 1; rocblas_int lda = 1; rocblas_int ldw = 1; // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dTau(1, 1, 1, 1); 
device_strided_batch_vector dW(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dTau.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); // check bad arguments latrd_checkBadArgs(handle, uplo, n, k, dA.data(), lda, dE.data(), dTau.data(), dW.data(), ldw); } template , int> = 0> void latrd_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j || (i == j + 1) || (i == j - 1)) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template , int> = 0> void latrd_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[0][i + j * lda] = hA[0][i + j * lda].real() + 400; else if((i == j + 1) || (i == j - 1)) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void latrd_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Sd& dE, Td& dTau, Td& dW, const rocblas_int ldw, Th& hA, Th& hARes, Sh& hE, Th& hTau, Th& hW, Th& hWRes, double* max_err) { // input data initialization latrd_initData(handle, n, dA, lda, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_latrd(handle, uplo, n, k, dA.data(), lda, dE.data(), dTau.data(), dW.data(), ldw)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); // CPU lapack cpu_latrd(uplo, n, k, hA[0], lda, hE[0], hTau[0], hW[0], ldw); // error is max(||hA - hARes|| / ||hA||, ||hW - hWRes|| / ||hW||) // 
(THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY // ISSUES. IT MIGHT BE REVISITED IN THE FUTURE) using frobenius norm double err; rocblas_int offset = (uplo == rocblas_fill_lower) ? k : 0; *max_err = 0; err = norm_error('F', n, n, lda, hA[0], hARes[0]); *max_err = err > *max_err ? err : *max_err; err = norm_error('F', n - k, k, ldw, hW[0] + offset, hWRes[0] + offset); *max_err = err > *max_err ? err : *max_err; } template void latrd_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Sd& dE, Td& dTau, Td& dW, const rocblas_int ldw, Th& hA, Sh& hE, Th& hTau, Th& hW, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { latrd_initData(handle, n, dA, lda, hA); // cpu-lapack performance *cpu_time_used = get_time_us_no_sync(); memset(hW[0], 0, ldw * k * sizeof(T)); cpu_latrd(uplo, n, k, hA[0], lda, hE[0], hTau[0], hW[0], ldw); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } latrd_initData(handle, n, dA, lda, hA); // cold calls for(int iter = 0; iter < 2; iter++) { latrd_initData(handle, n, dA, lda, hA); CHECK_ROCBLAS_ERROR(rocsolver_latrd(handle, uplo, n, k, dA.data(), lda, dE.data(), dTau.data(), dW.data(), ldw)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { latrd_initData(handle, n, dA, lda, hA); start = get_time_us_sync(stream); rocsolver_latrd(handle, uplo, n, k, dA.data(), lda, dE.data(), dTau.data(), dW.data(), ldw); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } 
template void testing_latrd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int k = argus.get("k", n); rocblas_int lda = argus.get("lda", n); rocblas_int ldw = argus.get("ldw", n); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, (T*)nullptr, lda, (S*)nullptr, (T*)nullptr, (T*)nullptr, ldw), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = lda * n; size_t size_E = n; size_t size_tau = n; size_t size_W = ldw * k; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? 
size_W : 0; // check invalid sizes bool invalid_size = (n < 0 || k < 0 || k > n || lda < n || ldw < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, (T*)nullptr, lda, (S*)nullptr, (T*)nullptr, (T*)nullptr, ldw), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_latrd(handle, uplo, n, k, (T*)nullptr, lda, (S*)nullptr, (T*)nullptr, (T*)nullptr, ldw)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hARes(size_ARes, 1, size_ARes, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hTau(size_tau, 1, size_tau, 1); host_strided_batch_vector hW(size_W, 1, size_W, 1); host_strided_batch_vector hWRes(size_WRes, 1, size_WRes, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dTau(size_tau, 1, size_tau, 1); device_strided_batch_vector dW(size_W, 1, size_W, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_tau) CHECK_HIP_ERROR(dTau.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); // check quick return if(k == 0 || n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, dA.data(), lda, dE.data(), dTau.data(), dW.data(), ldw), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) latrd_getError(handle, uplo, n, k, dA, lda, dE, dTau, dW, ldw, 
hA, hARes, hE, hTau, hW, hWRes, &max_error); // collect performance data if(argus.timing) latrd_getPerfData(handle, uplo, n, k, dA, lda, dE, dTau, dW, ldw, hA, hE, hTau, hW, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using k*n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, k * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "n", "k", "lda", "ldw"); rocsolver_bench_output(uploC, n, k, lda, ldw); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LATRD(...) extern template void testing_latrd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LATRD, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_lauum.hpp000066400000000000000000000215541436600607200226660ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void lauum_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, T A, const rocblas_int lda) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_lauum(nullptr, uplo, n, A, lda), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, rocblas_fill_full, n, A, lda), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, uplo, n, (T) nullptr, lda), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, uplo, 0, (T) nullptr, lda), rocblas_status_success); } template void testing_lauum_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments lauum_checkBadArgs(handle, uplo, n, dA.data(), lda); } template void lauum_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA) { if(CPU) { rocblas_init(hA, true); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void lauum_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA, Th& hAr, double* max_err) { // initialize data lauum_initData(handle, uplo, n, dA, lda, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_lauum(handle, uplo, n, dA.data(), lda)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_lauum(uplo, n, hA[0], lda); // error is ||hA - hAr|| / ||hA|| // (THIS 
DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius *max_err = norm_error('F', n, n, lda, hA[0], hAr[0]); } template void lauum_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { lauum_initData(handle, uplo, n, dA, lda, hA); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_lauum(uplo, n, hA[0], lda); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } lauum_initData(handle, uplo, n, dA, lda, hA); // cold calls for(int iter = 0; iter < 2; iter++) { lauum_initData(handle, uplo, n, dA, lda, hA); CHECK_ROCBLAS_ERROR(rocsolver_lauum(handle, uplo, n, dA.data(), lda)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { lauum_initData(handle, uplo, n, dA, lda, hA); start = get_time_us_sync(stream); rocsolver_lauum(handle, uplo, n, dA.data(), lda); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_lauum(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int hot_calls = argus.iters; rocblas_fill uplo = char2rocblas_fill(uploC); // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, uplo, n, (T*)nullptr, lda), 
rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(n) * lda; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, uplo, n, (T*)nullptr, lda), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_lauum(handle, uplo, n, (T*)nullptr, lda)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_A, 1, size_A, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, uplo, n, dA.data(), lda), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) lauum_getError(handle, uplo, n, dA, lda, hA, hAr, &max_error); // collect performance data if(argus.timing) lauum_getPerfData(handle, uplo, n, dA, lda, hA, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using machine precision for tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 1); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "n", "lda"); 
rocsolver_bench_output(uploC, n, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LAUUM(...) extern template void testing_lauum<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LAUUM, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_managed_malloc.hpp000066400000000000000000000274561436600607200244750ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" /* * =========================================================================== * testing_managed_malloc is a modified version of testing_labrd that tests * the unified memory model/HMM. checkBadArgs has been removed as the memory * model has no impact on the bad arg check. 
* =========================================================================== */ template void managed_malloc_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, T* dA, T* dARes, const rocblas_int lda) { if(CPU) { rocblas_init(dA, m, n, lda); // scale A to avoid singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j || (m >= n && j == i + 1) || (m < n && i == j + 1)) dA[i + j * lda] += 400; else dA[i + j * lda] -= 4; } } } if(GPU) { // copy A for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { dARes[i + j * lda] = dA[i + j * lda]; } } } } template void managed_malloc_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, T* dA, T* dARes, const rocblas_int lda, S* dD, S* dE, T* dTauq, T* dTaup, T* dX, T* dXRes, const rocblas_int ldx, T* dY, T* dYRes, const rocblas_int ldy, double* max_err) { // input data initialization managed_malloc_initData(handle, m, n, nb, dA, dARes, lda); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_labrd(handle, m, n, nb, dARes, lda, dD, dE, dTauq, dTaup, dXRes, ldx, dYRes, ldy)); hipDeviceSynchronize(); // CPU lapack cpu_labrd(m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy); // error is max(||hA - hARes|| / ||hA||, ||hX - hXRes|| / ||hX||, ||hY - // hYRes|| / ||hY||) (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY // ISSUES. IT MIGHT BE REVISITED IN THE FUTURE) using frobenius norm double err; *max_err = 0; err = norm_error('F', m, n, lda, dA, dARes); *max_err = err > *max_err ? err : *max_err; err = norm_error('F', m - nb, nb, ldx, dX + nb, dXRes + nb); *max_err = err > *max_err ? err : *max_err; err = norm_error('F', n - nb, nb, ldy, dY + nb, dYRes + nb); *max_err = err > *max_err ? 
err : *max_err; } template void managed_malloc_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, T* dA, T* dARes, const rocblas_int lda, S* dD, S* dE, T* dTauq, T* dTaup, T* dX, T* dXRes, const rocblas_int ldx, T* dY, T* dYRes, const rocblas_int ldy, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { managed_malloc_initData(handle, m, n, nb, dA, dARes, lda); // cpu-lapack performance *cpu_time_used = get_time_us_no_sync(); cpu_labrd(m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } managed_malloc_initData(handle, m, n, nb, dA, dARes, lda); // cold calls for(int iter = 0; iter < 2; iter++) { managed_malloc_initData(handle, m, n, nb, dA, dARes, lda); CHECK_ROCBLAS_ERROR(rocsolver_labrd(handle, m, n, nb, dARes, lda, dD, dE, dTauq, dTaup, dXRes, ldx, dYRes, ldy)); hipDeviceSynchronize(); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { managed_malloc_initData(handle, m, n, nb, dA, dARes, lda); start = get_time_us_sync(stream); rocsolver_labrd(handle, m, n, nb, dARes, lda, dD, dE, dTauq, dTaup, dXRes, ldx, dYRes, ldy); hipDeviceSynchronize(); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_managed_malloc(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int nb = argus.get("k", min(m, n)); rocblas_int lda 
= argus.get("lda", m); rocblas_int ldx = argus.get("ldx", m); rocblas_int ldy = argus.get("ldy", n); rocblas_int hot_calls = argus.iters; // check managed memory enablement int deviceID, hmm_enabled; hipGetDevice(&deviceID); hipDeviceGetAttribute(&hmm_enabled, hipDeviceAttributeManagedMemory, deviceID); if(!hmm_enabled) { std::puts("Managed memory not enabled on device. Skipping test..."); std::fflush(stdout); return; } // check non-supported values // N/A // determine sizes size_t size_A = lda * n; size_t size_D = nb; size_t size_E = nb; size_t size_Q = nb; size_t size_P = nb; size_t size_X = ldx * nb; size_t size_Y = ldy * nb; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || nb < 0 || nb > min(m, n) || lda < m || ldx < m || ldy < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, m, n, nb, (T*)nullptr, lda, (S*)nullptr, (S*)nullptr, (T*)nullptr, (T*)nullptr, (T*)nullptr, ldx, (T*)nullptr, ldy), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_labrd(handle, m, n, nb, (T*)nullptr, lda, (S*)nullptr, (S*)nullptr, (T*)nullptr, (T*)nullptr, (T*)nullptr, ldx, (T*)nullptr, ldy)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations S *dD, *dE; T *dA, *dARes, *dTauq, *dTaup, *dX, *dXRes, *dY, *dYRes; hipMallocManaged(&dA, sizeof(T) * size_A); hipMallocManaged(&dARes, sizeof(T) * size_A); hipMallocManaged(&dD, sizeof(S) * size_D); hipMallocManaged(&dE, sizeof(S) * size_E); hipMallocManaged(&dTauq, sizeof(T) * size_Q); hipMallocManaged(&dTaup, 
sizeof(T) * size_P); hipMallocManaged(&dX, sizeof(T) * size_X); hipMallocManaged(&dXRes, sizeof(T) * size_X); hipMallocManaged(&dY, sizeof(T) * size_Y); hipMallocManaged(&dYRes, sizeof(T) * size_Y); // check quick return if(m == 0 || n == 0 || nb == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, m, n, nb, dARes, lda, dD, dE, dTauq, dTaup, dXRes, ldx, dYRes, ldy), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) managed_malloc_getError(handle, m, n, nb, dA, dARes, lda, dD, dE, dTauq, dTaup, dX, dXRes, ldx, dY, dYRes, ldy, &max_error); // collect performance data if(argus.timing) managed_malloc_getPerfData(handle, m, n, nb, dA, dARes, lda, dD, dE, dTauq, dTaup, dX, dXRes, ldx, dY, dYRes, ldy, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // free memory hipFree(dA); hipFree(dARes); hipFree(dD); hipFree(dE); hipFree(dTauq); hipFree(dTaup); hipFree(dX); hipFree(dXRes); hipFree(dY); hipFree(dYRes); // validate results for rocsolver-test // using nb * max(m,n) * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, nb * max(m, n)); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("m", "n", "nb", "lda", "ldx", "ldy"); rocsolver_bench_output(m, n, nb, lda, ldx, ldy); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } 
rocSOLVER-rocm-5.5.1/clients/include/testing_orgbr_ungbr.hpp000066400000000000000000000324341436600607200240520ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void orgbr_ungbr_checkBadArgs(const rocblas_handle handle, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_orgbr_ungbr(nullptr, storev, m, n, k, dA, lda, dIpiv), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_orgbr_ungbr(handle, rocblas_storev(0), m, n, k, dA, lda, dIpiv), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgbr_ungbr(handle, storev, m, n, k, (T) nullptr, lda, dIpiv), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_orgbr_ungbr(handle, storev, m, n, k, dA, lda, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_orgbr_ungbr(handle, rocblas_row_wise, 0, n, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); EXPECT_ROCBLAS_STATUS( rocsolver_orgbr_ungbr(handle, rocblas_column_wise, m, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); } template void testing_orgbr_ungbr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_storev storev = rocblas_column_wise; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); 
CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments orgbr_ungbr_checkBadArgs(handle, storev, m, n, k, dA.data(), lda, dIpiv.data()); } template void orgbr_ungbr_initData(const rocblas_handle handle, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, std::vector& hW, size_t size_W) { if(CPU) { using S = decltype(std::real(T{})); size_t s = max(hIpiv.n(), 2); std::vector E(s - 1); std::vector D(s); std::vector P(s); rocblas_init(hA, true); rocblas_init(hIpiv, true); // scale to avoid singularities // and compute gebrd if(storev == rocblas_column_wise) { for(int i = 0; i < m; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } cpu_gebrd(m, k, hA[0], lda, D.data(), E.data(), hIpiv[0], P.data(), hW.data(), size_W); } else { for(int i = 0; i < k; ++i) { for(int j = 0; j < n; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } cpu_gebrd(k, n, hA[0], lda, D.data(), E.data(), P.data(), hIpiv[0], hW.data(), size_W); } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void orgbr_ungbr_getError(const rocblas_handle handle, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hAr, Th& hIpiv, double* max_err) { size_t size_W = max(max(m, n), k); std::vector hW(size_W); // initialize data orgbr_ungbr_initData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_orgbr_ungbr(handle, storev, m, n, k, dA.data(), lda, dIpiv.data())); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_orgbr_ungbr(storev, m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL 
REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void orgbr_ungbr_getPerfData(const rocblas_handle handle, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = max(max(m, n), k); std::vector hW(size_W); if(!perf) { orgbr_ungbr_initData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_orgbr_ungbr(storev, m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } orgbr_ungbr_initData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { orgbr_ungbr_initData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); CHECK_ROCBLAS_ERROR( rocsolver_orgbr_ungbr(handle, storev, m, n, k, dA.data(), lda, dIpiv.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { orgbr_ungbr_initData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); start = get_time_us_sync(stream); rocsolver_orgbr_ungbr(handle, storev, m, n, k, dA.data(), lda, dIpiv.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_orgbr_ungbr(Arguments& argus) { // get arguments rocblas_local_handle 
handle; char storevC = argus.get("storev"); rocblas_int m, n; if(storevC == 'R') { m = argus.get("m"); n = argus.get("n", m); } else { n = argus.get("n"); m = argus.get("m", n); } rocblas_int k = argus.get("k", min(m, n)); rocblas_int lda = argus.get("lda", m); rocblas_storev storev = char2rocblas_storev(storevC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes // size_P could be zero in test cases that are not quick-return or invalid // cases setting it to one to avoid possible memory access errors in the rest // of the unit test bool row = (storev == rocblas_row_wise); size_t size_A = row ? size_t(lda) * n : size_t(lda) * max(n, k); size_t size_P = row ? max(size_t(min(n, k)), 1) : max(size_t(min(m, k)), 1); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = ((m < 0 || n < 0 || k < 0 || lda < m) || (row && (m > n || m < min(n, k))) || (!row && (n > m || n < min(m, k)))); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_orgbr_ungbr(handle, storev, m, n, k, (T*)nullptr, lda, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY( rocsolver_orgbr_ungbr(handle, storev, m, n, k, (T*)nullptr, lda, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, 
size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_orgbr_ungbr(handle, storev, m, n, k, dA.data(), lda, dIpiv.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) orgbr_ungbr_getError(handle, storev, m, n, k, dA, lda, dIpiv, hA, hAr, hIpiv, &max_error); // collect performance data if(argus.timing) orgbr_ungbr_getPerfData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = row ? n : m; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("storev", "m", "n", "k", "lda"); rocsolver_bench_output(storevC, m, n, k, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORGBR_UNGBR(...) 
\ extern template void testing_orgbr_ungbr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORGBR_UNGBR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_orglx_unglx.hpp000066400000000000000000000265241436600607200241150ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void orglx_unglx_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_orglx_unglx(GLQ, nullptr, m, n, k, dA, lda, dIpiv), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_orglx_unglx(GLQ, handle, m, n, k, (T) nullptr, lda, dIpiv), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_orglx_unglx(GLQ, handle, m, n, k, dA, lda, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_orglx_unglx(GLQ, handle, 0, n, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_orglx_unglx(GLQ, handle, 0, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); } template void testing_orglx_unglx_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments orglx_unglx_checkBadArgs(handle, m, n, k, dA.data(), lda, 
dIpiv.data()); } template void orglx_unglx_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, std::vector& hW, size_t size_W) { if(CPU) { rocblas_init(hA, true); rocblas_init(hIpiv, true); // scale to avoid singularities for(int i = 0; i < m; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute LQ factorization cpu_gelqf(m, n, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void orglx_unglx_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hAr, Th& hIpiv, double* max_err) { size_t size_W = size_t(m); std::vector hW(size_W); // initialize data orglx_unglx_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_orglx_unglx(GLQ, handle, m, n, k, dA.data(), lda, dIpiv.data())); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack GLQ ? cpu_orglq_unglq(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_orgl2_ungl2(m, n, k, hA[0], lda, hIpiv[0], hW.data()); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void orglx_unglx_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = size_t(m); std::vector hW(size_W); if(!perf) { orglx_unglx_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); GLQ ? cpu_orglq_unglq(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_orgl2_ungl2(m, n, k, hA[0], lda, hIpiv[0], hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } orglx_unglx_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { orglx_unglx_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_orglx_unglx(GLQ, handle, m, n, k, dA.data(), lda, dIpiv.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { orglx_unglx_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); start = get_time_us_sync(stream); rocsolver_orglx_unglx(GLQ, handle, m, n, k, dA.data(), lda, dIpiv.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_orglx_unglx(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); 
rocblas_int n = argus.get("n", m); rocblas_int k = argus.get("k", m); rocblas_int lda = argus.get("lda", m); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(m); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || k < 0 || lda < m || n < m || k > m); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_orglx_unglx(GLQ, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_orglx_unglx(GLQ, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_orglx_unglx(GLQ, handle, m, n, k, dA.data(), lda, dIpiv.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) orglx_unglx_getError(handle, m, n, k, dA, lda, dIpiv, hA, hAr, hIpiv, &max_error); // collect performance data 
if(argus.timing) orglx_unglx_getPerfData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("m", "n", "k", "lda"); rocsolver_bench_output(m, n, k, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORGLX_UNGLX(...) \ extern template void testing_orglx_unglx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORGLX_UNGLX, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_orgtr_ungtr.hpp000066400000000000000000000262171436600607200241200ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void orgtr_ungtr_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, T dIpiv) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(nullptr, uplo, n, dA, lda, dIpiv), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, rocblas_fill(0), n, dA, lda, dIpiv), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, uplo, n, (T) nullptr, lda, dIpiv), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, uplo, n, dA, lda, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, uplo, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); } template void testing_orgtr_ungtr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments orgtr_ungtr_checkBadArgs(handle, uplo, n, dA.data(), lda, dIpiv.data()); } template void orgtr_ungtr_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, std::vector& hW, size_t size_W) { if(CPU) { using S = decltype(std::real(T{})); size_t s = max(hIpiv.n(), 2); std::vector E(s - 1); std::vector D(s); rocblas_init(hA, true); rocblas_init(hIpiv, true); // scale to avoid singularities for(int i = 0; i < n; ++i) { for(int j = 
0; j < n; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute sytrd/hetrd cpu_sytrd_hetrd(uplo, n, hA[0], lda, D.data(), E.data(), hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void orgtr_ungtr_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hAr, Th& hIpiv, double* max_err) { size_t size_W = n * 32; std::vector hW(size_W); // initialize data orgtr_ungtr_initData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_orgtr_ungtr(handle, uplo, n, dA.data(), lda, dIpiv.data())); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_orgtr_ungtr(uplo, n, hA[0], lda, hIpiv[0], hW.data(), size_W); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', n, n, lda, hA[0], hAr[0]); } template void orgtr_ungtr_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = n * 32; std::vector hW(size_W); if(!perf) { orgtr_ungtr_initData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_orgtr_ungtr(uplo, n, hA[0], lda, hIpiv[0], hW.data(), size_W); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } orgtr_ungtr_initData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { orgtr_ungtr_initData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_orgtr_ungtr(handle, uplo, n, dA.data(), lda, dIpiv.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { orgtr_ungtr_initData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, hW, size_W); start = get_time_us_sync(stream); rocsolver_orgtr_ungtr(handle, uplo, n, dA.data(), lda, dIpiv.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_orgtr_ungtr(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_fill uplo = 
char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes // size_P could be zero in test cases that are not quick-return or invalid // cases setting it to one to avoid possible memory access errors in the rest // of the unit test size_t size_A = size_t(lda) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, uplo, n, (T*)nullptr, lda, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_orgtr_ungtr(handle, uplo, n, (T*)nullptr, lda, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, uplo, n, dA.data(), lda, dIpiv.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) orgtr_ungtr_getError(handle, uplo, n, dA, lda, dIpiv, hA, hAr, hIpiv, &max_error); 
// collect performance data if(argus.timing) orgtr_ungtr_getPerfData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "n", "lda"); rocsolver_bench_output(uploC, n, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORGTR_UNGTR(...) \ extern template void testing_orgtr_ungtr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORGTR_UNGTR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_orgxl_ungxl.hpp000066400000000000000000000265341436600607200241160ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void orgxl_ungxl_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_orgxl_ungxl(GQL, nullptr, m, n, k, dA, lda, dIpiv), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgxl_ungxl(GQL, handle, m, n, k, (T) nullptr, lda, dIpiv), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_orgxl_ungxl(GQL, handle, m, n, k, dA, lda, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgxl_ungxl(GQL, handle, m, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_orgxl_ungxl(GQL, handle, 0, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); } template void testing_orgxl_ungxl_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments orgxl_ungxl_checkBadArgs(handle, m, n, k, dA.data(), lda, dIpiv.data()); } template void orgxl_ungxl_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, std::vector& hW, size_t size_W) { if(CPU) { rocblas_init(hA, true); rocblas_init(hIpiv, true); // scale to avoid singularities for(int i = 0; i < m; ++i) { for(int j = 0; j < k; ++j) { if(m - i == n - j) hA[0][i 
+ j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute QL factorization cpu_geqlf(m, n, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void orgxl_ungxl_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hAr, Th& hIpiv, double* max_err) { size_t size_W = size_t(n); std::vector hW(size_W); // initialize data orgxl_ungxl_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_orgxl_ungxl(GQL, handle, m, n, k, dA.data(), lda, dIpiv.data())); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack GQL ? cpu_orgql_ungql(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_org2l_ung2l(m, n, k, hA[0], lda, hIpiv[0], hW.data()); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void orgxl_ungxl_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = size_t(n); std::vector hW(size_W); if(!perf) { orgxl_ungxl_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); GQL ? 
cpu_orgql_ungql(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_org2l_ung2l(m, n, k, hA[0], lda, hIpiv[0], hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } orgxl_ungxl_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { orgxl_ungxl_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_orgxl_ungxl(GQL, handle, m, n, k, dA.data(), lda, dIpiv.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { orgxl_ungxl_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); start = get_time_us_sync(stream); rocsolver_orgxl_ungxl(GQL, handle, m, n, k, dA.data(), lda, dIpiv.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_orgxl_ungxl(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int m = argus.get("m", n); rocblas_int k = argus.get("k", n); rocblas_int lda = argus.get("lda", m); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(m); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || k < 0 || lda < m || m < n || k > n); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_orgxl_ungxl(GQL, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_orgxl_ungxl(GQL, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_orgxl_ungxl(GQL, handle, m, n, k, dA.data(), lda, dIpiv.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) orgxl_ungxl_getError(handle, m, n, k, dA, lda, dIpiv, hA, hAr, hIpiv, &max_error); // collect performance data if(argus.timing) orgxl_ungxl_getPerfData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { 
if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("m", "n", "k", "lda"); rocsolver_bench_output(m, n, k, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORGXL_UNGXL(...) \ extern template void testing_orgxl_ungxl<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORGXL_UNGXL, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_orgxr_ungxr.hpp000066400000000000000000000265241436600607200241310ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void orgxr_ungxr_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_orgxr_ungxr(GQR, nullptr, m, n, k, dA, lda, dIpiv), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgxr_ungxr(GQR, handle, m, n, k, (T) nullptr, lda, dIpiv), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_orgxr_ungxr(GQR, handle, m, n, k, dA, lda, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgxr_ungxr(GQR, handle, 0, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_orgxr_ungxr(GQR, handle, m, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); } template void testing_orgxr_ungxr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments orgxr_ungxr_checkBadArgs(handle, m, n, k, dA.data(), lda, dIpiv.data()); } template void orgxr_ungxr_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, std::vector& hW, size_t size_W) { if(CPU) { rocblas_init(hA, true); rocblas_init(hIpiv, true); // scale to avoid singularities for(int i = 0; i < m; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hA[0][i + j * 
lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute QR factorization cpu_geqrf(m, n, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void orgxr_ungxr_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hAr, Th& hIpiv, double* max_err) { size_t size_W = size_t(n); std::vector hW(size_W); // initialize data orgxr_ungxr_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_orgxr_ungxr(GQR, handle, m, n, k, dA.data(), lda, dIpiv.data())); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack GQR ? cpu_orgqr_ungqr(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_org2r_ung2r(m, n, k, hA[0], lda, hIpiv[0], hW.data()); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void orgxr_ungxr_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = size_t(n); std::vector hW(size_W); if(!perf) { orgxr_ungxr_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); GQR ? 
cpu_orgqr_ungqr(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_org2r_ung2r(m, n, k, hA[0], lda, hIpiv[0], hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } orgxr_ungxr_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { orgxr_ungxr_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_orgxr_ungxr(GQR, handle, m, n, k, dA.data(), lda, dIpiv.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { orgxr_ungxr_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); start = get_time_us_sync(stream); rocsolver_orgxr_ungxr(GQR, handle, m, n, k, dA.data(), lda, dIpiv.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_orgxr_ungxr(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int m = argus.get("m", n); rocblas_int k = argus.get("k", n); rocblas_int lda = argus.get("lda", m); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || k < 0 || lda < m || n > m || k > n); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_orgxr_ungxr(GQR, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_orgxr_ungxr(GQR, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_orgxr_ungxr(GQR, handle, m, n, k, dA.data(), lda, dIpiv.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) orgxr_ungxr_getError(handle, m, n, k, dA, lda, dIpiv, hA, hAr, hIpiv, &max_error); // collect performance data if(argus.timing) orgxr_ungxr_getPerfData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using m * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, m); // output results for rocsolver-bench if(argus.timing) { 
if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("m", "n", "k", "lda"); rocsolver_bench_output(m, n, k, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORGXR_UNGXR(...) \ extern template void testing_orgxr_ungxr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORGXR_UNGXR, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_ormbr_unmbr.hpp000066400000000000000000000442011436600607200240610ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void ormbr_unmbr_checkBadArgs(const rocblas_handle handle, const rocblas_storev storev, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv, T dC, const rocblas_int ldc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_ormbr_unmbr(nullptr, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, rocblas_side(0), trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, rocblas_storev(0), side, trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, rocblas_operation(0), m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); if(COMPLEX) EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, rocblas_operation_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, rocblas_operation_conjugate_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, (T) nullptr, lda, dIpiv, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, dA, lda, (T) nullptr, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, (T) nullptr, ldc), 
rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, rocblas_side_left, trans, 0, n, k, (T) nullptr, lda, (T) nullptr, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, rocblas_side_right, trans, m, 0, k, (T) nullptr, lda, (T) nullptr, (T) nullptr, ldc), rocblas_status_success); /* with k == 0 only dA and dIpiv are null; dC is still passed */ EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, rocblas_side_left, trans, m, n, 0, (T) nullptr, lda, (T) nullptr, dC, ldc), rocblas_status_success); } /* Entry point for the bad-argument test: sets every dimension and leading dimension to 1, constructs dA, dIpiv and dC with (1, 1, 1, 1), checks each with memcheck(), and hands them to ormbr_unmbr_checkBadArgs. */ template > void testing_ormbr_unmbr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_storev storev = rocblas_column_wise; rocblas_side side = rocblas_side_left; rocblas_operation trans = rocblas_operation_none; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); // check bad arguments ormbr_unmbr_checkBadArgs(handle, storev, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); } /* Prepares the inputs for one ormbr/unmbr run. When CPU is set: fills hA, hIpiv and hC with rocblas_init, adds 400 to hA where i == j and subtracts 4 elsewhere, then runs cpu_gebrd on hA using hW (size_W entries) as workspace; D, E and P are local scratch vectors. When GPU is set: copies hA, hIpiv and hC into dA, dIpiv and dC. handle, trans, k and ldc are not read in the visible body. */ template void ormbr_unmbr_initData(const rocblas_handle handle, const rocblas_storev storev, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, std::vector& hW, size_t size_W) { if(CPU) { using S = decltype(std::real(T{})); size_t s = max(hIpiv.n(), 2); /* NOTE(review): s is forced to at least 2 even if hIpiv.n() is 1 -- confirm hA has room for the s columns (or rows) written by the loops that follow */ std::vector E(s - 1); std::vector D(s); std::vector P(s); rocblas_int nq = (side == rocblas_side_left) ? 
m : n; rocblas_init(hA, true); rocblas_init(hIpiv, true); rocblas_init(hC, true); // scale to avoid singularities // and compute gebrd if(storev == rocblas_column_wise) { /* column-wise: hA is treated as nq x s */ for(int i = 0; i < nq; ++i) { for(int j = 0; j < s; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } /* hIpiv and the scratch vector P swap positions between the two cpu_gebrd calls */ cpu_gebrd(nq, s, hA[0], lda, D.data(), E.data(), hIpiv[0], P.data(), hW.data(), size_W); } else { /* row-wise: hA is treated as s x nq */ for(int i = 0; i < s; ++i) { for(int j = 0; j < nq; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } cpu_gebrd(s, nq, hA[0], lda, D.data(), E.data(), P.data(), hIpiv[0], hW.data(), size_W); } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } /* Accuracy check: initialises the data with ormbr_unmbr_initData, runs rocsolver_ormbr_unmbr on the device buffers and copies dC back into hCr, runs cpu_ormbr_unmbr on the host buffers (overwriting hC), and stores norm_error('F', m, n, ldc, hC, hCr) in *max_err. The host workspace hW has max(max(m, n), k) entries. */ template void ormbr_unmbr_getError(const rocblas_handle handle, const rocblas_storev storev, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, Th& hCr, double* max_err) { size_t size_W = max(max(m, n), k); std::vector hW(size_W); // initialize data ormbr_unmbr_initData(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); CHECK_HIP_ERROR(hCr.transfer_from(dC)); // CPU lapack cpu_ormbr_unmbr(storev, side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W); // error is ||hC - hCr|| / ||hC|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, ldc, hC[0], hCr[0]); } /* Timing harness. Unless perf is set, times one cpu_ormbr_unmbr call with get_time_us_no_sync and stores the elapsed time in *cpu_time_used. Then performs 2 untimed cold calls of rocsolver_ormbr_unmbr, optionally enables rocsolver profile logging (profile > 0, with kernel logging when profile_kernels is set), and times hot_calls further calls with get_time_us_sync on the handle's stream. Each elapsed time is added to *gpu_time_used, which is finally divided by hot_calls, so the caller must pass it in zeroed. ormbr_unmbr_initData is called again before every host or device run. */ template void ormbr_unmbr_getPerfData(const rocblas_handle handle, const rocblas_storev storev, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = max(max(m, n), k); std::vector hW(size_W); if(!perf) { ormbr_unmbr_initData(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_ormbr_unmbr(storev, side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } ormbr_unmbr_initData(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { ormbr_unmbr_initData(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } /* timed loop: data initialisation happens before the clock is started; the status returned by the timed call is not checked */ for(int iter = 0; iter < hot_calls; iter++) { ormbr_unmbr_initData(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); start = get_time_us_sync(stream); rocsolver_ormbr_unmbr(handle, 
storev, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); *gpu_time_used += get_time_us_sync(stream) - start; } /* average over the timed iterations */ *gpu_time_used /= hot_calls; } /* Test and benchmark driver for rocsolver_ormbr_unmbr. Reads storev, side, trans, m, n, k, lda and ldc from argus (the dimension on the side being applied is read first and is the default for the other; k defaults to min(m, n)). It then, in order: expects invalid_value for unsupported side/trans combinations; expects invalid_size for inconsistent dimensions; runs the device memory size query when requested or when USE_ROCBLAS_REALLOC_ON_DEMAND is false; allocates host and device buffers; expects success on the quick return (m, n or k equal to 0); runs the accuracy check and/or the timing run; and prints the benchmark output. Ends by calling argus.validate_consumed(). */ template > void testing_ormbr_unmbr(Arguments& argus) { // get arguments rocblas_local_handle handle; char storevC = argus.get("storev"); char sideC = argus.get("side"); char transC = argus.get("trans"); rocblas_int m, n; if(sideC == 'L') { m = argus.get("m"); n = argus.get("n", m); } else { n = argus.get("n"); m = argus.get("m", n); } rocblas_int k = argus.get("k", min(m, n)); rocblas_int nq = (sideC == 'L' ? m : n); rocblas_int lda = argus.get("lda", storevC == 'C' ? nq : min(nq, k)); rocblas_int ldc = argus.get("ldc", m); rocblas_side side = char2rocblas_side(sideC); rocblas_storev storev = char2rocblas_storev(storevC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int hot_calls = argus.iters; // check non-supported values bool invalid_value = (side == rocblas_side_both || (COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_P = size_t(min(nq, k)); size_t size_C = size_t(ldc) * n; bool row = (storev == rocblas_row_wise); size_t size_A = row ? size_t(lda) * nq : size_t(lda) * size_P; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Cr = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = ((m < 0 || n < 0 || k < 0 || ldc < m) || (row && lda < min(nq, k)) || (!row && lda < nq)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCr(size_Cr, 1, size_Cr, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); // check quick return if(n == 0 || m == 0 || k == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) ormbr_unmbr_getError(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hCr, &max_error); // collect performance data if(argus.timing) ormbr_unmbr_getPerfData(handle, 
storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance (s is m for side left, n otherwise) rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("storev", "side", "trans", "m", "n", "k", "lda", "ldc"); rocsolver_bench_output(storevC, sideC, transC, m, n, k, lda, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { /* perf mode: only the gpu time (and the error when norm_check is set) is printed, without headers */ if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORMBR_UNMBR(...) \ extern template void testing_ormbr_unmbr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORMBR_UNMBR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_ormlx_unmlx.hpp000066400000000000000000000413261436600607200241260ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" /* Bad-argument checks for rocsolver_ormlx_unmlx; MLQ is forwarded as the first argument of every call. Expected statuses: a null handle gives invalid_handle; zero-valued side/operation enums, and the transpose mode selected by COMPLEX (transpose when COMPLEX is set, conjugate_transpose otherwise), give invalid_value; a null dA, dIpiv or dC gives invalid_pointer; the quick-return calls with m, n or k equal to 0 give success with the null pointers passed there. */ template void ormlx_unmlx_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv, T dC, const rocblas_int ldc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_ormlx_unmlx(MLQ, nullptr, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, rocblas_side(0), trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, rocblas_operation(0), m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); /* only one of the two transpose modes is rejected, chosen by COMPLEX */ if(COMPLEX) EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, rocblas_operation_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, rocblas_operation_conjugate_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, (T) nullptr, lda, dIpiv, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA, lda, (T) nullptr, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_invalid_pointer); // quick return with invalid pointers /* m == 0 and n == 0 cases pass a null dC only; the k == 0 case passes null dA and dIpiv */ EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, rocblas_side_right, trans, 0, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); 
EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, rocblas_side_left, trans, m, 0, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, rocblas_side_left, trans, m, n, 0, (T) nullptr, lda, (T) nullptr, dC, ldc), rocblas_status_success); } /* Entry point for the bad-argument test: sets every dimension and leading dimension to 1, constructs dA, dIpiv and dC with (1, 1, 1, 1), checks each with memcheck(), and hands them to ormlx_unmlx_checkBadArgs. */ template > void testing_ormlx_unmlx_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_operation trans = rocblas_operation_none; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); // check bad arguments ormlx_unmlx_checkBadArgs(handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); } /* Prepares the inputs for one ormlx/unmlx run. When CPU is set: fills hA, hIpiv and hC with rocblas_init, adds 400 to hA where i == j and subtracts 4 elsewhere over i < k, j < nq, then calls cpu_gelqf(k, nq, ...) on hA with hIpiv and the workspace hW (size_W entries). When GPU is set: copies hA, hIpiv and hC into dA, dIpiv and dC. handle, trans and ldc are not read in the visible body. */ template void ormlx_unmlx_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, std::vector& hW, size_t size_W) { if(CPU) { rocblas_int nq = (side == rocblas_side_left) ? 
m : n; rocblas_init(hA, true); rocblas_init(hIpiv, true); rocblas_init(hC, true); // scale to avoid singularities /* hA is treated as k x nq with leading dimension lda */ for(int i = 0; i < k; ++i) { for(int j = 0; j < nq; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute LQ factorization cpu_gelqf(k, nq, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } /* Accuracy check: initialises the data with ormlx_unmlx_initData, runs rocsolver_ormlx_unmlx on the device buffers and copies dC back into hCr, runs the host reference on hC (cpu_ormlq_unmlq when MLQ is set, cpu_orml2_unml2 otherwise), and stores norm_error('F', m, n, ldc, hC, hCr) in *max_err. The host workspace hW has max(max(m, n), k) entries. */ template void ormlx_unmlx_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, Th& hCr, double* max_err) { size_t size_W = max(max(m, n), k); std::vector hW(size_W); // initialize data ormlx_unmlx_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); CHECK_HIP_ERROR(hCr.transfer_from(dC)); // CPU lapack /* the cpu_orml2_unml2 branch is not given size_W */ MLQ ? cpu_ormlq_unmlq(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orml2_unml2(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); // error is ||hC - hCr|| / ||hC|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, ldc, hC[0], hCr[0]); } /* Timing harness. Unless perf is set, times one host reference call (cpu_ormlq_unmlq when MLQ is set, cpu_orml2_unml2 otherwise) with get_time_us_no_sync and stores the elapsed time in *cpu_time_used. Then performs 2 untimed cold calls of rocsolver_ormlx_unmlx, optionally enables rocsolver profile logging (profile > 0, with kernel logging when profile_kernels is set), and times hot_calls further calls with get_time_us_sync on the handle's stream. Each elapsed time is added to *gpu_time_used, which is finally divided by hot_calls, so the caller must pass it in zeroed. ormlx_unmlx_initData is called again before every host or device run. */ template void ormlx_unmlx_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = max(max(m, n), k); std::vector hW(size_W); if(!perf) { ormlx_unmlx_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); MLQ ? cpu_ormlq_unmlq(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orml2_unml2(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } ormlx_unmlx_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { ormlx_unmlx_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } /* timed loop: data initialisation happens before the clock is started; the status returned by the timed call is not checked */ for(int iter = 0; iter < hot_calls; iter++) { ormlx_unmlx_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); start = get_time_us_sync(stream); 
rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); *gpu_time_used += get_time_us_sync(stream) - start; } /* average over the timed iterations */ *gpu_time_used /= hot_calls; } /* Test and benchmark driver for rocsolver_ormlx_unmlx. Reads side, trans, m, n, k, lda and ldc from argus (the dimension on the side being applied is read first and is the default for the other dimension and for k; lda defaults to k, ldc to m). It then, in order: expects invalid_value for unsupported side/trans combinations; expects invalid_size for inconsistent dimensions, including k larger than the applied-side dimension; runs the device memory size query when requested or when USE_ROCBLAS_REALLOC_ON_DEMAND is false; allocates host and device buffers; expects success on the quick return (m, n or k equal to 0); runs the accuracy check and/or the timing run; and prints the benchmark output. Ends by calling argus.validate_consumed(). */ template > void testing_ormlx_unmlx(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char transC = argus.get("trans"); rocblas_int m, n, k; if(sideC == 'L') { m = argus.get("m"); n = argus.get("n", m); k = argus.get("k", m); } else { n = argus.get("n"); m = argus.get("m", n); k = argus.get("k", n); } rocblas_int lda = argus.get("lda", k); rocblas_int ldc = argus.get("ldc", m); rocblas_side side = char2rocblas_side(sideC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int hot_calls = argus.iters; // check non-supported values bool invalid_value = (side == rocblas_side_both || (COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_A = left ? size_t(lda) * m : size_t(lda) * n; size_t size_P = size_t(k); size_t size_C = size_t(ldc) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Cr = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = ((m < 0 || n < 0 || k < 0 || ldc < m || lda < k) || (left && k > m) || (!left && k > n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCr(size_Cr, 1, size_Cr, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); // check quick return if(n == 0 || m == 0 || k == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) ormlx_unmlx_getError(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hCr, &max_error); // collect performance data if(argus.timing) ormlx_unmlx_getPerfData(handle, side, trans, m, n, k, 
dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance (s is m for side left, n otherwise) rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "trans", "m", "n", "k", "lda", "ldc"); rocsolver_bench_output(sideC, transC, m, n, k, lda, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { /* perf mode: only the gpu time (and the error when norm_check is set) is printed, without headers */ if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORMLX_UNMLX(...) \ extern template void testing_ormlx_unmlx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORMLX_UNMLX, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_ormtr_unmtr.hpp000066400000000000000000000407201436600607200241270ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" /* Bad-argument checks for rocsolver_ormtr_unmtr. Expected statuses: a null handle gives invalid_handle; zero-valued side/fill/operation enums, and the transpose mode selected by COMPLEX (transpose when COMPLEX is set, conjugate_transpose otherwise), give invalid_value; a null dA, dIpiv or dC gives invalid_pointer; the quick-return calls with m or n equal to 0 give success with all three data pointers null. */ template void ormtr_unmtr_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_fill uplo, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, T dIpiv, T dC, const rocblas_int ldc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(nullptr, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(handle, rocblas_side(0), uplo, trans, m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(handle, side, rocblas_fill(0), trans, m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, rocblas_operation(0), m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); /* only one of the two transpose modes is rejected, chosen by COMPLEX */ if(COMPLEX) EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, rocblas_operation_transpose, m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, rocblas_operation_conjugate_transpose, m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, (T) nullptr, lda, dIpiv, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, dA, lda, (T) nullptr, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_invalid_pointer); // quick return with invalid pointers 
EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, rocblas_side_left, uplo, trans, 0, n, (T) nullptr, lda, (T) nullptr, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, rocblas_side_right, uplo, trans, m, 0, (T) nullptr, lda, (T) nullptr, (T) nullptr, ldc), rocblas_status_success); } /* Entry point for the bad-argument test: sets m, n, lda and ldc to 1 with side left, upper fill and no transpose, constructs dA, dIpiv and dC with (1, 1, 1, 1), checks each with memcheck(), and hands them to ormtr_unmtr_checkBadArgs. */ template > void testing_ormtr_unmtr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_fill uplo = rocblas_fill_upper; rocblas_operation trans = rocblas_operation_none; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); // check bad arguments ormtr_unmtr_checkBadArgs(handle, side, uplo, trans, m, n, dA.data(), lda, dIpiv.data(), dC.data(), ldc); } /* Prepares the inputs for one ormtr/unmtr run. When CPU is set: fills hA, hIpiv and hC with rocblas_init, adds 400 to hA where i == j and subtracts 4 elsewhere over an nq x nq region, then calls cpu_sytrd_hetrd on hA with local scratch vectors D and E, hIpiv, and the workspace hW (size_W entries). When GPU is set: copies hA, hIpiv and hC into dA, dIpiv and dC. handle, trans and ldc are not read in the visible body. */ template void ormtr_unmtr_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_fill uplo, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, std::vector& hW, size_t size_W) { if(CPU) { using S = decltype(std::real(T{})); rocblas_int nq = (side == rocblas_side_left) ? 
m : n; /* NOTE(review): E is sized nq - 1, so this presumably relies on nq >= 1 -- confirm callers never reach here with nq == 0 */ std::vector E(nq - 1); std::vector D(nq); rocblas_init(hA, true); rocblas_init(hIpiv, true); rocblas_init(hC, true); // scale to avoid singularities for(int i = 0; i < nq; ++i) { for(int j = 0; j < nq; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute sytrd/hetrd cpu_sytrd_hetrd(uplo, nq, hA[0], lda, D.data(), E.data(), hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } /* Accuracy check: initialises the data with ormtr_unmtr_initData, runs rocsolver_ormtr_unmtr on the device buffers and copies dC back into hCr, runs cpu_ormtr_unmtr on the host buffers (overwriting hC), and stores norm_error('F', m, n, ldc, hC, hCr) in *max_err. The host workspace hW has 32 entries per unit of the applied-side dimension (m for side left, n otherwise). */ template void ormtr_unmtr_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_fill uplo, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, Th& hCr, double* max_err) { size_t size_W = (side == rocblas_side_left ? m : n) * 32; std::vector hW(size_W); // initialize data ormtr_unmtr_initData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); CHECK_HIP_ERROR(hCr.transfer_from(dC)); // CPU lapack cpu_ormtr_unmtr(side, uplo, trans, m, n, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W); // error is ||hC - hCr|| / ||hC|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, ldc, hC[0], hCr[0]); } template void ormtr_unmtr_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_fill uplo, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = (side == rocblas_side_left ? m : n) * 32; std::vector hW(size_W); if(!perf) { ormtr_unmtr_initData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_ormtr_unmtr(side, uplo, trans, m, n, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } ormtr_unmtr_initData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { ormtr_unmtr_initData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { ormtr_unmtr_initData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); start = get_time_us_sync(stream); rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, 
dA.data(), lda, dIpiv.data(), dC.data(), ldc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_ormtr_unmtr(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char uploC = argus.get("uplo"); char transC = argus.get("trans"); rocblas_int m, n; if(sideC == 'L') { m = argus.get("m"); n = argus.get("n", m); } else { n = argus.get("n"); m = argus.get("m", n); } rocblas_int nq = (sideC == 'L' ? m : n); rocblas_int lda = argus.get("lda", nq); rocblas_int ldc = argus.get("ldc", m); rocblas_side side = char2rocblas_side(sideC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int hot_calls = argus.iters; // check non-supported values bool invalid_value = (side == rocblas_side_both || (COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_P = size_t(nq); size_t size_C = size_t(ldc) * n; size_t size_A = size_t(lda) * nq; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Cr = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || ldc < m || lda < nq); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCr(size_Cr, 1, size_Cr, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, dA.data(), lda, dIpiv.data(), dC.data(), ldc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) ormtr_unmtr_getError(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hCr, &max_error); // collect performance data if(argus.timing) ormtr_unmtr_getPerfData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, &gpu_time_used, 
&cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "uplo", "trans", "m", "n", "lda", "ldc"); rocsolver_bench_output(sideC, uploC, transC, m, n, lda, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORMTR_UNMTR(...) \ extern template void testing_ormtr_unmtr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORMTR_UNMTR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_ormxl_unmxl.hpp000066400000000000000000000414111436600607200241210ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void ormxl_unmxl_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv, T dC, const rocblas_int ldc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_ormxl_unmxl(MQL, nullptr, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, rocblas_side(0), trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, rocblas_operation(0), m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); if(COMPLEX) EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, rocblas_operation_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, rocblas_operation_conjugate_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, (T) nullptr, lda, dIpiv, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA, lda, (T) nullptr, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, rocblas_side_right, trans, 0, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); 
EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, rocblas_side_left, trans, m, 0, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, rocblas_side_left, trans, m, n, 0, (T) nullptr, lda, (T) nullptr, dC, ldc), rocblas_status_success); } template > void testing_ormxl_unmxl_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_operation trans = rocblas_operation_none; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); // check bad arguments ormxl_unmxl_checkBadArgs(handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); } template void ormxl_unmxl_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, std::vector& hW, size_t size_W) { if(CPU) { rocblas_int nq = (side == rocblas_side_left) ? 
m : n; rocblas_init(hA, true); rocblas_init(hIpiv, true); rocblas_init(hC, true); // scale to avoid singularities for(int i = 0; i < nq; ++i) { for(int j = 0; j < k; ++j) { if(m - i == n - j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute QL factorization cpu_geqlf(nq, k, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void ormxl_unmxl_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, Th& hCr, double* max_err) { size_t size_W = max(max(m, n), k); std::vector hW(size_W); // initialize data ormxl_unmxl_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); CHECK_HIP_ERROR(hCr.transfer_from(dC)); // CPU lapack MQL ? cpu_ormql_unmql(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orm2l_unm2l(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); // error is ||hC - hCr|| / ||hC|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, ldc, hC[0], hCr[0]); } template void ormxl_unmxl_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = max(max(m, n), k); std::vector hW(size_W); if(!perf) { ormxl_unmxl_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); MQL ? cpu_ormql_unmql(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orm2l_unm2l(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } ormxl_unmxl_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { ormxl_unmxl_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { ormxl_unmxl_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); start = get_time_us_sync(stream); 
rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_ormxl_unmxl(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char transC = argus.get("trans"); rocblas_int m, n, k; if(sideC == 'L') { m = argus.get("m"); n = argus.get("n", m); k = argus.get("k", m); } else { n = argus.get("n"); m = argus.get("m", n); k = argus.get("k", n); } rocblas_int lda = argus.get("lda", sideC == 'L' ? m : n); rocblas_int ldc = argus.get("ldc", m); rocblas_side side = char2rocblas_side(sideC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int hot_calls = argus.iters; // check non-supported values bool invalid_value = (side == rocblas_side_both || (COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_A = size_t(lda) * k; size_t size_P = size_t(k); size_t size_C = size_t(ldc) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Cr = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = ((m < 0 || n < 0 || k < 0 || ldc < m) || (left && lda < m) || (!left && lda < n) || (left && k > m) || (!left && k > n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCr(size_Cr, 1, size_Cr, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); // check quick return if(n == 0 || m == 0 || k == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) ormxl_unmxl_getError(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hCr, &max_error); // collect performance data if(argus.timing) 
ormxl_unmxl_getPerfData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "trans", "m", "n", "k", "lda", "ldc"); rocsolver_bench_output(sideC, transC, m, n, k, lda, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORMXL_UNMXL(...) \ extern template void testing_ormxl_unmxl<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORMXL_UNMXL, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_ormxr_unmxr.hpp000066400000000000000000000413601436600607200241400ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void ormxr_unmxr_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv, T dC, const rocblas_int ldc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_ormxr_unmxr(MQR, nullptr, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, rocblas_side(0), trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, rocblas_operation(0), m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); if(COMPLEX) EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, rocblas_operation_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, rocblas_operation_conjugate_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, (T) nullptr, lda, dIpiv, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA, lda, (T) nullptr, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, rocblas_side_right, trans, 0, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); 
EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, rocblas_side_left, trans, m, 0, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, rocblas_side_left, trans, m, n, 0, (T) nullptr, lda, (T) nullptr, dC, ldc), rocblas_status_success); } template > void testing_ormxr_unmxr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_operation trans = rocblas_operation_none; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); // check bad arguments ormxr_unmxr_checkBadArgs(handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); } template void ormxr_unmxr_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, std::vector& hW, size_t size_W) { if(CPU) { rocblas_int nq = (side == rocblas_side_left) ? 
m : n; rocblas_init(hA, true); rocblas_init(hIpiv, true); rocblas_init(hC, true); // scale to avoid singularities for(int i = 0; i < nq; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute QR factorization cpu_geqrf(nq, k, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void ormxr_unmxr_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, Th& hCr, double* max_err) { size_t size_W = max(max(m, n), k); std::vector hW(size_W); // initialize data ormxr_unmxr_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); CHECK_HIP_ERROR(hCr.transfer_from(dC)); // CPU lapack MQR ? cpu_ormqr_unmqr(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orm2r_unm2r(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); // error is ||hC - hCr|| / ||hC|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, ldc, hC[0], hCr[0]); } template void ormxr_unmxr_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = max(max(m, n), k); std::vector hW(size_W); if(!perf) { ormxr_unmxr_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); MQR ? cpu_ormqr_unmqr(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orm2r_unm2r(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } ormxr_unmxr_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { ormxr_unmxr_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { ormxr_unmxr_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); start = get_time_us_sync(stream); 
rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_ormxr_unmxr(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char transC = argus.get("trans"); rocblas_int m, n, k; if(sideC == 'L') { m = argus.get("m"); n = argus.get("n", m); k = argus.get("k", m); } else { n = argus.get("n"); m = argus.get("m", n); k = argus.get("k", n); } rocblas_int lda = argus.get("lda", sideC == 'L' ? m : n); rocblas_int ldc = argus.get("ldc", m); rocblas_side side = char2rocblas_side(sideC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int hot_calls = argus.iters; // check non-supported values bool invalid_value = (side == rocblas_side_both || (COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_A = size_t(lda) * k; size_t size_P = size_t(k); size_t size_C = size_t(ldc) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Cr = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = ((m < 0 || n < 0 || k < 0 || ldc < m) || (left && (lda < m || k > m)) || (!left && (lda < n || k > n))); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCr(size_Cr, 1, size_Cr, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); // check quick return if(n == 0 || m == 0 || k == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) ormxr_unmxr_getError(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hCr, &max_error); // collect performance data if(argus.timing) ormxr_unmxr_getPerfData(handle, side, 
trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "trans", "m", "n", "k", "lda", "ldc"); rocsolver_bench_output(sideC, transC, m, n, k, lda, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORMXR_UNMXR(...) \ extern template void testing_ormxr_unmxr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORMXR_UNMXR, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_posv.hpp000066400000000000000000000502321436600607200225250ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void posv_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_posv(STRIDED, nullptr, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, rocblas_fill_full, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T) nullptr, lda, stA, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, 0, nrhs, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS( rocsolver_posv(STRIDED, handle, uplo, n, 0, dA, lda, stA, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_posv(STRIDED, handle, 
uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, 0), rocblas_status_success); } template void testing_posv_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_int bc = 1; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments posv_checkBadArgs(handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments posv_checkBadArgs(handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); } } template void posv_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, const bool singular) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); for(rocblas_int b = 0; b < bc; ++b) { // scale to ensure positive definiteness for(rocblas_int i = 0; i < n; i++) hA[b][i + i * lda] = hA[b][i + i * lda] * sconj(hA[b][i + i * lda]) * 400; if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i 
-= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } if(GPU) { // now copy matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void posv_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hBRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { // input data initialization posv_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_posv(uplo, n, nrhs, hA[b], lda, hB[b], ldb, hInfo[b]); } // error is ||hB - hBRes|| / ||hB|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('I', n, nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? 
err : *max_err; } // also check info for non positive definite cases err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) err++; *max_err += err; } template void posv_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { posv_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_posv(uplo, n, nrhs, hA[b], lda, hB[b], ldb, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } posv_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, singular); // cold calls for(int iter = 0; iter < 2; iter++) { posv_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, singular); CHECK_ROCBLAS_ERROR(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { posv_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, singular); start = get_time_us_sync(stream); rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); 
*gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_posv(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int nrhs = argus.get("nrhs", n); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? 
size_B : 0; // check invalid sizes bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); 
if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) posv_getError(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) posv_getPerfData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) posv_getError(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) posv_getPerfData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench 
if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb", "batch_c"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb", "strideA", "strideB", "batch_c"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb, stA, stB, bc); } else { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_POSV(...) extern template void testing_posv<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_POSV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_potf2_potrf.hpp000066400000000000000000000456761436600607200240220ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void potf2_potrf_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, nullptr, uplo, n, dA, lda, stA, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, rocblas_fill_full, n, dA, lda, stA, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA, lda, stA, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T) nullptr, lda, stA, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA, lda, stA, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, 0, (T) nullptr, lda, stA, dinfo, bc), rocblas_status_success); if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA, lda, stA, (U) nullptr, 0), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA, lda, stA, dinfo, 0), rocblas_status_success); } template void testing_potf2_potrf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride 
stA = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments potf2_potrf_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments potf2_potrf_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dinfo.data(), bc); } } template void potf2_potrf_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hInfo, const bool singular) { if(CPU) { rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale to ensure positive definiteness for(rocblas_int i = 0; i < n; i++) hA[b][i + i * lda] = hA[b][i + i * lda] * sconj(hA[b][i + i * lda]) * 400; if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void potf2_potrf_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { // input data initialization potf2_potrf_initData(handle, uplo, n, 
dA, lda, stA, dInfo, bc, hA, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { POTRF ? cpu_potrf(uplo, n, hA[b], lda, hInfo[b]) : cpu_potf2(uplo, n, hA[b], lda, hInfo[b]); } // error is ||hA - hARes|| / ||hA|| (ideally ||LL' - Lres Lres'|| / ||LL'||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; rocblas_int nn; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { nn = hInfoRes[b][0] == 0 ? n : hInfoRes[b][0]; // (TODO: For now, the algorithm is modifying the whole input matrix even when // it is not positive definite. So we only check the principal nn-by-nn submatrix. // Once this is corrected, nn could be always equal to n.) *max_err = (uplo == rocblas_fill_lower) ? norm_error_lowerTr('F', nn, nn, lda, hA[b], hARes[b]) : norm_error_upperTr('F', nn, nn, lda, hA[b], hARes[b]); } // also check info for non positive definite cases err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) err++; *max_err += err; } template void potf2_potrf_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { potf2_potrf_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { POTRF ? 
cpu_potrf(uplo, n, hA[b], lda, hInfo[b]) : cpu_potf2(uplo, n, hA[b], lda, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } potf2_potrf_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { potf2_potrf_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { potf2_potrf_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); start = get_time_us_sync(stream); rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_potf2_potrf(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? 
stA : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); 
host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) potf2_potrf_getError(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) potf2_potrf_getPerfData( handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) potf2_potrf_getError(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) potf2_potrf_getPerfData( handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, 
argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "n", "lda", "batch_c"); rocsolver_bench_output(uploC, n, lda, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "lda", "strideA", "batch_c"); rocsolver_bench_output(uploC, n, lda, stA, bc); } else { rocsolver_bench_output("uplo", "n", "lda"); rocsolver_bench_output(uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_POTF2_POTRF(...) \ extern template void testing_potf2_potrf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_POTF2_POTRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_potri.hpp000066400000000000000000000420011436600607200226660ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void potri_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, nullptr, uplo, n, dA, lda, stA, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS( rocsolver_potri(STRIDED, handle, rocblas_fill_full, n, dA, lda, stA, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, dA, lda, stA, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, (T) nullptr, lda, stA, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, dA, lda, stA, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, 0, (T) nullptr, lda, stA, dinfo, bc), rocblas_status_success); if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, dA, lda, stA, (U) nullptr, 0), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, dA, lda, stA, dinfo, 0), rocblas_status_success); } template void testing_potri_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); 
device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments potri_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments potri_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dinfo.data(), bc); } } template void potri_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hInfo, const bool singular) { if(CPU) { rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale to ensure positive definiteness for(rocblas_int i = 0; i < n; i++) hA[b][i + i * lda] = hA[b][i + i * lda] * sconj(hA[b][i + i * lda]) * 400; // do the Cholesky factorization of matrix A w/ the reference LAPACK routine cpu_potrf(uplo, n, hA[b], lda, hInfo[b]); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero elemtent in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void potri_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { // input data initialization potri_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); // execute 
computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_potri(STRIDED, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_potri(uplo, n, hA[b], lda, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } template void potri_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { potri_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_potri(uplo, n, hA[b], lda, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } potri_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { potri_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); CHECK_ROCBLAS_ERROR( rocsolver_potri(STRIDED, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) 
rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { potri_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); start = get_time_us_sync(stream); rocsolver_potri(STRIDED, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_potri(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_potri(STRIDED, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_potri(STRIDED, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_potri(STRIDED, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) potri_getError(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) 
potri_getPerfData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_potri(STRIDED, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) potri_getError(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) potri_getPerfData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "n", "lda", "batch_c"); rocsolver_bench_output(uploC, n, lda, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "lda", "strideA", "batch_c"); rocsolver_bench_output(uploC, n, lda, stA, bc); } else { rocsolver_bench_output("uplo", "n", "lda"); rocsolver_bench_output(uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); 
rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_POTRI(...) extern template void testing_potri<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_POTRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_potrs.hpp000066400000000000000000000423511436600607200227100ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void potrs_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, nullptr, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, rocblas_fill_full, n, nrhs, dA, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, 
(T) nullptr, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, (T) nullptr, ldb, stB, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, 0, nrhs, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, handle, uplo, n, 0, dA, lda, stA, (T) nullptr, ldb, stB, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, 0), rocblas_status_success); } template void testing_potrs_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_int bc = 1; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); // check bad arguments potrs_checkBadArgs(handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); // check bad arguments potrs_checkBadArgs(handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc); } } template void potrs_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); int info; for(rocblas_int b = 0; b < bc; ++b) { // scale to ensure 
positive definiteness for(rocblas_int i = 0; i < n; i++) hA[b][i + i * lda] = hA[b][i + i * lda] * sconj(hA[b][i + i * lda]) * 400; // do the Cholesky factorization of matrix A w/ the reference LAPACK routine cpu_potrf(uplo, n, hA[b], lda, &info); } } if(GPU) { // now copy matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void potrs_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, Th& hBRes, double* max_err) { // input data initialization potrs_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_potrs(uplo, n, nrhs, hA[b], lda, hB[b], ldb); } // error is ||hB - hBRes|| / ||hB|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('I', n, nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? 
err : *max_err; } } template void potrs_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { potrs_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_potrs(uplo, n, nrhs, hA[b], lda, hB[b], ldb); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } potrs_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB); // cold calls for(int iter = 0; iter < 2; iter++) { potrs_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB); CHECK_ROCBLAS_ERROR(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { potrs_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB); start = get_time_us_sync(stream); rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_potrs(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int nrhs = 
argus.get("nrhs", n); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? 
size_B : 0; // check invalid sizes bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc)); else CHECK_ALLOC_QUERY(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) potrs_getError(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, hBRes, &max_error); // collect performance data if(argus.timing) 
potrs_getPerfData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) potrs_getError(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, hBRes, &max_error); // collect performance data if(argus.timing) potrs_getPerfData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb", "batch_c"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb", "strideA", "strideB", "batch_c"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb, stA, stB, bc); } else { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", 
"gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_POTRS(...) extern template void testing_potrs<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_POTRS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_stebz.hpp000066400000000000000000000516201436600607200226670ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void stebz_checkBadArgs(const rocblas_handle handle, const rocblas_erange erange, const rocblas_eorder eorder, const rocblas_int n, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, const T abstol, U dD, U dE, rocblas_int* dnev, rocblas_int* dnsplit, U dW, rocblas_int* dIblock, rocblas_int* dIsplit, rocblas_int* dinfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_stebz(nullptr, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, rocblas_erange(0), eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, rocblas_eorder(0), n, vl, vu, il, iu, 
abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, (U) nullptr, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, (U) nullptr, dnev, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, (rocblas_int*)nullptr, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, (rocblas_int*)nullptr, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, (U) nullptr, dIblock, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, (rocblas_int*)nullptr, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, (rocblas_int*)nullptr, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, (rocblas_int*)nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, 0, vl, vu, il, iu, abstol, (U) nullptr, (U) nullptr, dnev, dnsplit, (U) nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, dinfo), rocblas_status_success); } template void testing_stebz_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 2; rocblas_erange erange = 
rocblas_erange_all; rocblas_eorder eorder = rocblas_eorder_entire; T vl = 0; T vu = 0; rocblas_int il = 0; rocblas_int iu = 0; T abstol = 0; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dnev(1, 1, 1, 1); device_strided_batch_vector dnsplit(1, 1, 1, 1); device_strided_batch_vector dIblock(1, 1, 1, 1); device_strided_batch_vector dIsplit(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dnev.memcheck()); CHECK_HIP_ERROR(dnsplit.memcheck()); CHECK_HIP_ERROR(dIblock.memcheck()); CHECK_HIP_ERROR(dIsplit.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments stebz_checkBadArgs(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD.data(), dE.data(), dnev.data(), dnsplit.data(), dW.data(), dIblock.data(), dIsplit.data(), dinfo.data()); } template void stebz_initData(const rocblas_handle handle, const rocblas_int n, Td& dD, Td& dE, Th& hD, Th& hE) { if(CPU) { rocblas_init(hD, true); rocblas_init(hE, true); // scale matrix and add fixed splits in the matrix to test split handling // (scaling ensures that all eigenvalues are in [-20, 20]) for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 10; hE[0][i] = (hE[0][i] - 5) / 10; if(i == n / 4 || i == n / 2 || i == n - 1) hE[0][i] = 0; if(i == n / 7 || i == n / 5 || i == n / 3) hD[0][i] *= -1; } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); } } template void stebz_getError(const rocblas_handle handle, const rocblas_erange erange, const rocblas_eorder eorder, const rocblas_int n, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, const T abstol, Td& dD, Td& dE, Ud& dnev, Ud& dnsplit, Td& dW, Ud& dIblock, Ud& dIsplit, Ud& dinfo, Th& hD, Th& hE, Uh& hnev, Uh& hnevRes, Uh& hnsplit, Uh& 
hnsplitRes, Th& hW, Th& hWRes, Uh& hIblock, Uh& hIblockRes, Uh& hIsplit, Uh& hIsplitRes, Uh& hinfo, Uh& hinfoRes, double* max_err) { std::vector work(4 * n); std::vector iwork(3 * n); // input data initialization stebz_initData(handle, n, dD, dE, hD, hE); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD.data(), dE.data(), dnev.data(), dnsplit.data(), dW.data(), dIblock.data(), dIsplit.data(), dinfo.data())); CHECK_HIP_ERROR(hnevRes.transfer_from(dnev)); CHECK_HIP_ERROR(hnsplitRes.transfer_from(dnsplit)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hIblockRes.transfer_from(dIblock)); CHECK_HIP_ERROR(hIsplitRes.transfer_from(dIsplit)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin T atol = (abstol == 0) ? 2 * get_safemin() : abstol; cpu_stebz(erange, eorder, n, vl, vu, il, iu, atol, hD[0], hE[0], hnev[0], hnsplit[0], hW[0], hIblock[0], hIsplit[0], work.data(), iwork.data(), hinfo[0]); // check info if(hinfo[0][0] != hinfoRes[0][0]) *max_err = 1; else *max_err = 0; // check number of split blocks rocblas_int ns = hnsplit[0][0]; *max_err += std::abs(ns - hnsplitRes[0][0]); // check split blocks limits for(int k = 0; k < ns; ++k) *max_err += std::abs(hIsplit[0][k] - hIsplitRes[0][k]); // if finding eigenvalues succeded, check values if(hinfo[0][0] == 0) { // check number of computed eigenvalues rocblas_int nn = hnev[0][0]; *max_err += std::abs(nn - hnevRes[0][0]); // check block indices // (note: as very close eigenvalues could be considered to belong to different // blocks by the CPU and GPU algorithms, only check the block index of distinguishable // eigenvalues) for(int k = 0; k < nn; ++k) { int difb = std::abs(hIblock[0][k] - hIblockRes[0][k]); T difv = std::abs(hW[0][k] - hWRes[0][k]) / hW[0][k]; if(difb > 0 && difv > n * get_epsilon()) *max_err += difb; } // error is ||hW - 
hWRes|| / ||hW|| // using frobenius norm double err = norm_error('F', 1, nn, 1, hW[0], hWRes[0]); *max_err = err > *max_err ? err : *max_err; } } template void stebz_getPerfData(const rocblas_handle handle, const rocblas_erange erange, const rocblas_eorder eorder, const rocblas_int n, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, const T abstol, Td& dD, Td& dE, Ud& dnev, Ud& dnsplit, Td& dW, Ud& dIblock, Ud& dIsplit, Ud& dinfo, Th& hD, Th& hE, Uh& hnev, Uh& hnsplit, Th& hW, Uh& hIblock, Uh& hIsplit, Uh& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { std::vector work(4 * n); std::vector iwork(3 * n); // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin T atol = (abstol == 0) ? 2 * get_safemin() : abstol; stebz_initData(handle, n, dD, dE, hD, hE); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_stebz(erange, eorder, n, vl, vu, il, iu, atol, hD[0], hE[0], hnev[0], hnsplit[0], hW[0], hIblock[0], hIsplit[0], work.data(), iwork.data(), hinfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } stebz_initData(handle, n, dD, dE, hD, hE); // cold calls for(int iter = 0; iter < 2; iter++) { stebz_initData(handle, n, dD, dE, hD, hE); CHECK_ROCBLAS_ERROR(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD.data(), dE.data(), dnev.data(), dnsplit.data(), dW.data(), dIblock.data(), dIsplit.data(), dinfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { 
stebz_initData(handle, n, dD, dE, hD, hE); start = get_time_us_sync(stream); rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD.data(), dE.data(), dnev.data(), dnsplit.data(), dW.data(), dIblock.data(), dIsplit.data(), dinfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_stebz(Arguments& argus) { // get arguments rocblas_local_handle handle; char erangeC = argus.get("erange"); char eorderC = argus.get("eorder"); rocblas_int n = argus.get("n"); T vl = T(argus.get("vl", 0)); T vu = T(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); T abstol = T(argus.get("abstol")); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_eorder eorder = char2rocblas_eorder(eorderC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = n; size_t size_W = n; size_t size_iblock = n; size_t size_isplit = n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? size_W : 0; size_t size_iblockRes = (argus.unit_check || argus.norm_check) ? size_iblock : 0; size_t size_isplitRes = (argus.unit_check || argus.norm_check) ? 
size_isplit : 0; // check invalid sizes bool invalid_size = (n < 0) || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu))) || (erange == rocblas_erange_index && (il < 1 || iu < 0)); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, (T*)nullptr, (T*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (T*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, (T*)nullptr, (T*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (T*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hW(size_W, 1, size_W, 1); host_strided_batch_vector hWRes(size_WRes, 1, size_WRes, 1); host_strided_batch_vector hIblock(size_iblock, 1, size_iblock, 1); host_strided_batch_vector hIblockRes(size_iblockRes, 1, size_iblockRes, 1); host_strided_batch_vector hIsplit(size_isplit, 1, size_isplit, 1); host_strided_batch_vector hIsplitRes(size_isplitRes, 1, size_isplitRes, 1); host_strided_batch_vector hnev(1, 1, 1, 1); host_strided_batch_vector hnevRes(1, 1, 1, 1); host_strided_batch_vector hnsplit(1, 1, 1, 1); host_strided_batch_vector hnsplitRes(1, 1, 1, 1); host_strided_batch_vector hinfo(1, 1, 
1, 1); host_strided_batch_vector hinfoRes(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dW(size_W, 1, size_W, 1); device_strided_batch_vector dIblock(size_iblock, 1, size_iblock, 1); device_strided_batch_vector dIsplit(size_isplit, 1, size_isplit, 1); device_strided_batch_vector dnev(1, 1, 1, 1); device_strided_batch_vector dnsplit(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); if(size_iblock) CHECK_HIP_ERROR(dIblock.memcheck()); if(size_isplit) CHECK_HIP_ERROR(dIsplit.memcheck()); CHECK_HIP_ERROR(dnev.memcheck()); CHECK_HIP_ERROR(dnsplit.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD.data(), dE.data(), dnev.data(), dnsplit.data(), dW.data(), dIblock.data(), dIsplit.data(), dinfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) stebz_getError(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo, hD, hE, hnev, hnevRes, hnsplit, hnsplitRes, hW, hWRes, hIblock, hIblockRes, hIsplit, hIsplitRes, hinfo, hinfoRes, &max_error); // collect performance data if(argus.timing) stebz_getPerfData(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo, hD, hE, hnev, hnsplit, hW, hIblock, hIsplit, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { 
if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("erange", "eorder", "n", "vl", "vu", "il", "iu", "abstol"); rocsolver_bench_output(erangeC, eorderC, n, vl, vu, il, iu, abstol); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STEBZ(...) extern template void testing_stebz<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STEBZ, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_stedc.hpp000066400000000000000000000503131436600607200226400ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void stedc_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, S dD, S dE, T dC, const rocblas_int ldc, U dInfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_stedc(nullptr, evect, n, dD, dE, dC, ldc, dInfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, rocblas_evect(0), n, dD, dE, dC, ldc, dInfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, evect, n, (S) nullptr, dE, dC, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, evect, n, dD, (S) nullptr, dC, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, evect, n, dD, dE, (T) nullptr, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, evect, n, dD, dE, dC, ldc, (U) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_stedc(handle, evect, 0, (S) nullptr, (S) nullptr, (T) nullptr, ldc, dInfo), rocblas_status_success); } template void testing_stedc_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_original; rocblas_int n = 1; rocblas_int ldc = 1; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments stedc_checkBadArgs(handle, evect, n, dD.data(), 
dE.data(), dC.data(), ldc, dInfo.data()); } template void stedc_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hC, Uh& hInfo) { if(CPU) { using S = decltype(std::real(T{})); // if the matrix is too small (n < 4), simply initialize D and E if(n < 4) { rocblas_init(hD, true); rocblas_init(hE, true); } // otherwise, the marix will be divided in exactly 2 independent blocks, if the size is even, // or 3 if the size is odd. The 2 main independent blocks will have the same eigenvalues. // The last block, when the size is odd, will have eigenvalue equal 1. else { rocblas_int N1 = n / 2; rocblas_int E = n - 2 * N1; // a. initialize the eigenvalues for the uppermost sub-blocks of the main independent blocks. // The second sub-block will have some repeated eigenvalues in order to test the deflation process S d; rocblas_int NN1 = N1 / 2; rocblas_int NN2 = N1 - NN1; rocblas_int s1 = NN1 * NN1; rocblas_int s2 = NN2 * NN2; rocblas_int sw = NN2 * 32; std::vector A1(s1); std::vector A2(s2); for(rocblas_int i = 0; i < NN1; ++i) { for(rocblas_int j = 0; j < NN1; ++j) { if(i == j) { d = (i + 1) / S(NN1); A1[i + i * NN1] = d; A2[i + i * NN2] = (i % 2 == 0) ? d : -d; } else { A1[i + j * NN1] = 0; A2[i + j * NN2] = 0; } } } if(NN2 > NN1) { for(rocblas_int i = 0; i < NN1; ++i) { A2[NN1 + i * NN2] = 0; A2[i + NN1 * NN2] = 0; } A2[NN1 + NN1 * NN2] = 0; } // b. 
find the corresponding tridiagonal matrices containing the setup eigenvalues of each sub-block // first find random orthogonal matrices Q1 and Q2 Sh Q1(s1, 1, s1, 1); Sh Q2(s2, 1, s2, 1); rocblas_init(Q1, true); rocblas_init(Q2, true); std::vector hW(sw); std::vector ipiv1(NN1); std::vector ipiv2(NN2); cpu_geqrf(NN1, NN1, Q1.data(), NN1, ipiv1.data(), hW.data(), sw); cpu_geqrf(NN2, NN2, Q2.data(), NN2, ipiv2.data(), hW.data(), sw); // now multiply the orthogonal matrices by the diagonals A1 and A2 to hide the eigenvalues cpu_ormqr_unmqr(rocblas_side_left, rocblas_operation_transpose, NN1, NN1, NN1, Q1.data(), NN1, ipiv1.data(), A1.data(), NN1, hW.data(), sw); cpu_ormqr_unmqr(rocblas_side_right, rocblas_operation_none, NN1, NN1, NN1, Q1.data(), NN1, ipiv1.data(), A1.data(), NN1, hW.data(), sw); cpu_ormqr_unmqr(rocblas_side_left, rocblas_operation_transpose, NN2, NN2, NN2, Q2.data(), NN2, ipiv2.data(), A2.data(), NN2, hW.data(), sw); cpu_ormqr_unmqr(rocblas_side_right, rocblas_operation_none, NN2, NN2, NN2, Q2.data(), NN2, ipiv2.data(), A2.data(), NN2, hW.data(), sw); // finally, perform tridiagonalization cpu_sytrd_hetrd(rocblas_fill_upper, NN1, A1.data(), NN1, hD[0], hE[0], ipiv1.data(), hW.data(), sw); cpu_sytrd_hetrd(rocblas_fill_upper, NN2, A2.data(), NN2, hD[0] + NN1, hE[0] + NN1, ipiv2.data(), hW.data(), sw); // c. 
integrate blocks into final matrix // integrate the 2 sub-blocks into the first independent block hE[0][NN1 - 1] = 1; hD[0][NN1 - 1] += 1; hD[0][NN1] += 1; // copy the independent block over for(rocblas_int i = 0; i < N1; ++i) { hD[0][N1 + i] = hD[0][i]; hE[0][N1 + i] = hE[0][i]; } hE[0][N1 - 1] = 0; hE[0][2 * N1 - 1] = 0; // integrate the 2 sub-blocks into the second independent block // (using negative p to test secular eqn algorithm) hE[0][N1 + NN1 - 1] = -1; hD[0][N1 + NN1 - 1] -= 2; hD[0][N1 + NN1] -= 2; // if there is a third independent block, initialize it with 1 if(E == 1) hD[0][n - 1] = 1; } // initialize C to the identity matrix if(evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hC[0][i + j * ldc] = 1; else hC[0][i + j * ldc] = 0; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void stedc_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hDRes, Sh& hE, Sh& hERes, Th& hC, Th& hCRes, Uh& hInfo, Uh& hInfoRes, double* max_err, double* max_errv) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lgn = floor(log(n - 1) / log(2)) + 1; size_t lwork = (COMPLEX ? n * n : 0); size_t lrwork = (evect == rocblas_evect_none ? 1 : 1 + 3 * n + 4 * n * n + 2 * n * lgn); size_t liwork = (evect == rocblas_evect_none ? 
1 : 6 + 6 * n + 5 * n * lgn); std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); // input data initialization stedc_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_stedc(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data())); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hERes.transfer_from(dE)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hCRes.transfer_from(dC)); // if eigenvectors were required, prepare matrix A (upper triangular) for implicit tests rocblas_int lda = n; size_t size_A = lda * n; host_strided_batch_vector hA(size_A, 1, size_A, 1); if(evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = i; j < n; j++) { if(i == j) hA[0][i + j * lda] = hD[0][i]; else if(i + 1 == j) hA[0][i + j * lda] = hE[0][i]; else hA[0][i + j * lda] = 0; } } } // CPU lapack cpu_stedc(evect, n, hD[0], hE[0], hC[0], ldc, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, hInfo[0]); // check info if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; else *max_err = 0; double err; if(hInfo[0][0] == 0) { // check that eigenvalues are correct and in order // error is ||hD - hDRes|| / ||hD|| // using frobenius norm err = norm_error('F', 1, n, 1, hD[0], hDRes[0]); *max_err = err > *max_err ? 
err : *max_err; // check eigenvectors if required if(evect != rocblas_evect_none) { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[0][j]; cpu_symv_hemv(rocblas_fill_upper, n, alpha, hA[0], lda, hCRes[0] + j * ldc, 1, beta, hC[0] + j * ldc, 1); } // error is ||hC - hCRes|| / ||hC|| // using frobenius norm *max_errv = norm_error('F', n, n, ldc, hCRes[0], hC[0]); } } } template void stedc_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hC, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lgn = floor(log(n - 1) / log(2)) + 1; size_t lwork = (COMPLEX ? n * n : 0); size_t lrwork = (evect == rocblas_evect_none ? 1 : 1 + 3 * n + 4 * n * n + 2 * n * lgn); size_t liwork = (evect == rocblas_evect_none ? 
1 : 6 + 6 * n + 5 * n * lgn); std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); if(!perf) { stedc_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_stedc(evect, n, hD[0], hE[0], hC[0], ldc, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } stedc_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // cold calls for(int iter = 0; iter < 2; iter++) { stedc_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); CHECK_ROCBLAS_ERROR( rocsolver_stedc(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { stedc_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); start = get_time_us_sync(stream); rocsolver_stedc(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_stedc(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); rocblas_int n = argus.get("n"); rocblas_int ldc = argus.get("ldc", n); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = n; size_t size_C = ldc * n; double max_err = 0, max_errv = 0, gpu_time_used = 0, cpu_time_used = 0; 
size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; size_t size_ERes = (argus.unit_check || argus.norm_check) ? size_E : 0; size_t size_CRes = (argus.unit_check || argus.norm_check) ? size_C : 0; // check invalid sizes bool invalid_size = (n < 0 || (evect != rocblas_evect_none && ldc < n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, evect, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_stedc(handle, evect, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hDRes(size_DRes, 1, size_DRes, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hERes(size_ERes, 1, size_ERes, 1); host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCRes(size_CRes, 1, size_CRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_stedc(handle, evect, n, 
dD.data(), dE.data(), dC.data(), ldc, dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) stedc_getError(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hDRes, hE, hERes, hC, hCRes, hInfo, hInfoRes, &max_err, &max_errv); // collect performance data if(argus.timing) stedc_getPerfData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_err, n); if(evect != rocblas_evect_none) ROCSOLVER_TEST_CHECK(T, max_errv, n * n); } // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("evect", "n", "ldc"); rocsolver_bench_output(evectC, n, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, std::max(max_err, max_errv)); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, std::max(max_err, max_errv)); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STEDC(...) extern template void testing_stedc<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STEDC, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_stein.hpp000066400000000000000000000503261436600607200226640ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void stein_checkBadArgs(const rocblas_handle handle, const rocblas_int n, S dD, S dE, U dNev, S dW, U dIblock, U dIsplit, T dZ, const rocblas_int ldz, U dIfail, U dInfo) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_stein(nullptr, n, dD, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, (S) nullptr, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD, (S) nullptr, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD, dE, (U) nullptr, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD, dE, dNev, (S) nullptr, dIblock, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_stein(handle, n, dD, dE, dNev, dW, (U) nullptr, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_stein(handle, n, dD, dE, dNev, dW, dIblock, (U) nullptr, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD, dE, dNev, dW, dIblock, dIsplit, (T) nullptr, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_stein(handle, n, dD, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, (U) nullptr, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, (U) nullptr), 
rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, 0, (S) nullptr, (S) nullptr, dNev, (S) nullptr, (U) nullptr, (U) nullptr, (T) nullptr, ldz, (U) nullptr, dInfo), rocblas_status_success); } template void testing_stein_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int ldz = 1; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dIblock(1, 1, 1, 1); device_strided_batch_vector dIsplit(1, 1, 1, 1); device_strided_batch_vector dZ(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dIblock.memcheck()); CHECK_HIP_ERROR(dIsplit.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments stein_checkBadArgs(handle, n, dD.data(), dE.data(), dNev.data(), dW.data(), dIblock.data(), dIsplit.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()); } template void stein_initData(const rocblas_handle handle, const rocblas_int n, const rocblas_int nev, Sd& dD, Sd& dE, Ud& dNev, Sd& dW, Ud& dIblock, Ud& dIsplit, Sh& hD, Sh& hE, Uh& hNev, Sh& hW, Uh& hIblock, Uh& hIsplit) { if(CPU) { using S = decltype(std::real(T{})); rocblas_init(hD, true); rocblas_init(hE, true); rocblas_int nsplit, info; size_t lwork = 4 * n; size_t liwork = 3 * n; std::vector work(lwork); std::vector iwork(liwork); // scale matrix for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 10; hE[0][i] -= 5; if(i == n / 4 || i == n / 2 || i == n - 1) hE[0][i] = 0; if(i == n / 7 || i == n / 5 || i == n / 3) hD[0][i] *= -1; } // compute a 
subset of the eigenvalues S il = n - nev + 1; S iu = n; S abstol = 2 * get_safemin(); cpu_stebz(rocblas_erange_index, rocblas_eorder_blocks, n, S(0), S(0), il, iu, abstol, hD[0], hE[0], hNev[0], &nsplit, hW[0], hIblock[0], hIsplit[0], work.data(), iwork.data(), &info); } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); CHECK_HIP_ERROR(dNev.transfer_from(hNev)); CHECK_HIP_ERROR(dW.transfer_from(hW)); CHECK_HIP_ERROR(dIblock.transfer_from(hIblock)); CHECK_HIP_ERROR(dIsplit.transfer_from(hIsplit)); } } template void stein_getError(const rocblas_handle handle, const rocblas_int n, const rocblas_int nev, Sd& dD, Sd& dE, Ud& dNev, Sd& dW, Ud& dIblock, Ud& dIsplit, Td& dZ, const rocblas_int ldz, Ud& dIfail, Ud& dInfo, Sh& hD, Sh& hE, Uh& hNev, Sh& hW, Uh& hIblock, Uh& hIsplit, Th& hZ, Th& hZRes, Uh& hIfail, Uh& hIfailRes, Uh& hInfo, Uh& hInfoRes, double* max_err) { using S = decltype(std::real(T{})); size_t lwork = 5 * n; size_t liwork = n; size_t lifail = n; std::vector work(lwork); std::vector iwork(liwork); // input data initialization stein_initData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, hD, hE, hNev, hW, hIblock, hIsplit); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_stein(handle, n, dD.data(), dE.data(), dNev.data(), dW.data(), dIblock.data(), dIsplit.data(), dZ.data(), ldz, dIfail.data(), dInfo.data())); CHECK_HIP_ERROR(hZRes.transfer_from(dZ)); CHECK_HIP_ERROR(hIfailRes.transfer_from(dIfail)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack cpu_stein(n, hD[0], hE[0], hNev[0], hW[0], hIblock[0], hIsplit[0], hZ[0], ldz, work.data(), iwork.data(), hIfail[0], hInfo[0]); // check info if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; else *max_err = 0; double err; if(hInfo[0][0] == 0) { // check ifail err = 0; for(int j = 0; j < hNev[0][0]; j++) { if(hIfailRes[0][j] != 0) err++; } *max_err = err > *max_err ? 
err : *max_err; // need to implicitly test eigenvectors due to non-uniqueness of eigenvectors under scaling // for each of the nev eigenvalues w_j, verify that the associated eigenvector is in the // null space of (A - w_i * I) T alpha, t1, t2; for(int j = 0; j < hNev[0][0]; j++) { for(int i = 0; i < n; i++) { alpha = hW[0][j] - hD[0][i]; hZ[0][i + j * ldz] = hZRes[0][i + j * ldz] * alpha; } t1 = hZRes[0][j * ldz]; hZRes[0][j * ldz] = hE[0][0] * hZRes[0][1 + j * ldz]; for(int i = 1; i < n - 1; i++) { t2 = hZRes[0][i + j * ldz]; hZRes[0][i + j * ldz] = hE[0][i - 1] * t1 + hE[0][i] * hZRes[0][(i + 1) + j * ldz]; t1 = t2; } hZRes[0][(n - 1) + j * ldz] = hE[0][n - 2] * t1; } // error is then ||hZ - hZRes|| / ||hZ|| // using frobenius norm err = norm_error('F', n, hNev[0][0], ldz, hZ[0], hZRes[0]); *max_err = err > *max_err ? err : *max_err; } else { // check ifail err = 0; for(int j = 0; j < hInfo[0][0]; j++) { if(hIfailRes[0][j] == 0) err++; } *max_err = err > *max_err ? err : *max_err; } } template void stein_getPerfData(const rocblas_handle handle, const rocblas_int n, const rocblas_int nev, Sd& dD, Sd& dE, Ud& dNev, Sd& dW, Ud& dIblock, Ud& dIsplit, Td& dZ, const rocblas_int ldz, Ud& dIfail, Ud& dInfo, Sh& hD, Sh& hE, Uh& hNev, Sh& hW, Uh& hIblock, Uh& hIsplit, Th& hZ, Uh& hIfail, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { using S = decltype(std::real(T{})); size_t lwork = 5 * n; size_t liwork = n; size_t lifail = n; std::vector work(lwork); std::vector iwork(liwork); if(!perf) { stein_initData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, hD, hE, hNev, hW, hIblock, hIsplit); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_stein(n, hD[0], hE[0], hNev[0], hW[0], hIblock[0], hIsplit[0], hZ[0], ldz, work.data(), iwork.data(), hIfail[0], hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } 
stein_initData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, hD, hE, hNev, hW, hIblock, hIsplit); // cold calls for(int iter = 0; iter < 2; iter++) { stein_initData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, hD, hE, hNev, hW, hIblock, hIsplit); CHECK_ROCBLAS_ERROR(rocsolver_stein(handle, n, dD.data(), dE.data(), dNev.data(), dW.data(), dIblock.data(), dIsplit.data(), dZ.data(), ldz, dIfail.data(), dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { stein_initData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, hD, hE, hNev, hW, hIblock, hIsplit); start = get_time_us_sync(stream); rocsolver_stein(handle, n, dD.data(), dE.data(), dNev.data(), dW.data(), dIblock.data(), dIsplit.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_stein(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int nev = argus.get("nev", n < 5 ? n : 5); rocblas_int ldz = argus.get("ldz", n); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = size_D; size_t size_W = size_D; size_t size_iblock = size_D; size_t size_isplit = size_D; size_t size_Z = ldz * n; size_t size_ifail = size_D; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ZRes = (argus.unit_check || argus.norm_check) ? size_Z : 0; size_t size_ifailRes = (argus.unit_check || argus.norm_check) ? 
size_ifail : 0; // check invalid sizes bool invalid_size = (n < 0 || ldz < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_stein(handle, n, (S*)nullptr, (S*)nullptr, (rocblas_int*)nullptr, (S*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (T*)nullptr, ldz, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_stein(handle, n, (S*)nullptr, (S*)nullptr, (rocblas_int*)nullptr, (S*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (T*)nullptr, ldz, (rocblas_int*)nullptr, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations // host host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hNev(1, 1, 1, 1); host_strided_batch_vector hW(size_W, 1, size_W, 1); host_strided_batch_vector hIblock(size_iblock, 1, size_iblock, 1); host_strided_batch_vector hIsplit(size_isplit, 1, size_isplit, 1); host_strided_batch_vector hZ(size_Z, 1, size_Z, 1); host_strided_batch_vector hZRes(size_ZRes, 1, size_ZRes, 1); host_strided_batch_vector hIfail(size_ifail, 1, size_ifail, 1); host_strided_batch_vector hIfailRes(size_ifailRes, 1, size_ifailRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); // device device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dW(size_W, 1, size_W, 1); device_strided_batch_vector 
dIblock(size_iblock, 1, size_iblock, 1); device_strided_batch_vector dIsplit(size_isplit, 1, size_isplit, 1); device_strided_batch_vector dZ(size_Z, 1, size_Z, 1); device_strided_batch_vector dIfail(size_ifail, 1, size_ifail, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); if(size_iblock) CHECK_HIP_ERROR(dIblock.memcheck()); if(size_isplit) CHECK_HIP_ERROR(dIsplit.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); if(size_ifail) CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD.data(), dE.data(), dNev.data(), dW.data(), dIblock.data(), dIsplit.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) stein_getError(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo, hD, hE, hNev, hW, hIblock, hIsplit, hZ, hZRes, hIfail, hIfailRes, hInfo, hInfoRes, &max_error); // collect performance data if(argus.timing) stein_getPerfData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo, hD, hE, hNev, hW, hIblock, hIsplit, hZ, hIfail, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("n", "nev", "ldz"); rocsolver_bench_output(n, nev, ldz); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); 
rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STEIN(...) extern template void testing_stein<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STEIN, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_steqr.hpp000066400000000000000000000362741436600607200227060ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void steqr_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, S dD, S dE, T dC, const rocblas_int ldc, U dInfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_steqr(nullptr, evect, n, dD, dE, dC, ldc, dInfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, rocblas_evect(0), n, dD, dE, dC, ldc, dInfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, evect, n, (S) nullptr, dE, dC, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, evect, n, dD, (S) nullptr, dC, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, evect, n, dD, dE, (T) nullptr, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, evect, n, dD, dE, dC, ldc, 
(U) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_steqr(handle, evect, 0, (S) nullptr, (S) nullptr, (T) nullptr, ldc, dInfo), rocblas_status_success); } template void testing_steqr_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_original; rocblas_int n = 1; rocblas_int ldc = 1; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments steqr_checkBadArgs(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()); } template void steqr_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hC, Uh& hInfo) { if(CPU) { using S = decltype(std::real(T{})); rocblas_init(hD, true); rocblas_init(hE, true); // scale matrix and add random splits for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 400; hE[0][i] -= 5; } // add fixed splits in the matrix to test split handling rocblas_int k = n / 2; hE[0][k] = 0; hE[0][k - 1] = 0; // initialize C to the identity matrix if(evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hC[0][i + j * ldc] = 1; else hC[0][i + j * ldc] = 0; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void steqr_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hDRes, Sh& hE, 
Sh& hERes, Th& hC, Th& hCRes, Uh& hInfo, Uh& hInfoRes, double* max_err) { using S = decltype(std::real(T{})); size_t lwork = (evect == rocblas_evect_none ? 0 : 2 * n - 2); std::vector work(lwork); // input data initialization steqr_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_steqr(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data())); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hERes.transfer_from(dE)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hCRes.transfer_from(dC)); // if eigenvectors were required, prepare matrix A (upper triangular) for implicit tests rocblas_int lda = n; size_t size_A = lda * n; host_strided_batch_vector hA(size_A, 1, size_A, 1); if(evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = i; j < n; j++) { if(i == j) hA[0][i + j * lda] = hD[0][i]; else if(i + 1 == j) hA[0][i + j * lda] = hE[0][i]; else hA[0][i + j * lda] = 0; } } } // CPU lapack cpu_steqr(evect, n, hD[0], hE[0], hC[0], ldc, work.data(), hInfo[0]); // check info if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; else *max_err = 0; double err; if(hInfo[0][0] == 0) { // check that eigenvalues are correct and in order // error is ||hD - hDRes|| / ||hD|| // using frobenius norm err = norm_error('F', 1, n, 1, hD[0], hDRes[0]); *max_err = err > *max_err ? 
err : *max_err; // check eigenvectors if required if(evect != rocblas_evect_none) { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[0][j]; cpu_symv_hemv(rocblas_fill_upper, n, alpha, hA[0], lda, hCRes[0] + j * ldc, 1, beta, hC[0] + j * ldc, 1); } // error is ||hC - hCRes|| / ||hC|| // using frobenius norm err = norm_error('F', n, n, ldc, hC[0], hCRes[0]); *max_err = err > *max_err ? err : *max_err; } } } template void steqr_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hC, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { using S = decltype(std::real(T{})); size_t lwork = (evect == rocblas_evect_none ? 
0 : 2 * n - 2); std::vector work(lwork); if(!perf) { steqr_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_steqr(evect, n, hD[0], hE[0], hC[0], ldc, work.data(), hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } steqr_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // cold calls for(int iter = 0; iter < 2; iter++) { steqr_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); CHECK_ROCBLAS_ERROR( rocsolver_steqr(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { steqr_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); start = get_time_us_sync(stream); rocsolver_steqr(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_steqr(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); rocblas_int n = argus.get("n"); rocblas_int ldc = argus.get("ldc", n); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = n; size_t size_C = ldc * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; size_t size_ERes = (argus.unit_check || argus.norm_check) ? 
size_E : 0; size_t size_CRes = (argus.unit_check || argus.norm_check) ? size_C : 0; // check invalid sizes bool invalid_size = (n < 0 || (evect != rocblas_evect_none && ldc < n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, evect, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_steqr(handle, evect, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hDRes(size_DRes, 1, size_DRes, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hERes(size_ERes, 1, size_ERes, 1); host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCRes(size_CRes, 1, size_CRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_steqr(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()), rocblas_status_success); if(argus.timing) 
rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) steqr_getError(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hDRes, hE, hERes, hC, hCRes, hInfo, hInfoRes, &max_error); // collect performance data if(argus.timing) steqr_getPerfData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("evect", "n", "ldc"); rocsolver_bench_output(evectC, n, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STEQR(...) extern template void testing_steqr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STEQR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_sterf.hpp000066400000000000000000000235421436600607200226650ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void sterf_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dD, T dE, U dInfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sterf(nullptr, n, dD, dE, dInfo), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sterf(handle, n, (T) nullptr, dE, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sterf(handle, n, dD, (T) nullptr, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sterf(handle, n, dD, dE, (U) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sterf(handle, 0, (T) nullptr, (T) nullptr, dInfo), rocblas_status_success); } template void testing_sterf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sterf_checkBadArgs(handle, n, dD.data(), dE.data(), dInfo.data()); } template void sterf_initData(const rocblas_handle handle, const rocblas_int n, Td& dD, Td& dE, Ud& dInfo, Th& hD, Th& hE, Uh& hInfo) { if(CPU) { rocblas_init(hD, true); rocblas_init(hE, true); // scale matrix and add random splits for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 400; hE[0][i] -= 5; } // add fixed splits in the matrix to test split handling rocblas_int k = n / 2; hE[0][k] = 0; hE[0][k - 1] = 0; } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); } } template 
void sterf_getError(const rocblas_handle handle, const rocblas_int n, Td& dD, Td& dE, Ud& dInfo, Th& hD, Th& hDRes, Th& hE, Th& hERes, Uh& hInfo, double* max_err) { // input data initialization sterf_initData(handle, n, dD, dE, dInfo, hD, hE, hInfo); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sterf(handle, n, dD.data(), dE.data(), dInfo.data())); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hERes.transfer_from(dE)); // CPU lapack cpu_sterf(n, hD[0], hE[0]); // error is ||hD - hDRes|| / ||hD|| // using frobenius norm *max_err = norm_error('F', 1, n, 1, hD[0], hDRes[0]); } template void sterf_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dD, Td& dE, Ud& dInfo, Th& hD, Th& hE, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { sterf_initData(handle, n, dD, dE, dInfo, hD, hE, hInfo); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_sterf(n, hD[0], hE[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sterf_initData(handle, n, dD, dE, dInfo, hD, hE, hInfo); // cold calls for(int iter = 0; iter < 2; iter++) { sterf_initData(handle, n, dD, dE, dInfo, hD, hE, hInfo); CHECK_ROCBLAS_ERROR(rocsolver_sterf(handle, n, dD.data(), dE.data(), dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sterf_initData(handle, n, dD, dE, dInfo, hD, hE, hInfo); start = get_time_us_sync(stream); rocsolver_sterf(handle, n, dD.data(), dE.data(), dInfo.data()); *gpu_time_used += 
get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sterf(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; size_t size_ERes = (argus.unit_check || argus.norm_check) ? size_E : 0; // check invalid sizes bool invalid_size = (n < 0); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_sterf(handle, n, (T*)nullptr, (T*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_sterf(handle, n, (T*)nullptr, (T*)nullptr, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hDRes(size_DRes, 1, size_DRes, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hERes(size_ERes, 1, size_ERes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sterf(handle, n, dD.data(), dE.data(), dInfo.data()), 
rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sterf_getError(handle, n, dD, dE, dInfo, hD, hDRes, hE, hERes, hInfo, &max_error); // collect performance data if(argus.timing) sterf_getPerfData(handle, n, dD, dE, dInfo, hD, hE, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("n"); rocsolver_bench_output(n); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STERF(...) extern template void testing_sterf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STERF, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_syev_heev.hpp000066400000000000000000000563411436600607200235420ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void syev_heev_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, S dD, const rocblas_stride stD, S dE, const rocblas_stride stE, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, nullptr, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, rocblas_evect(0), uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, rocblas_fill_full, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T) nullptr, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA, lda, stA, (S) nullptr, stD, dE, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, (S) nullptr, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers 
EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, 0, (T) nullptr, lda, stA, (S) nullptr, stD, (S) nullptr, stE, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, 0), rocblas_status_success); } template void testing_syev_heev_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stD = 1; rocblas_stride stE = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syev_heev_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syev_heev_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); } } template void syev_heev_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; 
++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test && evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syev_heev_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Id& dinfo, const rocblas_int bc, Th& hA, Th& hAres, Sh& hD, Sh& hDres, Ih& hinfo, Ih& hinfoRes, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int sizeE = 3 * n - 1; int lwork = (COMPLEX ? 2 * n - 1 : 0); std::vector work(lwork); std::vector hE(sizeE); std::vector A(lda * n * bc); // input data initialization syev_heev_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc)); CHECK_HIP_ERROR(hDres.transfer_from(dD)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(hAres.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_syev_heev(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, hE.data(), sizeE, hinfo[b]); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; // (We expect the used input matrices to always converge. 
Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // error is ||hD - hDRes|| / ||hD|| // using frobenius norm if(hinfo[b][0] == 0) err = norm_error('F', 1, n, 1, hD[b], hDres[b]); *max_err = err > *max_err ? err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hinfo[b][0] == 0) { // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hDres[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hAres[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hAres[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void syev_heev_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Id& dinfo, const rocblas_int bc, Th& hA, Sh& hD, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int sizeE = 3 * n - 1; int lwork = (COMPLEX ? 
2 * n - 1 : 0); std::vector work(lwork); std::vector hE(sizeE); std::vector A; if(!perf) { syev_heev_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_syev_heev(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, hE.data(), sizeE, hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } syev_heev_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syev_heev_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syev_heev_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syev_heev(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stD = argus.get("strideD", n); rocblas_stride stE = argus.get("strideE", n); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); 
rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_D = n; size_t size_E = size_D; size_t size_Ares = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_Dres = (argus.unit_check || argus.norm_check) ? size_D : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, 
(rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hDres(size_Dres, 1, stD, bc); // device device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hAres(size_Ares, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syev_heev_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syev_heev_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hD, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hAres(size_Ares, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || 
bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syev_heev_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syev_heev_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hD, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("evect", "uplo", "n", "lda", "strideD", "strideE", "batch_c"); rocsolver_bench_output(evectC, uploC, n, lda, stD, stE, bc); } else if(STRIDED) { rocsolver_bench_output("evect", "uplo", "n", "lda", "strideA", "strideD", "strideE", "batch_c"); rocsolver_bench_output(evectC, uploC, n, lda, stA, stD, stE, bc); } else { rocsolver_bench_output("evect", "uplo", "n", "lda"); rocsolver_bench_output(evectC, uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYEV_HEEV(...) 
\ extern template void testing_syev_heev<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYEV_HEEV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_syevd_heevd.hpp000066400000000000000000000601311436600607200240420ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void syevd_heevd_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, S dD, const rocblas_stride stD, S dE, const rocblas_stride stE, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, nullptr, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, rocblas_evect(0), uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, rocblas_fill_full, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T) nullptr, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA, lda, 
stA, (S) nullptr, stD, dE, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, (S) nullptr, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, 0, (T) nullptr, lda, stA, (S) nullptr, stD, (S) nullptr, stE, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, 0), rocblas_status_success); } template void testing_syevd_heevd_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stD = 1; rocblas_stride stE = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevd_heevd_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check 
bad arguments syevd_heevd_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); } } template void syevd_heevd_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test && evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syevd_heevd_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Id& dinfo, const rocblas_int bc, Th& hA, Th& hAres, Sh& hD, Sh& hDres, Ih& hinfo, Ih& hinfoRes, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int sizeE, lwork; if(!COMPLEX) { sizeE = (evect == rocblas_evect_none ? 2 * n + 1 : 1 + 6 * n + 2 * n * n); lwork = 0; } else { sizeE = (evect == rocblas_evect_none ? n : 1 + 5 * n + 2 * n * n); lwork = (evect == rocblas_evect_none ? n + 1 : 2 * n + n * n); } int liwork = (evect == rocblas_evect_none ? 
1 : 3 + 5 * n); std::vector work(lwork); std::vector hE(sizeE); std::vector iwork(liwork); std::vector A(lda * n * bc); // input data initialization syevd_heevd_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc)); CHECK_HIP_ERROR(hDres.transfer_from(dD)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(hAres.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_syevd_heevd(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, hE.data(), sizeE, iwork.data(), liwork, hinfo[b]); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // error is ||hD - hDRes|| / ||hD|| // using frobenius norm if(hinfo[b][0] == 0) err = norm_error('F', 1, n, 1, hD[b], hDres[b]); *max_err = err > *max_err ? 
err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hinfo[b][0] == 0) { // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hDres[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hAres[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hAres[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void syevd_heevd_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Id& dinfo, const rocblas_int bc, Th& hA, Sh& hD, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int sizeE, lwork; if(!COMPLEX) { sizeE = (evect == rocblas_evect_none ? 2 * n + 1 : 1 + 6 * n + 2 * n * n); lwork = 0; } else { sizeE = (evect == rocblas_evect_none ? n : 1 + 5 * n + 2 * n * n); lwork = (evect == rocblas_evect_none ? n + 1 : 2 * n + n * n); } int liwork = (evect == rocblas_evect_none ? 
1 : 3 + 5 * n); std::vector work(lwork); std::vector hE(sizeE); std::vector iwork(liwork); std::vector A; if(!perf) { syevd_heevd_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_syevd_heevd(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, hE.data(), sizeE, iwork.data(), liwork, hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } syevd_heevd_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syevd_heevd_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syevd_heevd_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syevd_heevd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stD = argus.get("strideD", n); rocblas_stride stE = argus.get("strideE", n); rocblas_evect evect = 
char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_D = n; size_t size_E = size_D; size_t size_Ares = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_Dres = (argus.unit_check || argus.norm_check) ? size_D : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); else 
CHECK_ALLOC_QUERY(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hDres(size_Dres, 1, stD, bc); // device device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hAres(size_Ares, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevd_heevd_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevd_heevd_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hD, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hAres(size_Ares, 1, 
stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevd_heevd_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevd_heevd_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hD, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("evect", "uplo", "n", "lda", "strideD", "strideE", "batch_c"); rocsolver_bench_output(evectC, uploC, n, lda, stD, stE, bc); } else if(STRIDED) { rocsolver_bench_output("evect", "uplo", "n", "lda", "strideA", "strideD", "strideE", "batch_c"); rocsolver_bench_output(evectC, uploC, n, lda, stA, stD, stE, bc); } else { rocsolver_bench_output("evect", "uplo", "n", "lda"); rocsolver_bench_output(evectC, uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else 
rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYEVD_HEEVD(...) \ extern template void testing_syevd_heevd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYEVD_HEEVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_syevdx_heevdx_inplace.hpp000066400000000000000000000725301436600607200261230ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void syevdx_heevdx_inplace_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS vl, const SS vu, const rocblas_int il, const rocblas_int iu, const SS abstol, U hNev, S dW, const rocblas_stride stW, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, nullptr, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, rocblas_evect(0), erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, rocblas_erange(0), uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, rocblas_fill_full, n, dA, lda, 
stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, (T) nullptr, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, (U) nullptr, dW, stW, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, (S) nullptr, stW, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, 0, (T) nullptr, lda, stA, vl, vu, il, iu, abstol, hNev, (S) nullptr, stW, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, (U) nullptr, dW, stW, (U) nullptr, 0), rocblas_status_success); } template void testing_syevdx_heevdx_inplace_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_original; rocblas_erange erange = rocblas_erange_value; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; 
rocblas_stride stW = 1; rocblas_int bc = 1; S vl = 0.0; S vu = 1.0; rocblas_int il = 0; rocblas_int iu = 0; S abstol = 0; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); host_strided_batch_vector hNev(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevdx_heevdx_inplace_checkBadArgs(handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNev.data(), dW.data(), stW, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); host_strided_batch_vector hNev(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevdx_heevdx_inplace_checkBadArgs(handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNev.data(), dW.data(), stW, dinfo.data(), bc); } } template void syevdx_heevdx_inplace_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // construct well conditioned matrix A such that all eigenvalues are in (-20, 20) for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = i; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 10; else { if(j == i + 1) { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); } else hA[b][j + i * lda] = hA[b][i + j * lda] = 0; } } if(i == n / 4 || i == n / 2 || i == n - 1 || i == n / 7 || i == n / 5 || i == n / 3) hA[b][i + i * lda] *= -1; } // make copy of original data to test vectors if required if(test && evect == 
rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syevdx_heevdx_inplace_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Ih& hNevRes, Sd& dW, const rocblas_stride stW, Id& dinfo, const rocblas_int bc, Th& hA, Th& hARes, Ih& hNev, Sh& hW, Sh& hWRes, Ih& hinfo, Ih& hinfoRes, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = !COMPLEX ? 35 * n : 33 * n; int lrwork = !COMPLEX ? 0 : 7 * n; int liwork = 5 * n; int ldz = n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); std::vector A(lda * n * bc); std::vector Z(ldz * n); std::vector ifail(n); // input data initialization syevdx_heevdx_inplace_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dinfo.data(), bc)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; for(rocblas_int b = 0; b < bc; ++b) cpu_syevx_heevx(evect, erange, uplo, n, hA[b], lda, vl, vu, il, iu, atol, hNev[b], hW[b], Z.data(), ldz, work.data(), lwork, rwork.data(), iwork.data(), ifail.data(), hinfo[b]); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; // Check number of returned eigenvalues double err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hNev[b][0] != hNevRes[b][0]) err++; *max_err = err > *max_err ? err : *max_err; // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). for(rocblas_int b = 0; b < bc; ++b) { if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hinfo[b][0] == 0) err = norm_error('F', 1, hNev[b][0], 1, hW[b], hWRes[b]); *max_err = err > *max_err ? err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hinfo[b][0] == 0) { // multiply A with each of the nev eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // error is ||hZ - hZRes|| / ||hZ|| // using frobenius norm err = norm_error('F', n, hNev[b][0], lda, hA[b], hARes[b]); *max_err = err > *max_err ? 
err : *max_err; } } } } template void syevdx_heevdx_inplace_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Ih& hNevRes, Sd& dW, const rocblas_stride stW, Id& dinfo, const rocblas_int bc, Th& hA, Ih& hNev, Sh& hW, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = !COMPLEX ? 35 * n : 33 * n; int lrwork = !COMPLEX ? 0 : 7 * n; int liwork = 5 * n; int ldz = n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); std::vector A; std::vector Z(ldz * n); std::vector ifail(n); // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; if(!perf) { syevdx_heevdx_inplace_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_syevx_heevx(evect, erange, uplo, n, hA[b], lda, vl, vu, il, iu, atol, hNev[b], hW[b], Z.data(), ldz, work.data(), lwork, rwork.data(), iwork.data(), ifail.data(), hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } syevdx_heevdx_inplace_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syevdx_heevdx_inplace_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syevdx_heevdx_inplace_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syevdx_heevdx_inplace(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char erangeC = argus.get("erange"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = 
argus.get("strideA", lda * n); rocblas_stride stW = argus.get("strideW", n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); S abstol = S(argus.get("abstol", 0)); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_W = n; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? 
size_W : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0 || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (il < 1 || iu < 0)) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu)))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hNev(1, 1, 1, bc); host_strided_batch_vector hNevRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWres(size_WRes, 1, stW, bc); 
host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); // device device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevdx_heevdx_inplace_getError( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNevRes, dW, stW, dinfo, bc, hA, hARes, hNev, hW, hWres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevdx_heevdx_inplace_getPerfData( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNevRes, dW, stW, dinfo, bc, hA, hNev, hW, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { 
syevdx_heevdx_inplace_getError( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNevRes, dW, stW, dinfo, bc, hA, hARes, hNev, hW, hWres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevdx_heevdx_inplace_getPerfData( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNevRes, dW, stW, dinfo, bc, hA, hNev, hW, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "vl", "vu", "il", "iu", "abstol", "strideW", "batch_c"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, vl, vu, il, iu, abstol, stW, bc); } else if(STRIDED) { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "strideA", "vl", "vu", "il", "iu", "abstol", "strideW", "batch_c"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, stA, vl, vu, il, iu, abstol, stW, bc); } else { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "vl", "vu", "il", "iu", "abstol"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, vl, vu, il, iu, abstol); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } 
rocSOLVER-rocm-5.5.1/clients/include/testing_syevj_heevj.hpp000066400000000000000000000675331436600607200240730ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void syevj_heevj_checkBadArgs(const rocblas_handle handle, const rocblas_esort esort, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, S dResidual, const rocblas_int max_sweeps, U dSweeps, S dW, const rocblas_stride stW, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, nullptr, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, rocblas_esort(0), evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, rocblas_evect(0), uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, rocblas_fill_full, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, -1), rocblas_status_invalid_size); // 
pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, (U) nullptr, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, 0, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stW, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, (U) nullptr, dW, stW, (U) nullptr, 0), rocblas_status_success); } template void testing_syevj_heevj_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_none; rocblas_esort esort = rocblas_esort_ascending; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stW = 1; rocblas_int bc = 1; S abstol = 0; rocblas_int max_sweeps = 100; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); 
device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments syevj_heevj_checkBadArgs(handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments syevj_heevj_checkBadArgs(handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); } } template void syevj_heevj_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syevj_heevj_getError(const 
rocblas_handle handle, const rocblas_esort esort, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S abstol, Sd& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Sd& dW, const rocblas_stride stW, Id& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Sh& hResidualRes, Ih& hSweepsRes, Sh& hW, Sh& hWRes, Ih& hInfo, Ih& hInfoRes, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; S atol = (abstol <= 0) ? get_epsilon() : abstol; int lwork = (COMPLEX ? 2 * n - 1 : 0); int lrwork = 3 * n - 1; std::vector work(lwork); std::vector rwork(lrwork); std::vector A(lda * n * bc); // input data initialization syevj_heevj_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc)); CHECK_HIP_ERROR(hResidualRes.transfer_from(dResidual)); CHECK_HIP_ERROR(hSweepsRes.transfer_from(dSweeps)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_syev_heev(evect, uplo, n, hA[b], lda, hW[b], work.data(), lwork, rwork.data(), lrwork, hInfo[b]); // (We expect the used input matrices to always converge) // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfoRes[b][0] != 0) *max_err += 1; // Also check validity of residual for(rocblas_int b = 0; b < bc; ++b) if(hResidualRes[b][0] < 0 || hResidualRes[b][0] > snorm('F', n, n, A.data() + b * lda * n, lda) * atol) *max_err += 1; // Also check validity of sweeps for(rocblas_int b = 0; b < bc; ++b) if(hSweepsRes[b][0] < 0 || hSweepsRes[b][0] > max_sweeps) *max_err += 1; double err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(evect 
!= rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // (no need to test the non-sorted case --lapack return sorted eigenvalues--) // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hInfo[b][0] == 0 && esort == rocblas_esort_ascending) err = norm_error('F', 1, n, 1, hW[b], hWRes[b]); *max_err = err > *max_err ? err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfo[b][0] == 0) { // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void syevj_heevj_getPerfData(const rocblas_handle handle, const rocblas_esort esort, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S abstol, Sd& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Sd& dW, const rocblas_stride stW, Id& dInfo, const rocblas_int bc, Th& hA, Sh& hW, Ih& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 
2 * n - 1 : 0); int lrwork = 3 * n - 1; std::vector work(lwork); std::vector rwork(lrwork); std::vector A; if(!perf) { syevj_heevj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_syev_heev(evect, uplo, n, hA[b], lda, hW[b], work.data(), lwork, rwork.data(), lrwork, hInfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } syevj_heevj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syevj_heevj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syevj_heevj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syevj_heevj(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char esortC = argus.get("esort"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stW = 
argus.get("strideD", n); S abstol = S(argus.get("abstol", 0)); rocblas_int max_sweeps = argus.get("max_sweeps", 100); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_esort esort = char2rocblas_esort(esortC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_W = n; size_t size_Ares = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_Wres = (argus.unit_check || argus.norm_check) ? 
size_W : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_syevj_heevj( STRIDED, handle, esort, evect, uplo, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hResidualRes(1, 1, 1, bc); host_strided_batch_vector hSweepsRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); host_strided_batch_vector hWRes(size_Wres, 1, stW, bc); // device device_strided_batch_vector dResidual(1, 1, 1, bc); 
device_strided_batch_vector dSweeps(1, 1, 1, bc); device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_Ares, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevj_heevj_getError(handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hARes, hResidualRes, hSweepsRes, hW, hWRes, hInfo, hInfoRes, &max_error); } // collect performance data if(argus.timing) { syevj_heevj_getPerfData( handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_Ares, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations 
if(argus.unit_check || argus.norm_check) { syevj_heevj_getError(handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hARes, hResidualRes, hSweepsRes, hW, hWRes, hInfo, hInfoRes, &max_error); } // collect performance data if(argus.timing) { syevj_heevj_getPerfData( handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("esort", "evect", "uplo", "n", "lda", "abstol", "max_sweeps", "strideW", "batch_c"); rocsolver_bench_output(esortC, evectC, uploC, n, lda, abstol, max_sweeps, stW, bc); } else if(STRIDED) { rocsolver_bench_output("esort", "evect", "uplo", "n", "lda", "strideA", "abstol", "max_sweeps", "strideW", "batch_c"); rocsolver_bench_output(esortC, evectC, uploC, n, lda, stA, abstol, max_sweeps, stW, bc); } else { rocsolver_bench_output("esort", "evect", "uplo", "n", "lda", "abstol", "max_sweeps"); rocsolver_bench_output(esortC, evectC, uploC, n, lda, abstol, max_sweeps); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYEVJ_HEEVJ(...) 
\ extern template void testing_syevj_heevj<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYEVJ_HEEVJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_syevx_heevx.hpp000066400000000000000000001032431436600607200241140ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void syevx_heevx_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS vl, const SS vu, const rocblas_int il, const rocblas_int iu, const SS abstol, U dNev, S dW, const rocblas_stride stW, T dZ, const rocblas_int ldz, const rocblas_stride stZ, U dIfail, const rocblas_stride stF, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, nullptr, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, rocblas_evect(0), erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, rocblas_erange(0), uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, rocblas_fill_full, n, dA, lda, stA, vl, vu, 
il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, (T) nullptr, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, (U) nullptr, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, (S) nullptr, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, (T) nullptr, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, (U) nullptr, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, 0, (T) nullptr, lda, stA, vl, vu, il, iu, abstol, dNev, (S) nullptr, stW, (T) nullptr, ldz, stZ, (U) nullptr, stF, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if 
applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, (U) nullptr, dW, stW, dZ, ldz, stZ, dIfail, stF, (U) nullptr, 0), rocblas_status_success); } template void testing_syevx_heevx_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_original; rocblas_erange erange = rocblas_erange_value; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldz = 1; rocblas_stride stA = 1; rocblas_stride stW = 1; rocblas_stride stZ = 1; rocblas_stride stF = 1; rocblas_int bc = 1; S vl = 0.0; S vu = 1.0; rocblas_int il = 0; rocblas_int iu = 0; S abstol = 0; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dZ(1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevx_heevx_checkBadArgs(handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dZ(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments 
syevx_heevx_checkBadArgs(handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc); } } template void syevx_heevx_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // construct well conditioned matrix A such that all eigenvalues are in (-20, 20) for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = i; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 10; else { if(j == i + 1) { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); } else hA[b][j + i * lda] = hA[b][i + j * lda] = 0; } } if(i == n / 4 || i == n / 2 || i == n - 1 || i == n / 7 || i == n / 5 || i == n / 3) hA[b][i + i * lda] *= -1; } // make copy of original data to test vectors if required if(test && evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syevx_heevx_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Id& dNev, Sd& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Id& dIfail, const rocblas_stride stF, Id& dinfo, const rocblas_int bc, Th& hA, Ih& hNev, Ih& hNevRes, Sh& hW, Sh& hWRes, Th& hZ, Th& hZRes, Ih& hIfail, Ih& hIfailRes, Ih& hinfo, Ih& hinfoRes, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = !COMPLEX ? 
35 * n : 33 * n; int lrwork = !COMPLEX ? 0 : 7 * n; int liwork = 5 * n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); std::vector A(lda * n * bc); // input data initialization syevx_heevx_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syevx_heevx( STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc)); CHECK_HIP_ERROR(hNevRes.transfer_from(dNev)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(evect == rocblas_evect_original) { CHECK_HIP_ERROR(hZRes.transfer_from(dZ)); CHECK_HIP_ERROR(hIfailRes.transfer_from(dIfail)); } // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 2 * get_safemin() : abstol; for(rocblas_int b = 0; b < bc; ++b) cpu_syevx_heevx(evect, erange, uplo, n, hA[b], lda, vl, vu, il, iu, atol, hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(), hIfail[b], hinfo[b]); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; // Check number of returned eigenvalues double err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hNev[b][0] != hNevRes[b][0]) err++; *max_err = err > *max_err ? err : *max_err; // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). for(rocblas_int b = 0; b < bc; ++b) { if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hinfo[b][0] == 0) err = norm_error('F', 1, hNev[b][0], 1, hW[b], hWRes[b]); *max_err = err > *max_err ? 
err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hinfo[b][0] == 0) { // check ifail err = 0; for(int j = 0; j < hNev[b][0]; j++) { if(hIfailRes[b][j] != 0) err++; } *max_err = err > *max_err ? err : *max_err; // multiply A with each of the nev eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hZRes[b] + j * ldz, 1, beta, hZ[b] + j * ldz, 1); } // error is ||hZ - hZRes|| / ||hZ|| // using frobenius norm err = norm_error('F', n, hNev[b][0], ldz, hZ[b], hZRes[b]); *max_err = err > *max_err ? err : *max_err; } else { // check ifail err = 0; for(int j = 0; j < hinfo[b][0]; j++) { if(hIfailRes[b][j] == 0) err++; } *max_err = err > *max_err ? err : *max_err; } } } } template void syevx_heevx_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Id& dNev, Sd& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Id& dIfail, const rocblas_stride stF, Id& dinfo, const rocblas_int bc, Th& hA, Ih& hNev, Sh& hW, Th& hZ, Ih& hIfail, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = !COMPLEX ? 35 * n : 33 * n; int lrwork = !COMPLEX ? 0 : 7 * n; int liwork = 5 * n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); std::vector A; // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; if(!perf) { syevx_heevx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_syevx_heevx(evect, erange, uplo, n, hA[b], lda, vl, vu, il, iu, atol, hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(), hIfail[b], hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } syevx_heevx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syevx_heevx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syevx_heevx( STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syevx_heevx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syevx_heevx(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char erangeC = argus.get("erange"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldz = argus.get("ldz", 
n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stW = argus.get("strideW", n); rocblas_stride stZ = argus.get("strideZ", ldz * n); rocblas_stride stF = argus.get("strideF", n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); S abstol = S(argus.get("abstol", 0)); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_W = n; size_t size_Z = size_t(ldz) * n; size_t size_ifail = n; size_t size_WRes = (argus.unit_check || argus.norm_check) ? size_W : 0; size_t size_ZRes = (argus.unit_check || argus.norm_check) ? size_Z : 0; size_t size_ifailRes = (argus.unit_check || argus.norm_check) ? 
size_ifail : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || (evect != rocblas_evect_none && ldz < n) || bc < 0 || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (il < 1 || iu < 0)) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu)))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syevx_heevx( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_syevx_heevx( STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory 
allocations (all cases) // host host_strided_batch_vector hNev(1, 1, 1, bc); host_strided_batch_vector hNevRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWres(size_WRes, 1, stW, bc); host_strided_batch_vector hIfail(size_ifail, 1, stF, bc); host_strided_batch_vector hIfailRes(size_ifailRes, 1, stF, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); // device device_strided_batch_vector dNev(1, 1, 1, bc); device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dIfail(size_ifail, 1, stF, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); CHECK_HIP_ERROR(dNev.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); if(size_ifail) CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hZ(size_Z, 1, bc); host_batch_vector hZRes(size_ZRes, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dZ(size_Z, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevx_heevx_getError(handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc, hA, hNev, hNevRes, hW, hWres, hZ, hZRes, hIfail, hIfailRes, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevx_heevx_getPerfData(handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, 
bc, hA, hNev, hW, hZ, hIfail, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hZ(size_Z, 1, stZ, bc); host_strided_batch_vector hZRes(size_ZRes, 1, stZ, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dZ(size_Z, 1, stZ, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevx_heevx_getError(handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc, hA, hNev, hNevRes, hW, hWres, hZ, hZRes, hIfail, hIfailRes, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevx_heevx_getPerfData(handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc, hA, hNev, hW, hZ, hIfail, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "vl", "vu", "il", "iu", "abstol", "strideW", "ldz", "strideF", "batch_c"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, vl, vu, il, iu, abstol, stW, 
ldz, stF, bc); } else if(STRIDED) { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "strideA", "vl", "vu", "il", "iu", "abstol", "strideW", "ldz", "strideZ", "strideF", "batch_c"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, stA, vl, vu, il, iu, abstol, stW, ldz, stZ, stF, bc); } else { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "vl", "vu", "il", "iu", "abstol", "ldz"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, vl, vu, il, iu, abstol, ldz); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYEVX_HEEVX(...) \ extern template void testing_syevx_heevx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYEVX_HEEVX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_sygsx_hegsx.hpp000066400000000000000000000550001436600607200241070ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void sygsx_hegsx_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, nullptr, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, rocblas_eform(0), uplo, n, dA, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, 0, (T) nullptr, lda, stA, dB, ldb, stB, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, 0, dA, lda, stA, (T) nullptr, ldb, stB, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) 
EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, 0), rocblas_status_success); } template void testing_sygsx_hegsx_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_eform itype = rocblas_eform_ax; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_int ldb = 1; rocblas_stride stB = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); // check bad arguments sygsx_hegsx_checkBadArgs(handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); // check bad arguments sygsx_hegsx_checkBadArgs(handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc); } } template void sygsx_hegsx_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& M, const bool test) { if(CPU) { rocblas_int info; const rocblas_int ldu = n; host_strided_batch_vector U(n * n, 1, n * n, bc); rocblas_init(hA, true); rocblas_init(U, true); for(rocblas_int b = 0; b < bc; ++b) { // for testing purposes, we start with the reduced matrix M of the standard equivalent problem. 
// Then we construct the generalized pair (A, B) from there for(rocblas_int i = 0; i < n; i++) { // scale matrices and set hA = M (symmetric/hermitian), hB = U (upper triangular) or hB = U' for(rocblas_int j = i; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 100; U[b][i + j * ldu] = std::real(U[b][i + j * ldu]) / 100 + 1; hB[b][i + j * ldb] = U[b][i + j * ldu]; } else { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); U[b][i + j * ldu] = (U[b][i + j * ldu] - 5) / 100; if(uplo == rocblas_fill_upper) { hB[b][i + j * ldb] = U[b][i + j * ldu]; hB[b][j + i * ldb] = 0; } else { hB[b][j + i * ldb] = sconj(U[b][i + j * ldu]); hB[b][i + j * ldb] = 0; } } } } // store M = hA for implicit testing if(test) { for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < n; j++) M[b][i + j * lda] = hA[b][i + j * lda]; } T one = T(1); if(itype == rocblas_eform_ax) { // form A = U' M U cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trmm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } else { // form A = inv(U) M inv(U') cpu_trsm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trsm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygsx_hegsx_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hARes, Th& hB, 
double* max_err) { constexpr bool VERIFY_IMPLICIT_TEST = false; host_strided_batch_vector M(lda * n, 1, lda * n, bc); // input data initialization sygsx_hegsx_initData(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, M, true); // execute computations // use verify_implicit_test to check correctness of the implicit test using // CPU lapack if(!VERIFY_IMPLICIT_TEST) { // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); } else { // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { memcpy(hARes[b], hA[b], lda * n * sizeof(T)); SYGST ? cpu_sygst_hegst(itype, uplo, n, hARes[b], lda, hB[b], ldb) : cpu_sygs2_hegs2(itype, uplo, n, hARes[b], lda, hB[b], ldb); } } // error is ||M - hARes|| / ||M|| // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(uplo == rocblas_fill_upper) err = norm_error_upperTr('F', n, n, lda, M[b], hARes[b]); else err = norm_error_lowerTr('F', n, n, lda, M[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } template void sygsx_hegsx_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { host_strided_batch_vector M(lda * n, 1, lda * n, bc); if(!perf) { sygsx_hegsx_initData(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, M, false); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { SYGST ? 
cpu_sygst_hegst(itype, uplo, n, hA[b], lda, hB[b], ldb) : cpu_sygs2_hegs2(itype, uplo, n, hA[b], lda, hB[b], ldb); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygsx_hegsx_initData(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, M, false); // cold calls for(int iter = 0; iter < 2; iter++) { sygsx_hegsx_initData(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, M, false); CHECK_ROCBLAS_ERROR(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygsx_hegsx_initData(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, M, false); start = get_time_us_sync(stream); rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygsx_hegsx(Arguments& argus) { // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? 
stB : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // 
memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygsx_hegsx_getError(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hARes, hB, &max_error); // collect performance data if(argus.timing) sygsx_hegsx_getPerfData( handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygsx_hegsx_getError(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hARes, hB, &max_error); // collect performance data if(argus.timing) sygsx_hegsx_getPerfData( handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, 
argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "uplo", "n", "lda", "ldb", "batch_c"); rocsolver_bench_output(itypeC, uploC, n, lda, ldb, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "uplo", "n", "lda", "strideA", "ldb", "strideB", "batch_c"); rocsolver_bench_output(itypeC, uploC, n, lda, stA, ldb, stB, bc); } else { rocsolver_bench_output("itype", "uplo", "n", "lda", "ldb"); rocsolver_bench_output(itypeC, uploC, n, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGSX_HEGSX(...) \ extern template void testing_sygsx_hegsx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGSX_HEGSX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_sygv_hegv.hpp000066400000000000000000000751431436600607200235470ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void sygv_hegv_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, U dD, const rocblas_stride stD, U dE, const rocblas_stride stE, rocblas_int* dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, nullptr, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, rocblas_eform(0), evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, rocblas_evect(0), uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, rocblas_evect_tridiagonal, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), 
rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, (U) nullptr, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, (U) nullptr, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (U) nullptr, stD, (U) nullptr, stE, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void testing_sygv_hegv_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stD = 1; rocblas_stride stE = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); 
CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygv_hegv_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygv_hegv_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc); } } template void sygv_hegv_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_init(hA, true); rocblas_init(hB, false); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; hB[b][i + j * ldb] = std::real(hB[b][i + j * ldb]) + 400; } else { hA[b][i + j * lda] -= 4; } } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / 
n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygv_hegv_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dD, const rocblas_stride stD, Ud& dE, const rocblas_stride stE, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Th& hB, Uh& hD, Uh& hDRes, Vh& hInfo, Vh& hInfoRes, double* max_err, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); rocblas_int lrwork = (COMPLEX ? 
3 * n - 2 : 0); std::vector work(lwork); std::vector rwork(lrwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); // input data initialization sygv_hegv_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc)); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, rwork.data(), hInfo[b]); } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved. // We do test with indefinite matrices B). // check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hD - hDRes|| / ||hD|| // using frobenius norm if(hInfoRes[b][0] == 0) { err = norm_error('F', 1, n, 1, hD[b], hDRes[b]); *max_err = err > *max_err ? 
err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfoRes[b][0] == 0) { T alpha = 1; T beta = 0; // hARes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, n, alpha, B[b], ldb, hARes[b], lda, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // move B*x into hARes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < n; j++) hARes[b][i + j * lda] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void sygv_hegv_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dD, const rocblas_stride stD, Ud& dE, const rocblas_stride stE, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hD, Vh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); rocblas_int lrwork = (COMPLEX ? 
3 * n - 2 : 0); std::vector work(lwork); std::vector rwork(lrwork); host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); if(!perf) { sygv_hegv_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, rwork.data(), hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygv_hegv_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygv_hegv_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygv_hegv_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygv_hegv(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = 
argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stD = argus.get("strideD", n); rocblas_stride stE = argus.get("strideE", n); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stDRes = (argus.unit_check || argus.norm_check) ? stD : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_D = size_t(n); size_t size_E = size_D; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_DRes = (argus.unit_check || argus.norm_check) ? 
size_D : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hDRes(size_DRes, 1, stDRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); // device device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); 
if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygv_hegv_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hARes, hB, hD, hDRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygv_hegv_getPerfData( handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygv_hegv_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, 
stB, dD, stD, dE, stE, dInfo, bc, hA, hARes, hB, hD, hDRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygv_hegv_getPerfData( handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideD", "strideE", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stD, stE, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideA", "strideB", "strideD", "strideE", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stA, stB, stD, stE, bc); } else { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGV_HEGV(...) 
\ extern template void testing_sygv_hegv<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGV_HEGV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_sygvd_hegvd.hpp000066400000000000000000000770721436600607200240620ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void sygvd_hegvd_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, U dD, const rocblas_stride stD, U dE, const rocblas_stride stE, rocblas_int* dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, nullptr, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, rocblas_eform(0), evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, rocblas_evect(0), uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, rocblas_evect_tridiagonal, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, 
dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, (U) nullptr, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, (U) nullptr, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (U) nullptr, stD, (U) nullptr, stE, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void testing_sygvd_hegvd_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stD = 1; rocblas_stride 
stE = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvd_hegvd_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvd_hegvd_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc); } } template void sygvd_hegvd_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_init(hA, true); rocblas_init(hB, false); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; hB[b][i + j * ldb] = 
std::real(hB[b][i + j * ldb]) + 400; } else { hA[b][i + j * lda] -= 4; } } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygvd_hegvd_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dD, const rocblas_stride stD, Ud& dE, const rocblas_stride stE, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Th& hB, Uh& hD, Uh& hDRes, Vh& hInfo, Vh& hInfoRes, double* max_err, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lrwork, lwork; if(!COMPLEX) { lrwork = (evect == rocblas_evect_none ? 2 * n + 1 : 1 + 6 * n + 2 * n * n); lwork = 0; } else { lrwork = (evect == rocblas_evect_none ? n : 1 + 5 * n + 2 * n * n); lwork = (evect == rocblas_evect_none ? n + 1 : 2 * n + n * n); } int liwork = (evect == rocblas_evect_none ? 
1 : 3 + 5 * n); std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); // input data initialization sygvd_hegvd_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc)); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvd_hegvd(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, hInfo[b]); } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved. // We do test with indefinite matrices B). // check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hD - hDRes|| / ||hD|| // using frobenius norm if(hInfoRes[b][0] == 0) { err = norm_error('F', 1, n, 1, hD[b], hDRes[b]); *max_err = err > *max_err ? 
err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfoRes[b][0] == 0) { T alpha = 1; T beta = 0; // hARes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, n, alpha, B[b], ldb, hARes[b], lda, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // move B*x into hARes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < n; j++) hARes[b][i + j * lda] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void sygvd_hegvd_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dD, const rocblas_stride stD, Ud& dE, const rocblas_stride stE, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hD, Vh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lrwork, lwork; if(!COMPLEX) { lrwork = (evect == rocblas_evect_none ? 
2 * n + 1 : 1 + 6 * n + 2 * n * n); lwork = 0; } else { lrwork = (evect == rocblas_evect_none ? n : 1 + 5 * n + 2 * n * n); lwork = (evect == rocblas_evect_none ? n + 1 : 2 * n + n * n); } int liwork = (evect == rocblas_evect_none ? 1 : 3 + 5 * n); std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); if(!perf) { sygvd_hegvd_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvd_hegvd(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygvd_hegvd_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygvd_hegvd_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygvd_hegvd_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, 
dE.data(), stE, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygvd_hegvd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stD = argus.get("strideD", n); rocblas_stride stE = argus.get("strideE", n); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stDRes = (argus.unit_check || argus.norm_check) ? stD : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_D = size_t(n); size_t size_E = size_D; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygvd_hegvd( STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hDRes(size_DRes, 1, stDRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); // device device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); 
if(size_E) CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvd_hegvd_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hARes, hB, hD, hDRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvd_hegvd_getPerfData( handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || 
argus.norm_check) sygvd_hegvd_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hARes, hB, hD, hDRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvd_hegvd_getPerfData( handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideD", "strideE", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stD, stE, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideA", "strideB", "strideD", "strideE", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stA, stB, stD, stE, bc); } else { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGVD_HEGVD(...) 
\ extern template void testing_sygvd_hegvd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGVD_HEGVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_sygvdx_hegvdx_inplace.hpp000066400000000000000000001141071436600607200261240ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp"

// Calls rocsolver_sygvdx_hegvdx_inplace once per deliberately invalid argument and checks that
// the returned rocblas_status matches the expected error code. All other arguments of each call
// are the (valid) ones received from the caller.
//
// The checks that pass a batch count of -1 or 0 are only executed when STRIDED is true.
//
// NOTE(review): the template parameter list appears to have been stripped from this text
// (STRIDED, T, S and U are used below without a visible declaration) -- restore it from the
// pristine file before building.
template void sygvdx_hegvdx_inplace_checkBadArgs(const rocblas_handle handle,
                                                 const rocblas_eform itype,
                                                 const rocblas_evect evect,
                                                 const rocblas_erange erange,
                                                 const rocblas_fill uplo,
                                                 const rocblas_int n,
                                                 T dA,
                                                 const rocblas_int lda,
                                                 const rocblas_stride stA,
                                                 T dB,
                                                 const rocblas_int ldb,
                                                 const rocblas_stride stB,
                                                 const S vl,
                                                 const S vu,
                                                 const rocblas_int il,
                                                 const rocblas_int iu,
                                                 const S abstol,
                                                 rocblas_int* hNev,
                                                 U dW,
                                                 const rocblas_stride stW,
                                                 rocblas_int* dInfo,
                                                 const rocblas_int bc)
{
    // handle
    // (a null handle must be reported as invalid_handle)
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(STRIDED, nullptr, itype, evect, erange,
                                                          uplo, n, dA, lda, stA, dB, ldb, stB, vl,
                                                          vu, il, iu, abstol, hNev, dW, stW, dInfo,
                                                          bc),
                          rocblas_status_invalid_handle);

    // values
    // (each enum argument in turn is replaced by a value the call is expected to reject:
    //  itype, evect, erange, uplo)
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(
                              STRIDED, handle, rocblas_eform(0), evect, erange, uplo, n, dA, lda,
                              stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype,
                                                          rocblas_evect_tridiagonal, erange, uplo,
                                                          n, dA, lda, stA, dB, ldb, stB, vl, vu, il,
                                                          iu, abstol, hNev, dW, stW, dInfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(
                              STRIDED, handle, itype, evect, rocblas_erange(0), uplo, n, dA, lda,
                              stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(
                              STRIDED, handle, itype, evect, erange, rocblas_fill_full, n, dA, lda,
                              stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc),
                          rocblas_status_invalid_value);

    // sizes (only check batch_count if applicable)
    // (batch count of -1)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(
                                  STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB,
                                  ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, -1),
                              rocblas_status_invalid_size);

    // pointers
    // (one pointer at a time is replaced by nullptr: dA, dB, hNev, dW, dInfo)
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(
                              STRIDED, handle, itype, evect, erange, uplo, n, (T) nullptr, lda, stA,
                              dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange,
                                                          uplo, n, dA, lda, stA, (T) nullptr, ldb,
                                                          stB, vl, vu, il, iu, abstol, hNev, dW,
                                                          stW, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange,
                                                          uplo, n, dA, lda, stA, dB, ldb, stB, vl,
                                                          vu, il, iu, abstol, (rocblas_int*)nullptr,
                                                          dW, stW, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(
                              STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb,
                              stB, vl, vu, il, iu, abstol, hNev, (U) nullptr, stW, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(
                              STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb,
                              stB, vl, vu, il, iu, abstol, hNev, dW, stW, (rocblas_int*)nullptr, bc),
                          rocblas_status_invalid_pointer);

    // quick return with invalid pointers
    // (n = 0 with dA, dB and dW null; hNev and dInfo are still the valid pointers)
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange,
                                                          uplo, 0, (T) nullptr, lda, stA,
                                                          (T) nullptr, ldb, stB, vl, vu, il, iu,
                                                          abstol, hNev, (U) nullptr, stW, dInfo, bc),
                          rocblas_status_success);

    // quick return with zero batch_count if applicable
    // (batch count of 0 with hNev and dInfo null)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(
                                  STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB,
                                  ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, dW, stW,
                                  (rocblas_int*)nullptr, 0),
                              rocblas_status_success);
}

template void testing_sygvdx_hegvdx_inplace_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stW = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_original; rocblas_erange erange = rocblas_erange_value; rocblas_fill uplo = rocblas_fill_upper; S vl = 0.0; S vu = 1.0; rocblas_int il = 0; rocblas_int iu = 0; S abstol = 0; if(BATCHED) { // memory allocations host_strided_batch_vector hNev(1, 1, 1, 1); device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvdx_hegvdx_inplace_checkBadArgs( handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNev.data(), dW.data(), stW, dInfo.data(), bc); } else { // memory allocations host_strided_batch_vector hNev(1, 1, 1, 1); device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvdx_hegvdx_inplace_checkBadArgs( handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), 
ldb, stB, vl, vu, il, iu, abstol, hNev.data(), dW.data(), stW, dInfo.data(), bc); } } template void sygvdx_hegvdx_inplace_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_int ldu = n; host_strided_batch_vector U(n * n, 1, n * n, bc); rocblas_init(hA, true); rocblas_init(U, true); for(rocblas_int b = 0; b < bc; ++b) { // for testing purposes, we start with a reduced matrix M for the standard equivalent problem // with spectrum in a desired range (-20, 20). Then we construct the generalized pair // (A, B) from there. for(rocblas_int i = 0; i < n; i++) { // scale matrices and set hA = M (symmetric/hermitian), hB = U (upper triangular) for(rocblas_int j = i; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 10; U[b][i + j * ldu] = std::real(U[b][i + j * ldu]) / 100 + 1; hB[b][i + j * ldb] = U[b][i + j * ldu]; } else { if(j == i + 1) { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); } else hA[b][j + i * lda] = hA[b][i + j * lda] = 0; U[b][i + j * ldu] = (U[b][i + j * ldu] - 5) / 100; hB[b][i + j * ldb] = U[b][i + j * ldu]; hB[b][j + i * ldb] = 0; U[b][j + i * ldu] = 0; } } if(i == n / 4 || i == n / 2 || i == n - 1 || i == n / 7 || i == n / 5 || i == n / 3) hA[b][i + i * lda] *= -1; } // form B = U' U T one = T(1); cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hB[b], ldb); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect 
the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } if(itype == rocblas_eform_ax) { // form A = U' M U cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trmm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } else { // form A = inv(U) M inv(U') cpu_trsm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trsm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygvdx_hegvdx_inplace_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Vh& hNevRes, Ud& dW, const rocblas_stride stW, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Th& hB, Vh& hNev, Uh& hW, Uh& hWRes, Vh& hInfo, Vh& 
hInfoRes, double* max_err, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 2 * n : 8 * n); int lrwork = (COMPLEX ? 7 * n : 0); int liwork = 5 * n; int ldz = n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); std::vector Z(ldz * n); std::vector ifail(n); // input data initialization sygvdx_hegvdx_inplace_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dInfo.data(), bc)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 2 * get_safemin() : abstol; for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol, hNev[b], hW[b], Z.data(), ldz, work.data(), lwork, rwork.data(), iwork.data(), ifail.data(), hInfo[b]); } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved. // We do test with indefinite matrices B). 
// check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; // Check number of returned eigenvalues for(rocblas_int b = 0; b < bc; ++b) if(hNev[b][0] != hNevRes[b][0]) *max_err += 1; double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hInfo[b][0] == 0) { err = norm_error('F', 1, hNev[b][0], 1, hW[b], hWRes[b]); *max_err = err > *max_err ? err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfo[b][0] == 0) { T alpha = 1; T beta = 0; // hARes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, hNev[b][0], alpha, B[b], ldb, hARes[b], lda, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // move B*x into hARes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < hNev[b][0]; j++) hARes[b][i + j * lda] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, hNev[b][0], lda, hA[b], hARes[b]); *max_err = err > *max_err ? 
err : *max_err; } } } } template void sygvdx_hegvdx_inplace_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Vh& hNevRes, Ud& dW, const rocblas_stride stW, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Vh& hNev, Uh& hW, Vh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 2 * n : 8 * n); int lrwork = (COMPLEX ? 7 * n : 0); int liwork = 5 * n; int ldz = n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); std::vector Z(ldz * n); std::vector ifail(n); // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; if(!perf) { sygvdx_hegvdx_inplace_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol, hNev[b], hW[b], Z.data(), ldz, work.data(), lwork, rwork.data(), iwork.data(), ifail.data(), hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygvdx_hegvdx_inplace_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygvdx_hegvdx_inplace_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygvdx_hegvdx_inplace_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygvdx_hegvdx_inplace(Arguments& argus) { using S = 
decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = argus.get("evect"); char erangeC = argus.get("erange"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stW = argus.get("strideW", n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); S abstol = S(argus.get("abstol", 0)); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stWRes = (argus.unit_check || argus.norm_check) ? 
stW : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_W = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? 
size_W : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0 || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (il < 1 || iu < 0)) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu)))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hNev(1, 1, 1, bc); host_strided_batch_vector hNevRes(1, 1, 1, bc); 
host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWRes(size_WRes, 1, stWRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); // device device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvdx_hegvdx_inplace_getError( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNevRes, dW, stW, dInfo, bc, hA, hARes, hB, hNev, hW, hWRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvdx_hegvdx_inplace_getPerfData( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNevRes, dW, stW, dInfo, bc, hA, hB, hNev, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); 
if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvdx_hegvdx_inplace_getError( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNevRes, dW, stW, dInfo, bc, hA, hARes, hB, hNev, hW, hWRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvdx_hegvdx_inplace_getPerfData( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNevRes, dW, stW, dInfo, bc, hA, hB, hNev, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "vl", "vu", "il", "iu", "abstol", "strideW", "batch_c"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, vl, vu, il, iu, abstol, stW, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "strideA", "strideB", "vl", "vu", "il", "iu", "abstol", "strideW", "batch_c"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, stA, stB, vl, vu, il, iu, abstol, stW, bc); } else { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "vl", "vu", "il", "iu", 
"abstol"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, vl, vu, il, iu, abstol); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time", "gpu_time", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time", "gpu_time"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } rocSOLVER-rocm-5.5.1/clients/include/testing_sygvj_hegvj.hpp000066400000000000000000001042441436600607200240660ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp"

// Calls rocsolver_sygvj_hegvj once per deliberately invalid argument and checks that the
// returned rocblas_status matches the expected error code. All other arguments of each call are
// the (valid) ones received from the caller.
//
// The checks that pass a batch count of -1 or 0 are only executed when STRIDED is true.
//
// NOTE(review): the template parameter list appears to have been stripped from this text
// (STRIDED, T, SS, S and U are used below without a visible declaration) -- restore it from the
// pristine file before building.
template void sygvj_hegvj_checkBadArgs(const rocblas_handle handle,
                                       const rocblas_eform itype,
                                       const rocblas_evect evect,
                                       const rocblas_fill uplo,
                                       const rocblas_int n,
                                       T dA,
                                       const rocblas_int lda,
                                       const rocblas_stride stA,
                                       T dB,
                                       const rocblas_int ldb,
                                       const rocblas_stride stB,
                                       const SS abstol,
                                       S dResidual,
                                       const rocblas_int max_sweeps,
                                       U dSweeps,
                                       S dW,
                                       const rocblas_stride stW,
                                       U dInfo,
                                       const rocblas_int bc)
{
    // handle
    // (a null handle must be reported as invalid_handle)
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, nullptr, itype, evect, uplo, n, dA, lda,
                                                stA, dB, ldb, stB, abstol, dResidual, max_sweeps,
                                                dSweeps, dW, stW, dInfo, bc),
                          rocblas_status_invalid_handle);

    // values
    // (each enum argument in turn is replaced by a value the call is expected to reject:
    //  itype, evect (two different values), uplo)
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, rocblas_eform(0), evect, uplo, n,
                                                dA, lda, stA, dB, ldb, stB, abstol, dResidual,
                                                max_sweeps, dSweeps, dW, stW, dInfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, rocblas_evect(0), uplo, n,
                                                dA, lda, stA, dB, ldb, stB, abstol, dResidual,
                                                max_sweeps, dSweeps, dW, stW, dInfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, rocblas_evect_tridiagonal,
                                                uplo, n, dA, lda, stA, dB, ldb, stB, abstol,
                                                dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, rocblas_fill_full, n,
                                                dA, lda, stA, dB, ldb, stB, abstol, dResidual,
                                                max_sweeps, dSweeps, dW, stW, dInfo, bc),
                          rocblas_status_invalid_value);

    // sizes (only check batch_count if applicable)
    // (batch count of -1)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda,
                                                    stA, dB, ldb, stB, abstol, dResidual,
                                                    max_sweeps, dSweeps, dW, stW, dInfo, -1),
                              rocblas_status_invalid_size);

    // pointers
    // (one pointer at a time is replaced by nullptr: dA, dB, dResidual, dSweeps, dW, dInfo)
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, (T) nullptr,
                                                lda, stA, dB, ldb, stB, abstol, dResidual,
                                                max_sweeps, dSweeps, dW, stW, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda,
                                                stA, (T) nullptr, ldb, stB, abstol, dResidual,
                                                max_sweeps, dSweeps, dW, stW, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda,
                                                stA, dB, ldb, stB, abstol, (S) nullptr, max_sweeps,
                                                dSweeps, dW, stW, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda,
                                                stA, dB, ldb, stB, abstol, dResidual, max_sweeps,
                                                (U) nullptr, dW, stW, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda,
                                                stA, dB, ldb, stB, abstol, dResidual, max_sweeps,
                                                dSweeps, (S) nullptr, stW, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda,
                                                stA, dB, ldb, stB, abstol, dResidual, max_sweeps,
                                                dSweeps, dW, stW, (U) nullptr, bc),
                          rocblas_status_invalid_pointer);

    // quick return with invalid pointers
    // (n = 0 with dA, dB and dW null; dResidual, dSweeps and dInfo are still the valid pointers)
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, 0, (T) nullptr,
                                                lda, stA, (T) nullptr, ldb, stB, abstol, dResidual,
                                                max_sweeps, dSweeps, (S) nullptr, stW, dInfo, bc),
                          rocblas_status_success);

    // quick return with zero batch_count if applicable
    // (batch count of 0 with dResidual, dSweeps and dInfo null)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda,
                                                    stA, dB, ldb, stB, abstol, (S) nullptr,
                                                    max_sweeps, (U) nullptr, dW, stW, (U) nullptr,
                                                    0),
                              rocblas_status_success);
}

template void testing_sygvj_hegvj_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stW = 1; rocblas_stride stE = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_upper; S abstol = 0; rocblas_int max_sweeps = 100; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvj_hegvj_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); } else { // memory 
allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvj_hegvj_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); } } template void sygvj_hegvj_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_init(hA, true); rocblas_init(hB, false); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; hB[b][i + j * ldb] = std::real(hB[b][i + j * ldb]) + 400; } else { hA[b][i + j * lda] -= 4; } } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } // store A and B for testing purposes if(test && evect 
!= rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygvj_hegvj_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S abstol, Sd& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Sd& dW, const rocblas_stride stW, Id& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Th& hB, Sh& hResidualRes, Ih& hSweepsRes, Sh& hW, Sh& hWRes, Ih& hInfo, Ih& hInfoRes, double* max_err, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; S atol = (abstol <= 0) ? get_epsilon() : abstol; rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); rocblas_int lrwork = (COMPLEX ? 
3 * n - 2 : 0); std::vector work(lwork); std::vector rwork(lrwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); // input data initialization sygvj_hegvj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygvj_hegvj( STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc)); CHECK_HIP_ERROR(hResidualRes.transfer_from(dResidual)); CHECK_HIP_ERROR(hSweepsRes.transfer_from(dSweeps)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hW[b], work.data(), lwork, rwork.data(), hInfo[b]); } // (We expect the used input matrices to always converge) // check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; // Also check validity of residual for(rocblas_int b = 0; b < bc; ++b) if(hInfoRes[b][0] == 0 && hResidualRes[b][0] < 0) *max_err += 1; // Also check validity of sweeps for(rocblas_int b = 0; b < bc; ++b) if(hInfoRes[b][0] == 0 && (hSweepsRes[b][0] < 0 || hSweepsRes[b][0] > max_sweeps)) *max_err += 1; double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hInfoRes[b][0] == 0) { err = norm_error('F', 1, n, 1, hW[b], hWRes[b]); *max_err = err > *max_err ? 
err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfoRes[b][0] == 0) { T alpha = 1; T beta = 0; // hARes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, n, alpha, B[b], ldb, hARes[b], lda, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // move B*x into hARes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < n; j++) hARes[b][i + j * lda] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void sygvj_hegvj_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S abstol, Sd& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Sd& dW, const rocblas_stride stW, Id& dInfo, const rocblas_int bc, Th& hA, Th& hB, Sh& hW, Ih& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); rocblas_int lrwork = (COMPLEX ? 
3 * n - 2 : 0); std::vector work(lwork); std::vector rwork(lrwork); host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); if(!perf) { sygvj_hegvj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hW[b], work.data(), lwork, rwork.data(), hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygvj_hegvj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygvj_hegvj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygvj_hegvj( STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygvj_hegvj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygvj_hegvj(Arguments& argus) { using S = decltype(std::real(T{})); // get 
arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stW = argus.get("strideD", n); S abstol = S(argus.get("abstol", 0)); rocblas_int max_sweeps = argus.get("max_sweeps", 100); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stWRes = (argus.unit_check || argus.norm_check) ? stW : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_W = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? 
size_W : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygvj_hegvj( STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygvj_hegvj( STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hResidualRes(1, 1, 1, bc); host_strided_batch_vector hSweepsRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWRes(size_WRes, 1, stWRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); // device 
device_strided_batch_vector dResidual(1, 1, 1, bc); device_strided_batch_vector dSweeps(1, 1, 1, bc); device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvj_hegvj_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hARes, hB, hResidualRes, hSweepsRes, hW, hWRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvj_hegvj_getPerfData(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hB, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) 
CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvj_hegvj_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hARes, hB, hResidualRes, hSweepsRes, hW, hWRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvj_hegvj_getPerfData(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hB, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "abstol", "max_sweeps", "strideW", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, abstol, max_sweeps, stW, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideA", "strideB", "abstol", "max_sweeps", "strideW", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stA, stB, abstol, max_sweeps, stW, bc); } else { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "abstol", "max_sweeps"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, abstol, max_sweeps); } rocsolver_bench_header("Results:"); if(argus.norm_check) { 
rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGVJ_HEGVJ(...) \ extern template void testing_sygvj_hegvj<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGVJ_HEGVJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_sygvx_hegvx.hpp000066400000000000000000001266761436600607200241370ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void sygvx_hegvx_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, rocblas_int* dNev, U dW, const rocblas_stride stW, T dZ, const rocblas_int ldz, const rocblas_stride stZ, rocblas_int* dIfail, const rocblas_stride stF, rocblas_int* dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, nullptr, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, 
dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, rocblas_eform(0), evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, rocblas_evect_tridiagonal, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, rocblas_erange(0), uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, 
dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, (U) nullptr, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, (T) nullptr, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, (rocblas_int*)nullptr, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, vl, vu, il, iu, abstol, dNev, (U) nullptr, stW, (T) nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, dW, stW, dZ, ldz, stZ, dIfail, stF, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void testing_sygvx_hegvx_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldz = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stW = 1; rocblas_stride stZ = 1; 
rocblas_stride stF = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_original; rocblas_erange erange = rocblas_erange_value; rocblas_fill uplo = rocblas_fill_upper; S vl = 0.0; S vu = 1.0; rocblas_int il = 0; rocblas_int iu = 0; S abstol = 0; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_batch_vector dZ(1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvx_hegvx_checkBadArgs(handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dZ(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvx_hegvx_checkBadArgs(handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc); } } template void sygvx_hegvx_initData(const rocblas_handle handle, const rocblas_eform 
itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_int ldu = n; host_strided_batch_vector U(n * n, 1, n * n, bc); rocblas_init(hA, true); rocblas_init(U, true); for(rocblas_int b = 0; b < bc; ++b) { // for testing purposes, we start with a reduced matrix M for the standard equivalent problem // with spectrum in a desired range (-20, 20). Then we construct the generalized pair // (A, B) from there. for(rocblas_int i = 0; i < n; i++) { // scale matrices and set hA = M (symmetric/hermitian), hB = U (upper triangular) for(rocblas_int j = i; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 10; U[b][i + j * ldu] = std::real(U[b][i + j * ldu]) / 100 + 1; hB[b][i + j * ldb] = U[b][i + j * ldu]; } else { if(j == i + 1) { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); } else hA[b][j + i * lda] = hA[b][i + j * lda] = 0; U[b][i + j * ldu] = (U[b][i + j * ldu] - 5) / 100; hB[b][i + j * ldb] = U[b][i + j * ldu]; hB[b][j + i * ldb] = 0; U[b][j + i * ldu] = 0; } } if(i == n / 4 || i == n / 2 || i == n - 1 || i == n / 7 || i == n / 5 || i == n / 3) hA[b][i + i * lda] *= -1; } // form B = U' U T one = T(1); cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hB[b], ldb); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; 
i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } if(itype == rocblas_eform_ax) { // form A = U' M U cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trmm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } else { // form A = inv(U) M inv(U') cpu_trsm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trsm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygvx_hegvx_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Vd& dNev, Ud& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Vd& dIfail, const rocblas_stride stF, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Vh& hNev, Vh& hNevRes, Uh& hW, Uh& hWRes, Th& hZ, Th& hZRes, Vh& hIfail, Vh& hIfailRes, Vh& hInfo, Vh& hInfoRes, double* max_err, const bool singular) { 
constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 2 * n : 8 * n); int lrwork = (COMPLEX ? 7 * n : 0); int liwork = 5 * n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); // input data initialization sygvx_hegvx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc)); CHECK_HIP_ERROR(hNevRes.transfer_from(dNev)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) { CHECK_HIP_ERROR(hZRes.transfer_from(dZ)); CHECK_HIP_ERROR(hIfailRes.transfer_from(dIfail)); } // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 2 * get_safemin() : abstol; for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol, hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(), hIfail[b], hInfo[b]); } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved. // We do test with indefinite matrices B). 
// check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; // Check number of returned eigenvalues for(rocblas_int b = 0; b < bc; ++b) if(hNev[b][0] != hNevRes[b][0]) *max_err += 1; double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hInfo[b][0] == 0) { err = norm_error('F', 1, hNev[b][0], 1, hW[b], hWRes[b]); *max_err = err > *max_err ? err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfo[b][0] == 0) { // check ifail err = 0; for(int j = 0; j < hNev[b][0]; j++) { if(hIfailRes[b][j] != 0) err++; } *max_err = err > *max_err ? err : *max_err; T alpha = 1; T beta = 0; // hZRes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, hNev[b][0], alpha, B[b], ldb, hZRes[b], ldz, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hZRes[b] + j * ldz, 1, beta, hA[b] + j * lda, 1); } // move B*x into hZRes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < hNev[b][0]; j++) hZRes[b][i + j * ldz] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hZRes|| / ||hA|| // using frobenius norm err = norm_error('F', n, hNev[b][0], lda, hA[b], hZRes[b], ldz); *max_err = err > *max_err ? 
err : *max_err; } else if(hInfo[b][0] <= n) { // check ifail err = 0; for(int j = 0; j < hInfo[b][0]; j++) { if(hIfailRes[b][j] == 0) err++; } *max_err = err > *max_err ? err : *max_err; } } } } template void sygvx_hegvx_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Vd& dNev, Ud& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Vd& dIfail, const rocblas_stride stF, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Vh& hNev, Uh& hW, Th& hZ, Vh& hIfail, Vh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 2 * n : 8 * n); int lrwork = (COMPLEX ? 7 * n : 0); int liwork = 5 * n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; if(!perf) { sygvx_hegvx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol, hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(), hIfail[b], hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygvx_hegvx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygvx_hegvx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygvx_hegvx( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygvx_hegvx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygvx_hegvx(Arguments& argus) { using S = 
decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = argus.get("evect"); char erangeC = argus.get("erange"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_int ldz = argus.get("ldz", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stW = argus.get("strideW", n); rocblas_stride stF = argus.get("strideF", n); rocblas_stride stZ = argus.get("strideZ", ldz * n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); S abstol = S(argus.get("abstol", 0)); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stWRes = (argus.unit_check || argus.norm_check) ? stW : 0; rocblas_stride stZRes = (argus.unit_check || argus.norm_check) ? stZ : 0; rocblas_stride stFRes = (argus.unit_check || argus.norm_check) ? 
stF : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_W = size_t(n); size_t size_Z = size_t(ldz) * n; size_t size_ifail = size_W; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? size_W : 0; size_t size_ZRes = (argus.unit_check || argus.norm_check) ? size_Z : 0; size_t size_ifailRes = (argus.unit_check || argus.norm_check) ? 
size_ifail : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || (evect != rocblas_evect_none && ldz < n) || bc < 0 || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (il < 1 || iu < 0)) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu)))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygvx_hegvx( STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygvx_hegvx( STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } 
CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hNev(1, 1, 1, bc); host_strided_batch_vector hNevRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWRes(size_WRes, 1, stWRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); host_strided_batch_vector hIfail(size_ifail, 1, stF, bc); host_strided_batch_vector hIfailRes(size_ifailRes, 1, stFRes, bc); // device device_strided_batch_vector dNev(1, 1, 1, bc); device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); device_strided_batch_vector dIfail(size_ifail, 1, stF, bc); CHECK_HIP_ERROR(dNev.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_ifail) CHECK_HIP_ERROR(dIfail.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hZ(size_Z, 1, bc); host_batch_vector hZRes(size_ZRes, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_batch_vector dZ(size_Z, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvx_hegvx_getError(handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc, hA, hB, hNev, hNevRes, hW, 
hWRes, hZ, hZRes, hIfail, hIfailRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvx_hegvx_getPerfData( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc, hA, hB, hNev, hW, hZ, hIfail, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hZ(size_Z, 1, stZ, bc); host_strided_batch_vector hZRes(size_ZRes, 1, stZRes, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dZ(size_Z, 1, stZ, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvx_hegvx_getError(handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc, hA, hB, hNev, hNevRes, hW, hWRes, hZ, hZRes, hIfail, hIfailRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvx_hegvx_getPerfData( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc, hA, hB, hNev, hW, hZ, hIfail, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, 
argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "vl", "vu", "il", "iu", "abstol", "strideW", "ldz", "strideF", "batch_c"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, vl, vu, il, iu, abstol, stW, ldz, stF, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "strideA", "strideB", "vl", "vu", "il", "iu", "abstol", "strideW", "ldz", "strideZ", "strideF", "batch_c"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, stA, stB, vl, vu, il, iu, abstol, stW, ldz, stZ, stF, bc); } else { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "vl", "vu", "il", "iu", "abstol", "ldz"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, vl, vu, il, iu, abstol, ldz); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time", "gpu_time", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time", "gpu_time"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGVX_HEGVX(...) 
\ extern template void testing_sygvx_hegvx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGVX_HEGVX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_sytf2_sytrf.hpp000066400000000000000000000554271436600607200240470ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void sytf2_sytrf_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, nullptr, uplo, n, dA, lda, stA, dIpiv, stP, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, rocblas_fill_full, n, dA, lda, stA, dIpiv, stP, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA, lda, stA, dIpiv, stP, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T) nullptr, lda, stA, dIpiv, stP, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA, lda, stA, (U) nullptr, stP, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA, lda, stA, dIpiv, stP, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return 
with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, 0, (T) nullptr, lda, stA, (U) nullptr, stP, dinfo, bc), rocblas_status_success); if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA, lda, stA, dIpiv, stP, (U) nullptr, 0), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA, lda, stA, dIpiv, stP, dinfo, 0), rocblas_status_success); } template void testing_sytf2_sytrf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sytf2_sytrf_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sytf2_sytrf_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } } template void sytf2_sytrf_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < 
n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // shuffle rows to test pivoting // always the same permuation for debugging purposes for(rocblas_int i = 0; i < n / 2; i++) { for(rocblas_int j = 0; j < n; j++) { tmp = hA[b][i + j * lda]; hA[b][i + j * lda] = hA[b][n - 1 - i + j * lda]; hA[b][n - 1 - i + j * lda] = tmp; } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int j = n / 4 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[b][i + j * lda] = 0; hA[b][j + i * lda] = 0; } j = n / 2 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[b][i + j * lda] = 0; hA[b][j + i * lda] = 0; } j = n - 1 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[b][i + j * lda] = 0; hA[b][j + i * lda] = 0; } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void sytf2_sytrf_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hIpivRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { int lwork = (SYTRF ? 
64 * n : 0); std::vector work(lwork); // input data initialization sytf2_sytrf_initData(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hIpivRes.transfer_from(dIpiv)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { SYTRF ? cpu_sytrf(uplo, n, hA[b], lda, hIpiv[b], work.data(), lwork, hInfo[b]) : cpu_sytf2(uplo, n, hA[b], lda, hIpiv[b], hInfo[b]); } // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; // also check pivoting (count the number of incorrect pivots) err = 0; for(rocblas_int i = 0; i < n; ++i) if(hIpiv[b][i] != hIpivRes[b][i]) err++; *max_err = err > *max_err ? err : *max_err; } // also check info err = 0; for(rocblas_int b = 0; b < bc; ++b) if(hInfo[b][0] != hInfoRes[b][0]) err++; *max_err += err; } template void sytf2_sytrf_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { int lwork = (SYTRF ? 
64 * n : 0); std::vector work(lwork); if(!perf) { sytf2_sytrf_initData(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { SYTRF ? cpu_sytrf(uplo, n, hA[b], lda, hIpiv[b], work.data(), lwork, hInfo[b]) : cpu_sytf2(uplo, n, hA[b], lda, hIpiv[b], hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sytf2_sytrf_initData(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sytf2_sytrf_initData(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sytf2_sytrf_initData(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sytf2_sytrf(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", n); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = 
argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stPRes = (argus.unit_check || argus.norm_check) ? stP : 0; // check non-supported values if(uplo == rocblas_fill_full) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_PRes = (argus.unit_check || argus.norm_check) ? 
size_P : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY( rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hIpivRes(size_PRes, 1, stPRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, 
handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sytf2_sytrf_getError(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hIpivRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sytf2_sytrf_getPerfData( handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hIpivRes(size_PRes, 1, stPRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sytf2_sytrf_getError(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hIpivRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sytf2_sytrf_getPerfData( handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, 
argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(uploC, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(uploC, n, lda, stA, stP, bc); } else { rocsolver_bench_output("uplo", "n", "lda"); rocsolver_bench_output(uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYTF2_SYTRF(...) \ extern template void testing_sytf2_sytrf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYTF2_SYTRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_sytxx_hetxx.hpp000066400000000000000000000603031436600607200241550ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void sytxx_hetxx_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, S dD, const rocblas_stride stD, S dE, const rocblas_stride stE, U dTau, const rocblas_stride stP, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, nullptr, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, rocblas_fill_full, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T) nullptr, lda, stA, dD, stD, dE, stE, dTau, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA, lda, stA, (S) nullptr, stD, dE, stE, dTau, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA, lda, stA, dD, stD, (S) nullptr, stE, dTau, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, 0, (T) nullptr, lda, stA, (S) nullptr, stD, (S) nullptr, stE, 
(U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, 0), rocblas_status_success); } template void testing_sytxx_hetxx_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stD = 1; rocblas_stride stE = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dTau(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dTau.memcheck()); // check bad arguments sytxx_hetxx_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dTau(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dTau.memcheck()); // check bad arguments sytxx_hetxx_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc); } } template , int> = 0> void sytxx_hetxx_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j || i == j + 1 || i == j - 1) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { 
// now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template , int> = 0> void sytxx_hetxx_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = hA[b][i + j * lda].real() + 400; else if(i == j + 1 || i == j - 1) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void sytxx_hetxx_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Ud& dTau, const rocblas_stride stP, const rocblas_int bc, Th& hA, Th& hARes, Sh& hD, Sh& hE, Uh& hTau, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; std::vector hW(32 * n); // input data initialization sytxx_hetxx_initData(handle, n, dA, lda, bc, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hTau.transfer_from(dTau)); // Reconstruct matrix A from the factorization for implicit testing // A = H(n-1)...H(2)H(1)*T*H(1)'H(2)'...H(n-1)' if upper // A = H(1)H(2)...H(n-1)*T*H(n-1)'...H(2)'H(1)' if lower std::vector v(n); for(rocblas_int b = 0; b < bc; ++b) { T* a = hARes[b]; T* t = hTau[b]; if(uplo == rocblas_fill_lower) { for(rocblas_int i = 0; i < n - 2; ++i) a[i + (n - 1) * lda] = 0; a[(n - 2) + (n - 1) * lda] = a[(n - 1) + (n - 2) * lda]; // for each column for(rocblas_int j = n - 2; j >= 0; --j) { // prepare T and v for(rocblas_int i = 0; i < j - 1; ++i) a[i + j * lda] = 0; if(j > 
0) a[(j - 1) + j * lda] = a[j + (j - 1) * lda]; for(rocblas_int i = j + 2; i < n; ++i) { v[i - j - 1] = a[i + j * lda]; a[i + j * lda] = 0; } v[0] = 1; // apply householder reflector cpu_larf(rocblas_side_left, n - 1 - j, n - j, v.data(), 1, t + j, a + (j + 1) + j * lda, lda, hW.data()); if(COMPLEX) cpu_lacgv(1, t + j, 1); cpu_larf(rocblas_side_right, n - j, n - 1 - j, v.data(), 1, t + j, a + j + (j + 1) * lda, lda, hW.data()); } } else { a[1] = a[lda]; for(rocblas_int i = 2; i < n; ++i) a[i] = 0; // for each column for(rocblas_int j = 1; j <= n - 1; ++j) { // prepare T and v for(rocblas_int i = 0; i < j - 1; ++i) { v[i] = a[i + j * lda]; a[i + j * lda] = 0; } v[j - 1] = 1; if(j < n - 1) a[(j + 1) + j * lda] = a[j + (j + 1) * lda]; for(rocblas_int i = j + 2; i < n; ++i) a[i + j * lda] = 0; // apply householder reflector cpu_larf(rocblas_side_left, j, j + 1, v.data(), 1, t + j - 1, a, lda, hW.data()); if(COMPLEX) cpu_lacgv(1, t + j - 1, 1); cpu_larf(rocblas_side_right, j + 1, j, v.data(), 1, t + j - 1, a, lda, hW.data()); } } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { *max_err = (uplo == rocblas_fill_lower) ? 
norm_error_lowerTr('F', n, n, lda, hA[b], hARes[b]) : norm_error_upperTr('F', n, n, lda, hA[b], hARes[b]); } } template void sytxx_hetxx_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Ud& dTau, const rocblas_stride stP, const rocblas_int bc, Th& hA, Sh& hD, Sh& hE, Uh& hTau, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(32 * n); if(!perf) { sytxx_hetxx_initData(handle, n, dA, lda, bc, hA); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { SYTRD ? cpu_sytrd_hetrd(uplo, n, hA[b], lda, hD[b], hE[b], hTau[b], hW.data(), 32 * n) : cpu_sytd2_hetd2(uplo, n, hA[b], lda, hD[b], hE[b], hTau[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sytxx_hetxx_initData(handle, n, dA, lda, bc, hA); // cold calls for(int iter = 0; iter < 2; iter++) { sytxx_hetxx_initData(handle, n, dA, lda, bc, hA); CHECK_ROCBLAS_ERROR(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sytxx_hetxx_initData(handle, n, dA, lda, bc, hA); start = get_time_us_sync(stream); rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc); *gpu_time_used += get_time_us_sync(stream) - start; } 
*gpu_time_used /= hot_calls; } template void testing_sytxx_hetxx(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stD = argus.get("strideD", n); rocblas_stride stE = argus.get("strideE", n - 1); rocblas_stride stP = argus.get("strideP", n - 1); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = lda * n; size_t size_D = n; size_t size_E = n - 1; size_t size_tau = n - 1; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hE(size_E, 1, stE, bc); host_strided_batch_vector hTau(size_tau, 1, stP, bc); // device device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dTau(size_tau, 1, stP, bc); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_tau) CHECK_HIP_ERROR(dTau.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) 
CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sytxx_hetxx_getError(handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc, hA, hARes, hD, hE, hTau, &max_error); // collect performance data if(argus.timing) sytxx_hetxx_getPerfData( handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc, hA, hD, hE, hTau, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sytxx_hetxx_getError(handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc, hA, hARes, hD, hE, hTau, &max_error); // collect performance data if(argus.timing) sytxx_hetxx_getPerfData( handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc, hA, hD, hE, hTau, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { 
rocsolver_bench_output("uplo", "n", "lda", "strideD", "strideE", "strideP", "batch_c"); rocsolver_bench_output(uploC, n, lda, stD, stE, stP, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "lda", "strideA", "strideD", "strideE", "strideP", "batch_c"); rocsolver_bench_output(uploC, n, lda, stA, stD, stE, stP, bc); } else { rocsolver_bench_output("uplo", "n", "lda"); rocsolver_bench_output(uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYTXX_HETXX(...) \ extern template void testing_sytxx_hetxx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYTXX_HETXX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/include/testing_trtri.hpp000066400000000000000000000425531436600607200227110ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "client_util.hpp" #include "clientcommon.hpp" #include "lapack_host_reference.hpp" #include "norm.hpp" #include "rocsolver.hpp" #include "rocsolver_arguments.hpp" #include "rocsolver_test.hpp" template void trtri_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_diagonal diag, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, nullptr, uplo, diag, n, dA, lda, stA, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, rocblas_fill_full, diag, n, dA, lda, stA, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, uplo, rocblas_diagonal(0), n, dA, lda, stA, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA, lda, stA, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T) nullptr, lda, stA, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA, lda, stA, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, uplo, diag, 0, (T) nullptr, lda, stA, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA, lda, stA, (U) nullptr, 0), rocblas_status_success); } template void testing_trtri_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_int bc = 1; rocblas_diagonal diag = 
rocblas_diagonal_non_unit; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments trtri_checkBadArgs(handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments trtri_checkBadArgs(handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc); } } template void trtri_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = hA[b][i + j * lda] / 10.0 + 1; else hA[b][i + j * lda] = (hA[b][i + j * lda] - 4) / 10.0; } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void trtri_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_diagonal diag, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { // input data initialization trtri_initData(handle, 
n, dA, lda, bc, hA, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_trtri(uplo, diag, n, hA[b], lda, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } template void trtri_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_diagonal diag, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { trtri_initData(handle, n, dA, lda, bc, hA, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_trtri(uplo, diag, n, hA[b], lda, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } trtri_initData(handle, n, dA, lda, bc, hA, singular); // cold calls for(int iter = 0; iter < 2; iter++) { trtri_initData(handle, n, dA, lda, bc, hA, singular); CHECK_ROCBLAS_ERROR( rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { 
if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { trtri_initData(handle, n, dA, lda, bc, hA, singular); start = get_time_us_sync(stream); rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_trtri(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); char uploC = argus.get("uplo"); rocblas_fill uplo = char2rocblas_fill(uploC); char diagC = argus.get("diag"); rocblas_diagonal diag = char2rocblas_diagonal(diagC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) trtri_getError(handle, uplo, diag, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data 
if(argus.timing) trtri_getPerfData(handle, uplo, diag, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) trtri_getError(handle, uplo, diag, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) trtri_getPerfData(handle, uplo, diag, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "diag", "n", "lda", "batch_c"); rocsolver_bench_output(uploC, diagC, n, lda, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "diag", "n", "lda", "strideA", "batch_c"); rocsolver_bench_output(uploC, diagC, n, lda, stA, bc); } else { rocsolver_bench_output("uplo", "diag", "n", "lda"); rocsolver_bench_output(uploC, diagC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { 
rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_TRTRI(...) extern template void testing_trtri<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_TRTRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-5.5.1/clients/rocblascommon/000077500000000000000000000000001436600607200205015ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/clients/rocblascommon/clients_utility.cpp000066400000000000000000000053271436600607200244400ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #include #include #include #include #include #include "clients_utility.hpp" #include "rocblas_random.hpp" // Random number generator // Note: We do not use random_device to initialize the RNG, because we want // repeatability in case of test failure. TODO: Add seed as an optional CLI // argument, and print the seed on output, to ensure repeatability. const rocblas_rng_t rocblas_seed(69069); // A fixed seed to start at // This records the main thread ID at startup const std::thread::id main_thread_id = std::this_thread::get_id(); // For the main thread, we use rocblas_seed; for other threads, we start with a // different seed but deterministically based on the thread id's hash function. 
thread_local rocblas_rng_t rocblas_rng = get_seed(); /* ============================================================================================ */ /* device query and print out their ID and name; return number of * compute-capable devices. */ rocblas_int query_device_property() { int device_count = 0; hipError_t status = hipGetDeviceCount(&device_count); if(status != hipSuccess) { fmt::print(stderr, "Query device error: cannot get device count\n"); return -1; } fmt::print("Query device success: there are {} devices\n", device_count); for(int i = 0;; i++) { fmt::print("{:-<79}\n", ""); // horizontal rule if(i >= device_count) break; hipDeviceProp_t props; status = hipGetDeviceProperties(&props, i); if(status != hipSuccess) { fmt::print(stderr, "Query device error: cannot get device ID {}'s property\n", i); continue; } fmt::print("Device ID {} : {}\nwith {:3.1f} GB memory, max. SCLK {} MHz, " "max. MCLK {} MHz, compute capability {}.{}\nmaxGridDimX {}, " "sharedMemPerBlock {:3.1f} KB, maxThreadsPerBlock {}, warpSize {}\n", i, props.name, props.totalGlobalMem / 1e9, int(props.clockRate / 1000), int(props.memoryClockRate / 1000), props.major, props.minor, props.maxGridSize[0], props.sharedMemPerBlock / 1e3, props.maxThreadsPerBlock, props.warpSize); } return device_count; } /* set current device to device_id */ void set_device(rocblas_int device_id) { hipError_t status = hipSetDevice(device_id); if(status != hipSuccess) fmt::print(stderr, "Set device error: cannot set device ID {}\n", device_id); } rocSOLVER-rocm-5.5.1/clients/rocblascommon/clients_utility.hpp000066400000000000000000000062551436600607200244460ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include #include #include #include #include #include #include "rocblas_test.hpp" /*!\file * \brief provide common utilities */ static constexpr char LIMITED_MEMORY_STRING[] = "Error: Attempting to allocate more memory than available."; // TODO: This is dependent on internal gtest behaviour. // Compared with result.message() when a test ended. Note that "Succeeded\n" is // added to the beginning of the message automatically by gtest, so this must be // compared. static constexpr char LIMITED_MEMORY_STRING_GTEST[] = "Succeeded\nError: Attempting to allocate more memory than available."; /* ============================================================================================ */ /*! \brief local handle which is automatically created and destroyed */ class rocblas_local_handle { rocblas_handle m_handle; public: rocblas_local_handle() { rocblas_create_handle(&m_handle); } ~rocblas_local_handle() { rocblas_destroy_handle(m_handle); } rocblas_local_handle(const rocblas_local_handle&) = delete; rocblas_local_handle(rocblas_local_handle&&) = delete; rocblas_local_handle& operator=(const rocblas_local_handle&) = delete; rocblas_local_handle& operator=(rocblas_local_handle&&) = delete; // Allow rocblas_local_handle to be used anywhere rocblas_handle is expected operator rocblas_handle&() { return m_handle; } operator const rocblas_handle&() const { return m_handle; } }; /* ============================================================================================ */ /* device query and print out their ID and name */ rocblas_int query_device_property(); /* set current device to device_id */ void set_device(rocblas_int device_id); /* ============================================================================================ */ template void print_strided_batched(const char* name, T* A, rocblas_int n1, rocblas_int n2, rocblas_int n3, rocblas_int s1, rocblas_int s2, rocblas_int s3) 
{ // n1, n2, n3 are matrix dimensions, sometimes called m, n, batch_count // s1, s1, s3 are matrix strides, sometimes called 1, lda, stride_a std::string s = fmt::format("----------{}----------\n", name); int max_size = 8; for(int i3 = 0; i3 < n3 && i3 < max_size; i3++) { for(int i1 = 0; i1 < n1 && i1 < max_size; i1++) { for(int i2 = 0; i2 < n2 && i2 < max_size; i2++) { s += fmt::format("{}|", A[(i1 * s1) + (i2 * s2) + (i3 * s3)]); } s += '\n'; } if(i3 < (n3 - 1) && i3 < (max_size - 1)) s += '\n'; } std::fputs(s.c_str(), stdout); std::fflush(stdout); } rocSOLVER-rocm-5.5.1/clients/rocblascommon/d_vector.hpp000066400000000000000000000067321436600607200230270ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include #include #include #include #include #include "rocblas_init.hpp" #include "rocblas_test.hpp" /* ============================================================================================ */ /*! \brief base-class to allocate/deallocate device memory */ template class d_vector { private: size_t size, bytes; public: inline size_t nmemb() const noexcept { return size; } #ifdef ROCSOLVER_CLIENTS_TEST U guard[PAD]; d_vector(size_t s) : size(s) , bytes((s + PAD * 2) * sizeof(T)) { // Initialize guard with random data if(PAD > 0) { rocblas_init_nan(guard, PAD); } } #else d_vector(size_t s) : size(s) , bytes(s ? 
s * sizeof(T) : sizeof(T)) { } #endif T* device_vector_setup() { T* d; if((hipMalloc)(&d, bytes) != hipSuccess) { fmt::print(stderr, "Error allocating {} bytes ({} GB)\n", bytes, bytes >> 30); d = nullptr; } #ifdef ROCSOLVER_CLIENTS_TEST else { if(PAD > 0) { // Copy guard to device memory before allocated memory hipMemcpy(d, guard, sizeof(guard), hipMemcpyHostToDevice); // Point to allocated block d += PAD; // Copy guard to device memory after allocated memory hipMemcpy(d + size, guard, sizeof(guard), hipMemcpyHostToDevice); } } #endif return d; } void device_vector_check(T* d) { #ifdef ROCSOLVER_CLIENTS_TEST if(PAD > 0) { U host[PAD]; // Copy device memory after allocated memory to host hipMemcpy(host, d + this->size, sizeof(guard), hipMemcpyDeviceToHost); // Make sure no corruption has occurred EXPECT_EQ(memcmp(host, guard, sizeof(guard)), 0); // Point to guard before allocated memory d -= PAD; // Copy device memory after allocated memory to host hipMemcpy(host, d, sizeof(guard), hipMemcpyDeviceToHost); // Make sure no corruption has occurred EXPECT_EQ(memcmp(host, guard, sizeof(guard)), 0); } #endif } void device_vector_teardown(T* d) { if(d != nullptr) { #ifdef ROCSOLVER_CLIENTS_TEST if(PAD > 0) { U host[PAD]; // Copy device memory after allocated memory to host hipMemcpy(host, d + this->size, sizeof(guard), hipMemcpyDeviceToHost); // Make sure no corruption has occurred EXPECT_EQ(memcmp(host, guard, sizeof(guard)), 0); // Point to guard before allocated memory d -= PAD; // Copy device memory after allocated memory to host hipMemcpy(host, d, sizeof(guard), hipMemcpyDeviceToHost); // Make sure no corruption has occurred EXPECT_EQ(memcmp(host, guard, sizeof(guard)), 0); } #endif // Free device memory CHECK_HIP_ERROR((hipFree)(d)); } } }; rocSOLVER-rocm-5.5.1/clients/rocblascommon/device_batch_vector.hpp000066400000000000000000000164751436600607200252110ustar00rootroot00000000000000/* ************************************************************************ * 
Copyright (c) 2018-2020 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "d_vector.hpp" // // Local declaration of the host strided batch vector. // template class host_batch_vector; //! //! @brief pseudo-vector subclass which uses a batch of device memory pointers //! and //! - an array of pointers in host memory //! - an array of pointers in device memory //! template class device_batch_vector : private d_vector { public: using value_type = T; public: //! //! @brief Disallow copying. //! device_batch_vector(const device_batch_vector&) = delete; //! //! @brief Disallow assigning. //! device_batch_vector& operator=(const device_batch_vector&) = delete; //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param batch_count The batch count. //! explicit device_batch_vector(rocblas_int n, rocblas_int inc, rocblas_int batch_count) : d_vector(size_t(n) * std::abs(inc)) , m_n(n) , m_inc(inc) , m_batch_count(batch_count) { if(false == this->try_initialize_memory()) { this->free_memory(); } } //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param stride (UNUSED) The stride. //! @param batch_count The batch count. //! explicit device_batch_vector(rocblas_int n, rocblas_int inc, rocblas_stride stride, rocblas_int batch_count) : device_batch_vector(n, inc, batch_count) { } //! //! @brief Constructor (kept for backward compatibility only, to be removed). //! @param batch_count The number of vectors. //! @param size_vector The size of each vectors. //! explicit device_batch_vector(rocblas_int batch_count, size_t size_vector) : device_batch_vector(size_vector, 1, batch_count) { } //! //! @brief Destructor. //! ~device_batch_vector() { this->free_memory(); } //! //! @brief Returns the length of the vector. //! rocblas_int n() const { return this->m_n; } //! //! 
@brief Returns the increment of the vector. //! rocblas_int inc() const { return this->m_inc; } //! //! @brief Returns the value of batch_count. //! rocblas_int batch_count() const { return this->m_batch_count; } //! //! @brief Returns the stride value. //! rocblas_stride stride() const { return 0; } //! //! @brief Access to device data. //! @return Pointer to the device data. //! T** ptr_on_device() { return this->m_device_data; } //! //! @brief Const access to device data. //! @return Const pointer to the device data. //! const T* const* ptr_on_device() const { return this->m_device_data; } T* const* data() { return this->m_device_data; } const T* const* data() const { return this->m_device_data; } //! //! @brief Random access. //! @param batch_index The batch index. //! @return Pointer to the array on device. //! T* operator[](rocblas_int batch_index) { return this->m_data[batch_index]; } //! //! @brief Constant random access. //! @param batch_index The batch index. //! @return Constant pointer to the array on device. //! const T* operator[](rocblas_int batch_index) const { return this->m_data[batch_index]; } //! //! @brief Const cast of the data on host. //! operator const T* const *() const { return this->m_data; } // clang-format off //! //! @brief Cast of the data on host. //! operator T**() { return this->m_data; } // clang-format on //! //! @brief Tell whether ressources allocation failed. //! explicit operator bool() const { return nullptr != this->m_data; } //! //! @brief Copy from a host batched vector. //! @param that The host_batch_vector to copy. //! hipError_t transfer_from(const host_batch_vector& that) { hipError_t hip_err; // // Copy each vector. // for(rocblas_int batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { if(hipSuccess != (hip_err = hipMemcpy((*this)[batch_index], that[batch_index], sizeof(T) * this->nmemb(), hipMemcpyHostToDevice))) { return hip_err; } } return hipSuccess; } //! //! @brief Check if memory exists. //! 
@return hipSuccess if memory exists, hipErrorOutOfMemory otherwise. //! hipError_t memcheck() const { if(*this) return hipSuccess; else return hipErrorOutOfMemory; } private: rocblas_int m_n{}; rocblas_int m_inc{}; rocblas_int m_batch_count{}; T** m_data{}; T** m_device_data{}; //! //! @brief Try to allocate the ressources. //! @return true if success false otherwise. //! bool try_initialize_memory() { bool success = false; success = (hipSuccess == (hipMalloc)(&this->m_device_data, this->m_batch_count * sizeof(T*))); if(success) { success = (nullptr != (this->m_data = (T**)calloc(this->m_batch_count, sizeof(T*)))); if(success) { for(rocblas_int batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { success = (nullptr != (this->m_data[batch_index] = this->device_vector_setup())); if(!success) { break; } } if(success) { success = (hipSuccess == hipMemcpy(this->m_device_data, this->m_data, sizeof(T*) * this->m_batch_count, hipMemcpyHostToDevice)); } } } return success; } //! //! @brief Free the ressources, as much as we can. //! void free_memory() { if(nullptr != this->m_data) { for(rocblas_int batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { if(nullptr != this->m_data[batch_index]) { this->device_vector_teardown(this->m_data[batch_index]); this->m_data[batch_index] = nullptr; } } free(this->m_data); this->m_data = nullptr; } if(nullptr != this->m_device_data) { auto tmp_device_data = this->m_device_data; this->m_device_data = nullptr; CHECK_HIP_ERROR((hipFree)(tmp_device_data)); } } }; rocSOLVER-rocm-5.5.1/clients/rocblascommon/device_strided_batch_vector.hpp000066400000000000000000000136421436600607200267200ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2020 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once // // Local declaration of the host strided batch vector. 
// template class host_strided_batch_vector; //! //! @brief Implementation of a strided batched vector on device. //! template class device_strided_batch_vector : public d_vector { public: using value_type = T; public: //! //! @brief The storage type to use. //! typedef enum class estorage { block, interleave, } storage; //! //! @brief Disallow copying. //! device_strided_batch_vector(const device_strided_batch_vector&) = delete; //! //! @brief Disallow assigning. //! device_strided_batch_vector& operator=(const device_strided_batch_vector&) = delete; //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param stride The stride. //! @param batch_count The batch count. //! @param stg The storage format to use. //! explicit device_strided_batch_vector(rocblas_int n, rocblas_int inc, rocblas_stride stride, rocblas_int batch_count, storage stg = storage::block) : d_vector(calculate_nmemb(n, inc, stride, batch_count, stg)) , m_storage(stg) , m_n(n) , m_inc(inc) , m_stride(stride) , m_batch_count(batch_count) { bool valid_parameters = true; switch(this->m_storage) { case storage::block: { if(std::abs(this->m_stride) < this->m_n * std::abs(this->m_inc)) { valid_parameters = false; } break; } case storage::interleave: { if(std::abs(this->m_inc) < std::abs(this->m_stride) * this->m_batch_count) { valid_parameters = false; } break; } } if(valid_parameters) { this->m_data = this->device_vector_setup(); } } //! //! @brief Destructor. //! ~device_strided_batch_vector() { if(nullptr != this->m_data) { this->device_vector_teardown(this->m_data); this->m_data = nullptr; } } //! //! @brief Returns the data pointer. //! T* data() { return this->m_data; } //! //! @brief Returns the data pointer. //! const T* data() const { return this->m_data; } //! //! @brief Returns the length. //! rocblas_int n() const { return this->m_n; } //! //! @brief Returns the increment. //! rocblas_int inc() const { return this->m_inc; } //! //! 
@brief Returns the batch count. //! rocblas_int batch_count() const { return this->m_batch_count; } //! //! @brief Returns the stride value. //! rocblas_stride stride() const { return this->m_stride; } //! //! @brief Returns pointer. //! @param batch_index The batch index. //! @return A mutable pointer to the batch_index'th vector. //! T* operator[](rocblas_int batch_index) { return (this->m_stride >= 0) ? this->m_data + batch_index * this->m_stride : this->m_data + (batch_index + 1 - this->m_batch_count) * this->m_stride; } //! //! @brief Returns non-mutable pointer. //! @param batch_index The batch index. //! @return A non-mutable mutable pointer to the batch_index'th vector. //! const T* operator[](rocblas_int batch_index) const { return (this->m_stride >= 0) ? this->m_data + batch_index * this->m_stride : this->m_data + (batch_index + 1 - this->m_batch_count) * this->m_stride; } //! //! @brief Cast operator. //! @remark Returns the pointer of the first vector. //! operator T*() { return (*this)[0]; } //! //! @brief Non-mutable cast operator. //! @remark Returns the non-mutable pointer of the first vector. //! operator const T*() const { return (*this)[0]; } //! //! @brief Tell whether ressources allocation failed. //! explicit operator bool() const { return nullptr != this->m_data; } //! //! @brief Transfer data from a strided batched vector on device. //! @param that That strided batched vector on device. //! @return The hip error. //! hipError_t transfer_from(const host_strided_batch_vector& that) { return hipMemcpy(this->data(), that.data(), sizeof(T) * this->nmemb(), hipMemcpyHostToDevice); } //! //! @brief Check if memory exists. //! @return hipSuccess if memory exists, hipErrorOutOfMemory otherwise. //! 
hipError_t memcheck() const { if(*this) return hipSuccess; else return hipErrorOutOfMemory; } private: storage m_storage{storage::block}; rocblas_int m_n{}; rocblas_int m_inc{}; rocblas_stride m_stride{}; rocblas_int m_batch_count{}; T* m_data{}; static size_t calculate_nmemb(rocblas_int n, rocblas_int inc, rocblas_stride stride, rocblas_int batch_count, storage st) { switch(st) { case storage::block: return size_t(std::abs(stride)) * batch_count; case storage::interleave: return size_t(n) * std::abs(inc); } return 0; } }; rocSOLVER-rocm-5.5.1/clients/rocblascommon/device_vector.hpp000066400000000000000000000067541436600607200240470ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2020 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "d_vector.hpp" // // Local declaration of the host vector. // template class host_vector; //! //! @brief pseudo-vector subclass which uses device memory //! template class device_vector : private d_vector { public: using value_type = T; public: //! //! @brief Disallow copying. //! device_vector(const device_vector&) = delete; //! //! @brief Disallow assigning //! device_vector& operator=(const device_vector&) = delete; //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @remark Must wrap constructor and destructor in functions to allow Google //! Test macros to work //! explicit device_vector(rocblas_int n, rocblas_int inc) : d_vector(n * std::abs(inc)) , m_n(n) , m_inc(inc) { this->m_data = this->device_vector_setup(); } //! //! @brief Constructor (kept for backward compatibility) //! @param s the size. //! @remark Must wrap constructor and destructor in functions to allow Google //! Test macros to work //! explicit device_vector(size_t s) : d_vector(s) , m_n(s) , m_inc(1) { this->m_data = this->device_vector_setup(); } //! //! 
@brief Destructor. //! ~device_vector() { this->device_vector_teardown(this->m_data); this->m_data = nullptr; } //! //! @brief Returns the length of the vector. //! rocblas_int n() const { return this->m_n; } //! //! @brief Returns the increment of the vector. //! rocblas_int inc() const { return this->m_inc; } //! //! @brief Returns the batch count (always 1). //! rocblas_int batch_count() const { return 1; } //! //! @brief Returns the stride (out of context, always 0) //! rocblas_stride stride() const { return 0; } //! //! @brief Returns the data pointer. //! T* data() { return this->m_data; } //! //! @brief Returns the data pointer. //! const T* data() const { return this->m_data; } //! //! @brief Decay into pointer wherever pointer is expected. //! operator T*() { return this->m_data; } //! //! @brief Decay into constant pointer wherever pointer is expected. //! operator const T*() const { return this->m_data; } //! //! @brief Tell whether malloc failed. //! explicit operator bool() const { return nullptr != this->m_data; } //! //! @brief Transfer data from a host vector. //! @param that The host vector. //! @return the hip error. //! hipError_t transfer_from(const host_vector& that) { return hipMemcpy(this->m_data, (const T*)that, this->nmemb() * sizeof(T), hipMemcpyHostToDevice); } hipError_t memcheck() const { if(*this) return hipSuccess; else return hipErrorOutOfMemory; } private: size_t m_size{}; rocblas_int m_n{}; rocblas_int m_inc{}; T* m_data{}; }; rocSOLVER-rocm-5.5.1/clients/rocblascommon/host_batch_vector.hpp000066400000000000000000000153161436600607200247200ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2021 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include #include #include "rocblas_init.hpp" // // Local declaration of the device batch vector. // template class device_batch_vector; //! //! 
@brief Implementation of the batch vector on host. //! template class host_batch_vector { public: using value_type = T; public: //! //! @brief Delete copy constructor. //! host_batch_vector(const host_batch_vector& that) = delete; //! //! @brief Delete copy assignement. //! host_batch_vector& operator=(const host_batch_vector& that) = delete; //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param batch_count The batch count. //! explicit host_batch_vector(rocblas_int n, rocblas_int inc, rocblas_int batch_count) : m_n(n) , m_inc(inc) , m_batch_count(batch_count) { if(false == this->try_initialize_memory()) { this->free_memory(); } } //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param stride (UNUSED) The stride. //! @param batch_count The batch count. //! explicit host_batch_vector(rocblas_int n, rocblas_int inc, rocblas_stride stride, rocblas_int batch_count) : host_batch_vector(n, inc, batch_count) { } //! //! @brief Destructor. //! ~host_batch_vector() { this->free_memory(); } //! //! @brief Returns the length of the vector. //! rocblas_int n() const { return this->m_n; } //! //! @brief Returns the increment of the vector. //! rocblas_int inc() const { return this->m_inc; } //! //! @brief Returns the batch count. //! rocblas_int batch_count() const { return this->m_batch_count; } //! //! @brief Returns the stride value. //! rocblas_stride stride() const { return 0; } //! //! @brief Random access to the vectors. //! @param batch_index the batch index. //! @return The mutable pointer. //! T* operator[](rocblas_int batch_index) { return this->m_data[batch_index]; } //! //! @brief Constant random access to the vectors. //! @param batch_index the batch index. //! @return The non-mutable pointer. //! const T* operator[](rocblas_int batch_index) const { return this->m_data[batch_index]; } // clang-format off //! //! @brief Cast to a double pointer. //! 
operator T**() { return this->m_data; } // clang-format on //! //! @brief Constant cast to a double pointer. //! operator const T* const *() { return this->m_data; } //! //! @brief Copy from a host batched vector. //! @param that the vector the data is copied from. //! @return true if the copy is done successfully, false otherwise. //! bool copy_from(const host_batch_vector& that) { if((this->batch_count() == that.batch_count()) && (this->n() == that.n()) && (this->inc() == that.inc())) { size_t num_bytes = this->n() * std::abs(this->inc()) * sizeof(T); for(rocblas_int batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { memcpy((*this)[batch_index], that[batch_index], num_bytes); } return true; } else { return false; } } //! //! @brief Transfer from a device batched vector. //! @param that the vector the data is copied from. //! @return the hip error. //! hipError_t transfer_from(const device_batch_vector& that) { hipError_t hip_err; size_t num_bytes = size_t(this->m_n) * std::abs(this->m_inc) * sizeof(T); for(rocblas_int batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { if(hipSuccess != (hip_err = hipMemcpy((*this)[batch_index], that[batch_index], num_bytes, hipMemcpyDeviceToHost))) { return hip_err; } } return hipSuccess; } //! //! @brief Check if memory exists. //! @return hipSuccess if memory exists, hipErrorOutOfMemory otherwise. //! hipError_t memcheck() const { return (nullptr != this->m_data) ? 
hipSuccess : hipErrorOutOfMemory; } private: rocblas_int m_n{}; rocblas_int m_inc{}; rocblas_int m_batch_count{}; T** m_data{}; bool try_initialize_memory() { bool success = (nullptr != (this->m_data = (T**)calloc(this->m_batch_count, sizeof(T*)))); if(success) { size_t nmemb = size_t(this->m_n) * std::abs(this->m_inc); for(rocblas_int batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { success = (nullptr != (this->m_data[batch_index] = (T*)calloc(nmemb, sizeof(T)))); if(false == success) { break; } } } return success; } void free_memory() { if(nullptr != this->m_data) { for(rocblas_int batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { if(nullptr != this->m_data[batch_index]) { free(this->m_data[batch_index]); this->m_data[batch_index] = nullptr; } } free(this->m_data); this->m_data = nullptr; } } }; //! //! @brief Overload output operator. //! @param os The ostream. //! @param that That host batch vector. //! template std::ostream& operator<<(std::ostream& os, const host_batch_vector& that) { auto n = that.n(); auto inc = std::abs(that.inc()); auto batch_count = that.batch_count(); for(rocblas_int batch_index = 0; batch_index < batch_count; ++batch_index) { auto batch_data = that[batch_index]; os << "[" << batch_index << "] = { " << batch_data[0]; for(rocblas_int i = 1; i < n; ++i) { os << ", " << batch_data[i * inc]; } os << " }" << std::endl; } return os; } rocSOLVER-rocm-5.5.1/clients/rocblascommon/host_strided_batch_vector.hpp000066400000000000000000000164141436600607200264360ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2021 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include // // Local declaration of the device strided batch vector. // template class device_strided_batch_vector; //! //! @brief Implementation of a host strided batched vector. //! 
template class host_strided_batch_vector { public: using value_type = T; public: //! //! @brief The storage type to use. //! typedef enum class estorage { block, interleave } storage; //! //! @brief Disallow copying. //! host_strided_batch_vector(const host_strided_batch_vector&) = delete; //! //! @brief Disallow assigning. //! host_strided_batch_vector& operator=(const host_strided_batch_vector&) = delete; //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param stride The stride. //! @param batch_count The batch count. //! @param stg The storage format to use. //! explicit host_strided_batch_vector(rocblas_int n, rocblas_int inc, rocblas_stride stride, rocblas_int batch_count, storage stg = storage::block) : m_storage(stg) , m_n(n) , m_inc(inc) , m_stride(stride) , m_batch_count(batch_count) , m_nmemb(calculate_nmemb(n, inc, stride, batch_count, stg)) { bool valid_parameters = this->m_nmemb > 0; if(valid_parameters) { switch(this->m_storage) { case storage::block: { if(std::abs(this->m_stride) < this->m_n * std::abs(this->m_inc)) { valid_parameters = false; } break; } case storage::interleave: { if(std::abs(this->m_inc) < std::abs(this->m_stride) * this->m_batch_count) { valid_parameters = false; } break; } } if(valid_parameters) { this->m_data = new T[this->m_nmemb]; } } } //! //! @brief Destructor. //! ~host_strided_batch_vector() { if(nullptr != this->m_data) { delete[] this->m_data; this->m_data = nullptr; } } //! //! @brief Returns the data pointer. //! T* data() { return this->m_data; } //! //! @brief Returns the data pointer. //! const T* data() const { return this->m_data; } //! //! @brief Returns the length. //! rocblas_int n() const { return this->m_n; } //! //! @brief Returns the increment. //! rocblas_int inc() const { return this->m_inc; } //! //! @brief Returns the batch count. //! rocblas_int batch_count() const { return this->m_batch_count; } //! //! @brief Returns the stride. //! 
rocblas_stride stride() const { return this->m_stride; } //! //! @brief Returns pointer. //! @param batch_index The batch index. //! @return A mutable pointer to the batch_index'th vector. //! T* operator[](rocblas_int batch_index) { return (this->m_stride >= 0) ? this->m_data + this->m_stride * batch_index : this->m_data + (batch_index + 1 - this->m_batch_count) * this->m_stride; } //! //! @brief Returns non-mutable pointer. //! @param batch_index The batch index. //! @return A non-mutable mutable pointer to the batch_index'th vector. //! const T* operator[](rocblas_int batch_index) const { return (this->m_stride >= 0) ? this->m_data + this->m_stride * batch_index : this->m_data + (batch_index + 1 - this->m_batch_count) * this->m_stride; } //! //! @brief Cast operator. //! @remark Returns the pointer of the first vector. //! operator T*() { return (*this)[0]; } //! //! @brief Non-mutable cast operator. //! @remark Returns the non-mutable pointer of the first vector. //! operator const T*() const { return (*this)[0]; } //! //! @brief Tell whether ressources allocation failed. //! explicit operator bool() const { return nullptr != this->m_data; } //! //! @brief Copy data from a strided batched vector on host. //! @param that That strided batched vector on host. //! @return true if successful, false otherwise. //! bool copy_from(const host_strided_batch_vector& that) { if(that.n() == this->m_n && that.inc() == this->m_inc && that.stride() == this->m_stride && that.batch_count() == this->m_batch_count) { memcpy(this->data(), that.data(), sizeof(T) * this->m_nmemb); return true; } else { return false; } } //! //! @brief Transfer data from a strided batched vector on device. //! @param that That strided batched vector on device. //! @return The hip error. //! template hipError_t transfer_from(const device_strided_batch_vector& that) { return hipMemcpy(this->m_data, that.data(), sizeof(T) * this->m_nmemb, hipMemcpyDeviceToHost); } //! //! @brief Check if memory exists. 
//! @return hipSuccess if memory exists, hipErrorOutOfMemory otherwise. //! hipError_t memcheck() const { return ((bool)*this) ? hipSuccess : hipErrorOutOfMemory; } private: storage m_storage{storage::block}; rocblas_int m_n{}; rocblas_int m_inc{}; rocblas_stride m_stride{}; rocblas_int m_batch_count{}; size_t m_nmemb{}; T* m_data{}; static size_t calculate_nmemb(rocblas_int n, rocblas_int inc, rocblas_stride stride, rocblas_int batch_count, storage st) { switch(st) { case storage::block: return size_t(std::abs(stride)) * batch_count; case storage::interleave: return size_t(n) * std::abs(inc); } return 0; } }; //! //! @brief Overload output operator. //! @param os The ostream. //! @param that That host strided batch vector. //! template std::ostream& operator<<(std::ostream& os, const host_strided_batch_vector& that) { auto n = that.n(); auto inc = std::abs(that.inc()); auto batch_count = that.batch_count(); for(rocblas_int batch_index = 0; batch_index < batch_count; ++batch_index) { auto batch_data = that[batch_index]; os << "[" << batch_index << "] = { " << batch_data[0]; for(rocblas_int i = 1; i < n; ++i) { os << ", " << batch_data[i * inc]; } os << " }" << std::endl; } return os; } rocSOLVER-rocm-5.5.1/clients/rocblascommon/host_vector.hpp000066400000000000000000000045571436600607200235640ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2020 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include #include #include //! //! @brief Pseudo-vector subclass which uses host memory. //! template struct host_vector : std::vector { // Inherit constructors using std::vector::vector; //! //! @brief Constructor. //! host_vector(size_t n, ptrdiff_t inc) : std::vector(n * std::abs(inc)) , m_n(n) , m_inc(inc) { } //! //! @brief Copy constructor from host_vector of other types convertible to T //! 
template {}, int> = 0> host_vector(const host_vector& x) : std::vector(x.size()) , m_n(x.size()) , m_inc(1) { for(size_t i = 0; i < m_n; ++i) (*this)[i] = x[i]; } //! //! @brief Decay into pointer wherever pointer is expected //! operator T*() { return this->data(); } //! //! @brief Decay into constant pointer wherever constant pointer is expected //! operator const T*() const { return this->data(); } //! //! @brief Transfer from a device vector. //! @param that That device vector. //! @return the hip error. //! hipError_t transfer_from(const device_vector& that) { return hipMemcpy(*this, that, sizeof(T) * this->size(), hipMemcpyDeviceToHost); } //! //! @brief Returns the length of the vector. //! size_t n() const { return m_n; } //! //! @brief Returns the increment of the vector. //! ptrdiff_t inc() const { return m_inc; } //! //! @brief Returns the batch count (always 1). //! static constexpr rocblas_int batch_count() { return 1; } //! //! @brief Returns the stride (out of context, always 0) //! static constexpr rocblas_stride stride() { return 0; } //! //! @brief Check if memory exists (out of context, always hipSuccess) //! static constexpr hipError_t memcheck() { return hipSuccess; } private: size_t m_n = 0; ptrdiff_t m_inc = 0; }; rocSOLVER-rocm-5.5.1/clients/rocblascommon/program_options.cpp000066400000000000000000000006621436600607200244330ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include "rocblascommon/program_options.hpp" namespace roc { // Regular expression for token delimiters (whitespace and commas) const std::regex program_options_regex{"[, \\f\\n\\r\\t\\v]+", std::regex_constants::optimize}; } rocSOLVER-rocm-5.5.1/clients/rocblascommon/program_options.hpp000066400000000000000000000342071436600607200244420ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. * ************************************************************************ */ // This emulates the required functionality of boost::program_options #pragma once #include #include #include #include #include #include #include #include #include #include namespace roc { // Regular expression for token delimiters (whitespace and commas) extern const std::regex program_options_regex; // Polymorphic base class to use with dynamic_cast class value_base { protected: bool m_has_actual = false; bool m_has_default = false; public: virtual ~value_base() = default; bool has_actual() const { return m_has_actual; } bool has_default() const { return m_has_default; } }; // Value parameters template class value : public value_base { T m_var; // Variable to be modified if no pointer provided T* m_var_ptr; // Pointer to variable to be modified public: // Constructor explicit value() : m_var_ptr(nullptr) { } explicit value(T var, bool defaulted) : m_var(var) , m_var_ptr(nullptr) { m_has_actual = !defaulted; m_has_default = defaulted; } explicit value(T* var_ptr) : m_var_ptr(var_ptr) { } // Allows actual_value() and default_value() value* operator->() { return this; } // Get the value const T& get_value() const { if(m_var_ptr) return *m_var_ptr; else return m_var; } // Set actual value value& actual_value(T val) { if(m_var_ptr) *m_var_ptr = std::move(val); else m_var = std::move(val); m_has_actual = true; return *this; } 
// Set default value value& default_value(T val) { if(!m_has_actual) { if(m_var_ptr) *m_var_ptr = std::move(val); else m_var = std::move(val); m_has_default = true; } return *this; } }; // bool_switch is a value, which is handled specially using bool_switch = value; class variable_value { std::shared_ptr m_val; public: // Constructor explicit variable_value() = default; template explicit variable_value(const T& xv, bool xdefaulted) : m_val(std::make_shared>(xv, xdefaulted)) { } explicit variable_value(std::shared_ptr val) : m_val(val) { } // Member functions bool empty() const { return !m_val.get() || (!m_val->has_actual() && !m_val->has_default()); } bool defaulted() const { return m_val.get() && !m_val->has_actual() && m_val->has_default(); } template const T& as() const { if(value* val = dynamic_cast*>(m_val.get())) return val->get_value(); else throw std::logic_error("Internal error: Invalid cast"); } }; using variables_map = std::map; class options_description { // desc_option describes a particular option class desc_option { std::string m_opts; std::shared_ptr m_val; std::string m_desc; public: // Constructor with options, value and description template desc_option(std::string opts, value val, std::string desc) : m_opts(std::move(opts)) , m_val(new auto(std::move(val))) , m_desc(std::move(desc)) { } // Constructor with options and description desc_option(std::string opts, std::string desc) : m_opts(std::move(opts)) , m_val(nullptr) , m_desc(std::move(desc)) { } // Accessors const std::string& get_opts() const { return m_opts; } const std::shared_ptr get_val() const { return m_val; } const std::string& get_desc() const { return m_desc; } // Set a value void set_val(int& argc, char**& argv, std::string inopt) const { // We test all supported types with dynamic_cast and parse accordingly bool match = false; if(auto* ptr = dynamic_cast*>(m_val.get())) { int32_t val; match = argc && sscanf(*argv, "%" SCNd32, &val) == 1; ptr->actual_value(val); } else if(auto* ptr 
= dynamic_cast*>(m_val.get())) { uint32_t val; match = argc && sscanf(*argv, "%" SCNu32, &val) == 1; ptr->actual_value(val); } else if(auto* ptr = dynamic_cast*>(m_val.get())) { int64_t val; match = argc && sscanf(*argv, "%" SCNd64, &val) == 1; ptr->actual_value(val); } else if(auto* ptr = dynamic_cast*>(m_val.get())) { uint64_t val; match = argc && sscanf(*argv, "%" SCNu64, &val) == 1; ptr->actual_value(val); } else if(auto* ptr = dynamic_cast*>(m_val.get())) { float val; match = argc && sscanf(*argv, "%f", &val) == 1; ptr->actual_value(val); } else if(auto* ptr = dynamic_cast*>(m_val.get())) { double val; match = argc && sscanf(*argv, "%lf", &val) == 1; ptr->actual_value(val); } else if(auto* ptr = dynamic_cast*>(m_val.get())) { char val; match = argc && sscanf(*argv, " %c", &val) == 1; ptr->actual_value(val); } else if(auto* ptr = dynamic_cast*>(m_val.get())) { // We handle bool specially, setting the value to true without argument ptr->actual_value(true); return; } else if(auto* ptr = dynamic_cast*>(m_val.get())) { if(argc) { ptr->actual_value(*argv); match = true; } } else { throw std::logic_error("Internal error: Unsupported data type"); } if(!match) throw std::invalid_argument(argc ? "Invalid value for " + inopt : "Missing required value for " + inopt); // Skip past the argument's value ++argv; --argc; } }; // Description and option list std::string m_desc; std::vector m_optlist; // desc_optionlist allows chains of options to be parenthesized class desc_optionlist { std::vector& m_list; public: explicit desc_optionlist(std::vector& list) : m_list(list) { } template desc_optionlist operator()(Ts&&... 
arg) { m_list.push_back(desc_option(std::forward(arg)...)); return *this; } }; // Parse an option at the current (argc, argv) position void parse_option(int& argc, char**& argv, variables_map& vm, bool ignoreUnknown) const { // Iterate across all options for(const auto& opt : m_optlist) { // Canonical name used for map std::string canonical_name; // Iterate across tokens in the opts for(std::sregex_token_iterator tok{opt.get_opts().begin(), opt.get_opts().end(), program_options_regex, -1}; tok != std::sregex_token_iterator(); ++tok) { // The first option in a list of options is the canonical name if(!canonical_name.length()) canonical_name = tok->str(); // If the length of the option is 1, it is single-dash; otherwise double-dash const char* prefix = tok->length() == 1 ? "-" : "--"; // If option matches if(*argv == prefix + tok->str()) { ++argv; --argc; // If option has a value, set it if(opt.get_val().get()) opt.set_val(argc, argv, prefix + tok->str()); // Add seen options to map vm[canonical_name] = variable_value(opt.get_val()); return; // Return successfully } } } // No options were matched if(ignoreUnknown) { ++argv; --argc; } else throw std::invalid_argument("Option " + std::string(argv[0]) + " is not defined."); } public: // Constructor explicit options_description(std::string desc) : m_desc(std::move(desc)) { } // Start a desc_optionlist chain desc_optionlist add_options() & { return desc_optionlist(m_optlist); } // Parse all options void parse_options(int& argc, char**& argv, variables_map& vm, bool ignoreUnknown = false) const { // Add options with default values to map for(const auto& opt : m_optlist) { std::sregex_token_iterator tok{opt.get_opts().begin(), opt.get_opts().end(), program_options_regex, -1}; // Canonical name used for map std::string canonical_name = tok->str(); if(opt.get_val().get() && opt.get_val()->has_default()) vm[canonical_name] = variable_value(opt.get_val()); } // Parse options while(argc) parse_option(argc, argv, vm, 
ignoreUnknown); } // Formatted output of command-line arguments description friend std::ostream& operator<<(std::ostream& os, const options_description& d) { // Iterate across all options for(const auto& opt : d.m_optlist) { bool first = true, printvalue = true; const char* delim = ""; std::ostringstream left; // Iterate across tokens in the opts for(std::sregex_token_iterator tok{opt.get_opts().begin(), opt.get_opts().end(), program_options_regex, -1}; tok != std::sregex_token_iterator(); ++tok, first = false, delim = " ") { // If the length of the option is 1, it is single-dash; otherwise double-dash const char* prefix = tok->length() == 1 ? "-" : "--"; left << delim << (first ? "" : "|") << prefix << tok->str(); if(tok->str() == "help" || tok->str() == "h") printvalue = false; } if(printvalue) left << " "; os << std::setw(26) << std::left << left.str() << " " << opt.get_desc() << " "; left.str(std::string()); // Print the default value of the variable type if it exists // We do not print the default value for bool const value_base* val = opt.get_val().get(); if(val && !dynamic_cast*>(val)) { if(val->has_default()) { // We test all supported types with dynamic_cast and print accordingly left << " (Default value is: "; if(dynamic_cast*>(val)) left << dynamic_cast*>(val)->get_value(); else if(dynamic_cast*>(val)) left << dynamic_cast*>(val)->get_value(); else if(dynamic_cast*>(val)) left << dynamic_cast*>(val)->get_value(); else if(dynamic_cast*>(val)) left << dynamic_cast*>(val)->get_value(); else if(dynamic_cast*>(val)) left << dynamic_cast*>(val)->get_value(); else if(dynamic_cast*>(val)) left << dynamic_cast*>(val)->get_value(); else if(dynamic_cast*>(val)) left << dynamic_cast*>(val)->get_value(); else if(dynamic_cast*>(val)) left << dynamic_cast*>(val)->get_value(); else throw std::logic_error("Internal error: Unsupported data type"); left << ")"; } } os << left.str() << "\n\n"; } return os << std::flush; } }; // Class representing command line parser class 
parse_command_line { variables_map m_vm; public: parse_command_line(int argc, char** argv, const options_description& desc, bool ignoreUnknown = false) { ++argv; // Skip argv[0] --argc; desc.parse_options(argc, argv, m_vm, ignoreUnknown); } // Copy the variables_map friend void store(const parse_command_line& p, variables_map& vm) { vm = p.m_vm; } // Move the variables_map friend void store(parse_command_line&& p, variables_map& vm) { vm = std::move(p.m_vm); } }; // We can define the notify() function as a no-op for our purposes inline void notify(const variables_map&) {} } rocSOLVER-rocm-5.5.1/clients/rocblascommon/rocblas_init.hpp000066400000000000000000000245041436600607200236670ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include #include #include #include #include #include #include "rocblas_math.hpp" #include "rocblas_random.hpp" /* ============================================================================================ */ /*! 
\brief matrix/vector initialization: */ // for vector x (M=1, N=lengthX, lda=incx); // for complex number, the real/imag part would be initialized with the same // value // Initialize vector with random values template void rocblas_init(std::vector& A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) { for(size_t i_batch = 0; i_batch < batch_count; i_batch++) for(size_t i = 0; i < M; ++i) for(size_t j = 0; j < N; ++j) A[i + j * lda + i_batch * stride] = random_generator(); } // Initialize vector with random values template inline void rocblas_init(T* A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) { for(size_t i_batch = 0; i_batch < batch_count; i_batch++) for(size_t i = 0; i < M; ++i) for(size_t j = 0; j < N; ++j) A[i + j * lda + i_batch * stride] = random_generator(); } template void rocblas_init_sin(std::vector& A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) { for(size_t i_batch = 0; i_batch < batch_count; i_batch++) for(size_t i = 0; i < M; ++i) for(size_t j = 0; j < N; ++j) A[i + j * lda + i_batch * stride] = sin(i + j * lda + i_batch * stride); } // Initialize matrix so adjacent entries have alternating sign. // In gemm if either A or B are initialized with alernating // sign the reduction sum will be summing positive // and negative numbers, so it should not get too large. // This helps reduce floating point inaccuracies for 16bit // arithmetic where the exponent has only 5 bits, and the // mantissa 10 bits. template void rocblas_init_alternating_sign(std::vector& A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) { for(size_t i_batch = 0; i_batch < batch_count; i_batch++) for(size_t i = 0; i < M; ++i) for(size_t j = 0; j < N; ++j) { auto value = random_generator(); A[i + j * lda + i_batch * stride] = (i ^ j) & 1 ? 
value : negate(value); } } template void rocblas_init_alternating_sign(T* A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) { for(size_t i_batch = 0; i_batch < batch_count; i_batch++) for(size_t i = 0; i < M; ++i) for(size_t j = 0; j < N; ++j) { auto value = random_generator(); A[i + j * lda + i_batch * stride] = (i ^ j) & 1 ? value : negate(value); } } template void rocblas_init_cos(std::vector& A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) { for(size_t i_batch = 0; i_batch < batch_count; i_batch++) for(size_t i = 0; i < M; ++i) for(size_t j = 0; j < N; ++j) A[i + j * lda + i_batch * stride] = cos(i + j * lda + i_batch * stride); } /*! \brief symmetric matrix initialization: */ // for real matrix only template void rocblas_init_symmetric(std::vector& A, size_t N, size_t lda) { for(size_t i = 0; i < N; ++i) for(size_t j = 0; j <= i; ++j) { auto value = random_generator(); // Warning: It's undefined behavior to assign to the // same array element twice in same sequence point (i==j) A[j + i * lda] = value; A[i + j * lda] = value; } } /*! \brief symmetric matrix initialization: */ template void rocblas_init_symmetric(T* A, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) { for(size_t b = 0; b < batch_count; ++b) { for(size_t i = 0; i < N; ++i) for(size_t j = 0; j <= i; ++j) { auto value = random_generator(); // Warning: It's undefined behavior to assign to the // same array element twice in same sequence point (i==j) A[b * stride + j + i * lda] = value; A[b * stride + i + j * lda] = value; } } } /*! 
\brief symmetric matrix clear: */ template void rocblas_clear_symmetric(rocblas_fill uplo, T* A, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) { for(size_t b = 0; b < batch_count; ++b) { for(size_t i = 0; i < N; ++i) for(size_t j = i + 1; j < N; ++j) { if(uplo == rocblas_fill_upper) A[b * stride + j + i * lda] = 0; // clear lower else A[b * stride + i + j * lda] = 0; // clear upper } } } /*! \brief hermitian matrix initialization: */ // for complex matrix only, the real/imag part would be initialized with the // same value except the diagonal elment must be real template void rocblas_init_hermitian(std::vector& A, size_t N, size_t lda) { for(size_t i = 0; i < N; ++i) for(size_t j = 0; j <= i; ++j) { auto value = random_generator(); A[j + i * lda] = value; value.y = (i == j) ? 0 : negate(value.y); A[i + j * lda] = value; } } // Initialize vector with HPL-like random values template void rocblas_init_hpl(std::vector& A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) { for(size_t i_batch = 0; i_batch < batch_count; i_batch++) for(size_t i = 0; i < M; ++i) for(size_t j = 0; j < N; ++j) A[i + j * lda + i_batch * stride] = random_hpl_generator(); } /* ============================================================================================ */ /*! \brief Initialize an array with random data, with NaN where appropriate */ template void rocblas_init_nan(T* A, size_t N) { for(size_t i = 0; i < N; ++i) A[i] = T(rocblas_nan_rng()); } template void rocblas_init_nan(std::vector& A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) { for(size_t i_batch = 0; i_batch < batch_count; i_batch++) for(size_t i = 0; i < M; ++i) for(size_t j = 0; j < N; ++j) A[i + j * lda + i_batch * stride] = T(rocblas_nan_rng()); } /* ============================================================================================ */ /*! 
\brief Packs strided_batched matricies into groups of 4 in N */ template void rocblas_packInt8(std::vector& A, size_t M, size_t N, size_t batch_count, size_t lda, size_t stride_a) { if(N % 4 != 0) fmt::print(stderr, "ERROR: dimension must be a multiple of 4 in order to pack\n"); std::vector temp(A); for(size_t count = 0; count < batch_count; count++) for(size_t colBase = 0; colBase < N; colBase += 4) for(size_t row = 0; row < lda; row++) for(size_t colOffset = 0; colOffset < 4; colOffset++) A[(colBase * lda + 4 * row) + colOffset + (stride_a * count)] = temp[(colBase + colOffset) * lda + row + (stride_a * count)]; } /* ============================================================================================ */ /*! \brief Packs matricies into groups of 4 in N */ template void rocblas_packInt8(std::vector& A, size_t M, size_t N, size_t lda) { /* Assumes original matrix provided in column major order, where N is a multiple of 4 ---------- N ---------- | | 00 05 10 15 20 25 30 35 |00 05 10 15|20 25 30 35| | | 01 06 11 16 21 26 31 36 |01 06 11 16|21 26 31 36| l M 02 07 12 17 22 27 32 37 --> |02 07 12 17|22 27 32 37| d | 03 08 13 18 23 28 33 38 |03 08 13 18|23 28 33 38| a | 04 09 14 19 24 29 34 39 |04 09 14 19|24 29 34 39| | ** ** ** ** ** ** ** ** |** ** ** **|** ** ** **| | ** ** ** ** ** ** ** ** |** ** ** **|** ** ** **| Input : 00 01 02 03 04 ** ** 05 ... 38 39 ** ** Output: 00 05 10 15 01 06 11 16 ... ** ** ** ** */ // call general code with batch_count = 1 and stride_a = 0 rocblas_packInt8(A, M, N, 1, lda, 0); } /* ============================================================================================ */ /*! 
\brief matrix matrix initialization: copies from A into same position in B */ template void rocblas_copy_matrix(const T* A, T* B, size_t M, size_t N, size_t lda, size_t ldb, size_t stridea = 0, size_t strideb = 0, size_t batch_count = 1) { for(size_t i_batch = 0; i_batch < batch_count; i_batch++) for(size_t i = 0; i < M; ++i) for(size_t j = 0; j < N; ++j) B[i + j * ldb + i_batch * strideb] = A[i + j * lda + i_batch * stridea]; } rocSOLVER-rocm-5.5.1/clients/rocblascommon/rocblas_math.hpp000066400000000000000000000041341436600607200236520ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include #include #include #include #include /* ============================================================================================ */ // Helper function to truncate float to bfloat16 inline __host__ rocblas_bfloat16 float_to_bfloat16_truncate(float val) { union { float fp32; uint32_t int32; } u = {val}; rocblas_bfloat16 ret; ret.data = uint16_t(u.int32 >> 16); if((u.int32 & 0x7fff0000) == 0x7f800000 && u.int32 & 0xffff) ret.data |= 1; // Preserve signaling NaN return ret; } /* ============================================================================================ */ /*! \brief returns true if value is NaN */ template {}, int> = 0> inline bool rocblas_isnan(T) { return false; } template {} && !rocblas_is_complex, int> = 0> inline bool rocblas_isnan(T arg) { return std::isnan(arg); } template , int> = 0> inline bool rocblas_isnan(const T& arg) { return rocblas_isnan(std::real(arg)) || rocblas_isnan(std::imag(arg)); } inline bool rocblas_isnan(rocblas_half arg) { union { rocblas_half fp; uint16_t data; } x = {arg}; return (~x.data & 0x7c00) == 0 && (x.data & 0x3ff) != 0; } /* ============================================================================================ */ /*! 
\brief negate a value */ template inline T negate(T x) { return -x; } template <> inline rocblas_half negate(rocblas_half arg) { union { rocblas_half fp; uint16_t data; } x = {arg}; x.data ^= 0x8000; return x.fp; } template <> inline rocblas_bfloat16 negate(rocblas_bfloat16 x) { x.data ^= 0x8000; return x; } rocSOLVER-rocm-5.5.1/clients/rocblascommon/rocblas_random.hpp000066400000000000000000000113761436600607200242070ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2021 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "rocblas/rocblas.h" #include "rocblas_math.hpp" #include #include #include /* ============================================================================================ */ // Random number generator using rocblas_rng_t = std::mt19937; extern thread_local rocblas_rng_t rocblas_rng; extern const rocblas_rng_t rocblas_seed; extern const std::thread::id main_thread_id; // For the main thread, we use rocblas_seed; for other threads, we start with a // different seed but deterministically based on the thread id's hash function. inline rocblas_rng_t get_seed() { auto tid = std::this_thread::get_id(); return tid == main_thread_id ? rocblas_seed : rocblas_rng_t(std::hash{}(tid)); } // Reset the seed (mainly to ensure repeatability of failures in a given suite) inline void rocblas_seedrand() { rocblas_rng = get_seed(); } /* ============================================================================================ */ /*! 
\brief Random number generator which generates NaN values */ class rocblas_nan_rng { // Generate random NaN values template static T random_nan_data() { static_assert(sizeof(UINT_T) == sizeof(T), "Type sizes do not match"); union { UINT_T u; T fp; } x; do x.u = std::uniform_int_distribution{}(rocblas_rng); while(!(x.u & (((UINT_T)1 << SIG) - 1))); // Reject Inf (mantissa == 0) x.u |= (((UINT_T)1 << EXP) - 1) << SIG; // Exponent = all 1's return x.fp; // NaN with random bits } public: // Random integer template {}, int> = 0> explicit operator T() const { return std::uniform_int_distribution{}(rocblas_rng); } // Random NaN double explicit operator double() const { return random_nan_data(); } // Random NaN float explicit operator float() const { return random_nan_data(); } // Random NaN half explicit operator rocblas_half() const { return random_nan_data(); } // Random NaN bfloat16 explicit operator rocblas_bfloat16() const { return random_nan_data(); } explicit operator rocblas_float_complex() const { return {float(*this), float(*this)}; } explicit operator rocblas_double_complex() const { return {double(*this), double(*this)}; } }; /* ============================================================================================ */ /* generate random number :*/ /*! 
\brief generate a random number in range [1,2,3,4,5,6,7,8,9,10] */ template inline T random_generator() { return std::uniform_int_distribution(1, 10)(rocblas_rng); } // for rocblas_float_complex, generate two random ints (same behaviour as for // floats) template <> inline rocblas_float_complex random_generator() { return {float(std::uniform_int_distribution(1, 10)(rocblas_rng)), float(std::uniform_int_distribution(1, 10)(rocblas_rng))}; }; // for rocblas_double_complex, generate two random ints (same behaviour as for // doubles) template <> inline rocblas_double_complex random_generator() { return {double(std::uniform_int_distribution(1, 10)(rocblas_rng)), double(std::uniform_int_distribution(1, 10)(rocblas_rng))}; }; // for rocblas_half, generate float, and convert to rocblas_half /*! \brief generate a random number in range [-2,-1,0,1,2] */ template <> inline rocblas_half random_generator() { return rocblas_half(std::uniform_int_distribution(-2, 2)(rocblas_rng)); }; // for rocblas_bfloat16, generate float, and convert to rocblas_bfloat16 /*! \brief generate a random number in range [-2,-1,0,1,2] */ template <> inline rocblas_bfloat16 random_generator() { return rocblas_bfloat16(std::uniform_int_distribution(-2, 2)(rocblas_rng)); }; /*! \brief generate a random number in range [1,2,3] */ template <> inline int8_t random_generator() { return int8_t(std::uniform_int_distribution(1, 3)(rocblas_rng)); }; /*! \brief generate a random number in HPL-like [-0.5,0.5] doubles */ template inline T random_hpl_generator() { return std::uniform_real_distribution(-0.5, 0.5)(rocblas_rng); } rocSOLVER-rocm-5.5.1/clients/rocblascommon/rocblas_test.hpp000066400000000000000000000132011436600607200236730ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include // Suppress warnings about hipMalloc(), hipFree() except in rocblas-test and // rocblas-bench #if !defined(ROCSOLVER_CLIENTS_TEST) && !defined(ROCBLAS_BENCH) #undef hipMalloc #undef hipFree #endif #ifdef ROCSOLVER_CLIENTS_TEST #include // Extra macro so that macro arguments get expanded before calling Google Test #define CHECK_HIP_ERROR2(ERROR) ASSERT_EQ(ERROR, hipSuccess) #define CHECK_HIP_ERROR(ERROR) CHECK_HIP_ERROR2(ERROR) #define CHECK_DEVICE_ALLOCATION(ERROR) \ do \ { \ auto error = ERROR; \ if(error == hipErrorOutOfMemory) \ { \ SUCCEED() << LIMITED_MEMORY_STRING; \ return; \ } \ else if(error != hipSuccess) \ { \ fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error, \ __FILE__, __LINE__); \ return; \ } \ } while(0) #define CHECK_ALLOC_QUERY(STATUS) \ do \ { \ auto status__ = (STATUS); \ ASSERT_TRUE(status__ == rocblas_status_size_increased \ || status__ == rocblas_status_size_unchanged); \ } while(0) #define EXPECT_ROCBLAS_STATUS ASSERT_EQ #else // ROCSOLVER_CLIENTS_TEST inline void rocblas_expect_status(rocblas_status status, rocblas_status expect) { if(status != expect) { fmt::print(stderr, "rocBLAS status error: Expected {}, received {}\n", rocblas_status_to_string(expect), rocblas_status_to_string(status)); if(expect == rocblas_status_success) exit(EXIT_FAILURE); } } #define CHECK_HIP_ERROR(ERROR) \ do \ { \ auto error = ERROR; \ if(error != hipSuccess) \ { \ fmt::print(stderr, "error: {} ({}) at {}:{}\n", hipGetErrorString(error), error, \ __FILE__, __LINE__); \ rocblas_abort(); \ } \ } while(0) #define CHECK_ALLOC_QUERY(STATUS) \ do \ { \ auto status__ = (STATUS); \ if(!(status__ == rocblas_status_size_increased || status__ == rocblas_status_size_unchanged)) \ { \ fmt::print(stderr, \ "rocBLAS status error: Expected 
rocblas_status_size_unchanged or " \ "rocblas_status_size_increase,\nreceived {}\n", \ rocblas_status_to_string(status__)); \ rocblas_abort(); \ } \ } while(0) #define CHECK_DEVICE_ALLOCATION CHECK_HIP_ERROR #define EXPECT_ROCBLAS_STATUS rocblas_expect_status #endif // ROCSOLVER_CLIENTS_TEST #define CHECK_ROCBLAS_ERROR2(STATUS) EXPECT_ROCBLAS_STATUS(STATUS, rocblas_status_success) #define CHECK_ROCBLAS_ERROR(STATUS) CHECK_ROCBLAS_ERROR2(STATUS) rocSOLVER-rocm-5.5.1/clients/rocblascommon/rocblas_vector.hpp000066400000000000000000000103171436600607200242230ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2021 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "device_batch_vector.hpp" #include "device_strided_batch_vector.hpp" #include "device_vector.hpp" #include "host_batch_vector.hpp" #include "host_strided_batch_vector.hpp" #include "host_vector.hpp" //! //! @brief Random number with type deductions. //! template void random_generator(T& n) { n = random_generator(); } //! //! //! template void random_nan_generator(T& n) { n = T(rocblas_nan_rng()); } //! //! @brief Template for initializing a host //! (non_batched|batched|strided_batched)vector. //! @param that That vector. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init_template(U& that, bool seedReset = false) { if(seedReset) { rocblas_seedrand(); } for(rocblas_int batch_index = 0; batch_index < that.batch_count(); ++batch_index) { auto batched_data = that[batch_index]; auto inc = std::abs(that.inc()); auto n = that.n(); if(inc < 0) { batched_data -= (n - 1) * inc; } for(rocblas_int i = 0; i < n; ++i) { random_generator(batched_data[i * inc]); } } } //! //! @brief Template for initializing a host //! (non_batched|batched|strided_batched)vector with NaNs. //! @param that That vector. //! 
@param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init_nan_template(U& that, bool seedReset = false) { if(seedReset) { rocblas_seedrand(); } for(rocblas_int batch_index = 0; batch_index < that.batch_count(); ++batch_index) { auto batched_data = that[batch_index]; auto inc = std::abs(that.inc()); auto n = that.n(); if(inc < 0) { batched_data -= (n - 1) * inc; } for(rocblas_int i = 0; i < n; ++i) { random_nan_generator(batched_data[i * inc]); } } } //! //! @brief Initialize a host_strided_batch_vector. //! @param that The host strided batch vector. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init(host_strided_batch_vector& that, bool seedReset = false) { rocblas_init_template(that, seedReset); } //! //! @brief Initialize a host_batch_vector. //! @param that The host batch vector. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init(host_batch_vector& that, bool seedReset = false) { rocblas_init_template(that, seedReset); } //! //! @brief Initialize a host_vector. //! @param that The host vector. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init(host_vector& that, bool seedReset = false) { if(seedReset) { rocblas_seedrand(); } rocblas_init(that, 1, that.size(), 1); } //! //! @brief Initialize a host_strided_batch_vector with NaNs. //! @param that The host strided batch vector to be initialized. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init_nan(host_strided_batch_vector& that, bool seedReset = false) { rocblas_init_nan_template(that, seedReset); } //! //! @brief Initialize a host_strided_batch_vector with NaNs. //! @param that The host strided batch vector to be initialized. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! 
template void rocblas_init_nan(host_batch_vector& that, bool seedReset = false) { rocblas_init_nan_template(that, seedReset); } //! //! @brief Initialize a host_strided_batch_vector with NaNs. //! @param that The host strided batch vector to be initialized. //! @param seedReset reset he seed if true, do not reset the seed otherwise. //! template void rocblas_init_nan(host_vector& that, bool seedReset = false) { rocblas_init_nan_template(that, seedReset); } rocSOLVER-rocm-5.5.1/clients/samples/000077500000000000000000000000001436600607200173075ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/clients/samples/.clang-format000077500000000000000000000001101436600607200216550ustar00rootroot00000000000000# Manually format sample code --- Language: Cpp DisableFormat: true --- rocSOLVER-rocm-5.5.1/clients/samples/CMakeLists.txt000077500000000000000000000025311436600607200220530ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2016-2022 Advanced Micro Devices, Inc. 
# ######################################################################## # declare sample programs add_executable(example-c-basic example_basic.c ) add_executable(example-cpp-basic example_basic.cpp ) add_executable(example-c-graph example_graph.c ) add_executable(example-c-hmm example_hmm.c ) add_executable(example-cpp-logging example_logging.cpp ) add_executable(example-c-batched example_batched.c ) add_executable(example-c-strided-batched example_strided_batched.c ) # group sample programs by language set(c_samples example-c-basic example-c-graph example-c-hmm example-c-batched example-c-strided-batched ) set(cpp_samples example-cpp-basic example-cpp-logging ) # set flags for building the sample programs foreach(exe ${c_samples} ${cpp_samples} ${fortran_samples}) target_link_libraries(${exe} PRIVATE roc::rocsolver) rocm_install(TARGETS ${exe} COMPONENT samples) endforeach() foreach(exe ${cpp_samples}) set_target_properties(${exe} PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON CXX_EXTENSIONS OFF ) endforeach() foreach(exe ${c_samples}) set_target_properties(${exe} PROPERTIES C_STANDARD 99 C_STANDARD_REQUIRED ON C_EXTENSIONS OFF ) endforeach() rocSOLVER-rocm-5.5.1/clients/samples/example_basic.c000066400000000000000000000064171436600607200222570ustar00rootroot00000000000000#include // for hip functions #include // for all the rocsolver C interfaces and type declarations #include // for printf #include // for malloc // Example: Compute the QR Factorization of a matrix on the GPU double *create_example_matrix(rocblas_int *M_out, rocblas_int *N_out, rocblas_int *lda_out) { // a *very* small example input; not a very efficient use of the API const double A[3][3] = { { 12, -51, 4}, { 6, 167, -68}, { -4, 24, -41} }; const rocblas_int M = 3; const rocblas_int N = 3; const rocblas_int lda = 3; *M_out = M; *N_out = N; *lda_out = lda; // note: rocsolver matrices must be stored in column major format, // i.e. 
entry (i,j) should be accessed by hA[i + j*lda] double *hA = (double*)malloc(sizeof(double)*lda*N); for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < N; ++j) { // copy A (2D array) into hA (1D array, column-major) hA[i + j*lda] = A[i][j]; } } return hA; } // We use rocsolver_dgeqrf to factor a real M-by-N matrix, A. // See https://rocsolver.readthedocs.io/en/latest/api_lapackfunc.html#c.rocsolver_dgeqrf int main() { rocblas_int M; // rows rocblas_int N; // cols rocblas_int lda; // leading dimension double *hA = create_example_matrix(&M, &N, &lda); // input matrix on CPU // let's print the input matrix, just to see it printf("A = [\n"); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% .3f ", hA[i + j*lda]); } printf(";\n"); } printf("]\n"); // initialization rocblas_handle handle; rocblas_create_handle(&handle); // Some rocsolver functions may trigger rocblas to load its GEMM kernels. // You can preload the kernels by explicitly invoking rocblas_initialize // (e.g., to exclude one-time initialization overhead from benchmarking). // preload rocBLAS GEMM kernels (optional) // rocblas_initialize(); // calculate the sizes of our arrays size_t size_A = lda * (size_t)N; // count of elements in matrix A size_t size_piv = (M < N) ? 
M : N; // count of Householder scalars // allocate memory on GPU double *dA, *dIpiv; hipMalloc((void**)&dA, sizeof(double)*size_A); hipMalloc((void**)&dIpiv, sizeof(double)*size_piv); // copy data to GPU hipMemcpy(dA, hA, sizeof(double)*size_A, hipMemcpyHostToDevice); // compute the QR factorization on the GPU rocsolver_dgeqrf(handle, M, N, dA, lda, dIpiv); // copy the results back to CPU double *hIpiv = (double*)malloc(sizeof(double)*size_piv); // householder scalars on CPU hipMemcpy(hA, dA, sizeof(double)*size_A, hipMemcpyDeviceToHost); hipMemcpy(hIpiv, dIpiv, sizeof(double)*size_piv, hipMemcpyDeviceToHost); // the results are now in hA and hIpiv // we can print some of the results if we want to see them printf("R = [\n"); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% .3f ", (i <= j) ? hA[i + j*lda] : 0); } printf(";\n"); } printf("]\n"); // clean up free(hIpiv); hipFree(dA); hipFree(dIpiv); free(hA); rocblas_destroy_handle(handle); } rocSOLVER-rocm-5.5.1/clients/samples/example_basic.cpp000066400000000000000000000056001436600607200226100ustar00rootroot00000000000000#include // for std::min #include // for hip functions #include // for all the rocsolver C interfaces and type declarations #include // for size_t, printf #include // Example: Compute the QR Factorization of a matrix on the GPU void get_example_matrix(std::vector& hA, rocblas_int& M, rocblas_int& N, rocblas_int& lda) { // a *very* small example input; not a very efficient use of the API const double A[3][3] = { { 12, -51, 4}, { 6, 167, -68}, { -4, 24, -41} }; M = 3; N = 3; lda = 3; // note: rocsolver matrices must be stored in column major format, // i.e. entry (i,j) should be accessed by hA[i + j*lda] hA.resize(size_t(lda) * N); for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < N; ++j) { // copy A (2D array) into hA (1D array, column-major) hA[i + j*lda] = A[i][j]; } } } // We use rocsolver_dgeqrf to factor a real M-by-N matrix, A. 
// See https://rocsolver.readthedocs.io/en/latest/api_lapackfunc.html#c.rocsolver_dgeqrf int main() { rocblas_int M; // rows rocblas_int N; // cols rocblas_int lda; // leading dimension std::vector hA; // input matrix on CPU get_example_matrix(hA, M, N, lda); // let's print the input matrix, just to see it printf("A = [\n"); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% .3f ", hA[i + j*lda]); } printf(";\n"); } printf("]\n"); // initialization rocblas_handle handle; rocblas_create_handle(&handle); // calculate the sizes of our arrays size_t size_A = size_t(lda) * N; // count of elements in matrix A size_t size_piv = size_t(std::min(M, N)); // count of Householder scalars // allocate memory on GPU double *dA, *dIpiv; hipMalloc(&dA, sizeof(double)*size_A); hipMalloc(&dIpiv, sizeof(double)*size_piv); // copy data to GPU hipMemcpy(dA, hA.data(), sizeof(double)*size_A, hipMemcpyHostToDevice); // compute the QR factorization on the GPU rocsolver_dgeqrf(handle, M, N, dA, lda, dIpiv); // copy the results back to CPU std::vector hIpiv(size_piv); // array for householder scalars on CPU hipMemcpy(hA.data(), dA, sizeof(double)*size_A, hipMemcpyDeviceToHost); hipMemcpy(hIpiv.data(), dIpiv, sizeof(double)*size_piv, hipMemcpyDeviceToHost); // the results are now in hA and hIpiv // we can print some of the results if we want to see them printf("R = [\n"); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% .3f ", (i <= j) ? 
hA[i + j*lda] : 0); } printf(";\n"); } printf("]\n"); // clean up hipFree(dA); hipFree(dIpiv); rocblas_destroy_handle(handle); } rocSOLVER-rocm-5.5.1/clients/samples/example_batched.c000066400000000000000000000102471436600607200225640ustar00rootroot00000000000000#include // for hip functions #include // for all the rocsolver C interfaces and type declarations #include // for printf #include // for malloc // Example: Compute the QR Factorizations of a batch of matrices on the GPU double **create_example_matrices(rocblas_int *M_out, rocblas_int *N_out, rocblas_int *lda_out, rocblas_int *batch_count_out) { // a small example input const double A[2][3][3] = { // First input matrix { { 12, -51, 4}, { 6, 167, -68}, { -4, 24, -41} }, // Second input matrix { { 3, -12, 11}, { 4, -46, -2}, { 0, 5, 15} } }; const rocblas_int M = 3; const rocblas_int N = 3; const rocblas_int lda = 3; const rocblas_int batch_count = 2; *M_out = M; *N_out = N; *lda_out = lda; *batch_count_out = batch_count; // allocate space for input matrix data on CPU double **hA = (double**)malloc(sizeof(double*)*batch_count); hA[0] = (double*)malloc(sizeof(double)*lda*N); hA[1] = (double*)malloc(sizeof(double)*lda*N); for (size_t b = 0; b < batch_count; ++b) for (size_t i = 0; i < M; ++i) for (size_t j = 0; j < N; ++j) hA[b][i + j*lda] = A[b][i][j]; return hA; } // Use rocsolver_dgeqrf_batched to factor a batch of real M-by-N matrices. 
int main() { rocblas_int M; // rows rocblas_int N; // cols rocblas_int lda; // leading dimension rocblas_int batch_count; // number of matricies double **hA = create_example_matrices(&M, &N, &lda, &batch_count); // print the input matrices for (size_t b = 0; b < batch_count; ++b) { printf("A[%zu] = [\n", b); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% 4.f ", hA[b][i + j*lda]); } printf(";\n"); } printf("]\n"); } // initialization rocblas_handle handle; rocblas_create_handle(&handle); // preload rocBLAS GEMM kernels (optional) // rocblas_initialize(); // calculate the sizes of the arrays size_t size_A = lda * (size_t)N; // count of elements in each matrix A rocblas_stride strideP = (M < N) ? M : N; // stride of Householder scalar sets size_t size_piv = strideP * (size_t)batch_count; // elements in array for Householder scalars // allocate memory on the CPU for an array of pointers, // then allocate memory for each matrix on the GPU. double **A = (double**)malloc(sizeof(double*)*batch_count); for (rocblas_int b = 0; b < batch_count; ++b) hipMalloc((void**)&A[b], sizeof(double)*size_A); // allocate memory on GPU for the array of pointers and Householder scalars double **dA, *dIpiv; hipMalloc((void**)&dA, sizeof(double*)*batch_count); hipMalloc((void**)&dIpiv, sizeof(double)*size_piv); // copy each matrix to the GPU for (rocblas_int b = 0; b < batch_count; ++b) hipMemcpy(A[b], hA[b], sizeof(double)*size_A, hipMemcpyHostToDevice); // copy the array of pointers to the GPU hipMemcpy(dA, A, sizeof(double*)*batch_count, hipMemcpyHostToDevice); // compute the QR factorizations on the GPU rocsolver_dgeqrf_batched(handle, M, N, dA, lda, dIpiv, strideP, batch_count); // copy the results back to CPU double *hIpiv = (double*)malloc(sizeof(double)*size_piv); // householder scalars on CPU hipMemcpy(hIpiv, dIpiv, sizeof(double)*size_piv, hipMemcpyDeviceToHost); for (rocblas_int b = 0; b < batch_count; ++b) hipMemcpy(hA[b], A[b], 
sizeof(double)*size_A, hipMemcpyDeviceToHost); // the results are now in hA and hIpiv // print some of the results for (size_t b = 0; b < batch_count; ++b) { printf("R[%zu] = [\n", b); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% 4.f ", (i <= j) ? hA[b][i + j*lda] : 0); } printf(";\n"); } printf("]\n"); } // clean up free(hIpiv); for (rocblas_int b = 0; b < batch_count; ++b) free(hA[b]); free(hA); for (rocblas_int b = 0; b < batch_count; ++b) hipFree(A[b]); free(A); hipFree(dA); hipFree(dIpiv); rocblas_destroy_handle(handle); } rocSOLVER-rocm-5.5.1/clients/samples/example_graph.c000066400000000000000000000100551436600607200222700ustar00rootroot00000000000000#include // for hip functions #include // for all the rocsolver C interfaces and type declarations #include // for printf #include // for malloc // Example: Compute the QR Factorization of a matrix asynchronously on the GPU using the hipGraph API double *create_example_matrix(rocblas_int *M_out, rocblas_int *N_out, rocblas_int *lda_out) { // a *very* small example input; not a very efficient use of the API const double A[3][3] = { { 12, -51, 4}, { 6, 167, -68}, { -4, 24, -41} }; const rocblas_int M = 3; const rocblas_int N = 3; const rocblas_int lda = 3; *M_out = M; *N_out = N; *lda_out = lda; // note: rocsolver matrices must be stored in column major format, // i.e. entry (i,j) should be accessed by hA[i + j*lda] double *hA = (double*)malloc(sizeof(double)*lda*N); for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < N; ++j) { // copy A (2D array) into hA (1D array, column-major) hA[i + j*lda] = A[i][j]; } } return hA; } // We use rocsolver_dgeqrf to factor a real M-by-N matrix, A. 
// See https://rocsolver.readthedocs.io/en/latest/api_lapackfunc.html#c.rocsolver_dgeqrf int main() { const rocblas_int ITER_COUNT = 10; rocblas_int M; // rows rocblas_int N; // cols rocblas_int lda; // leading dimension double *hA = create_example_matrix(&M, &N, &lda); // input matrix on CPU // let's print the input matrix, just to see it printf("A = [\n"); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% .3f ", hA[i + j*lda]); } printf(";\n"); } printf("]\n"); // initialization rocblas_handle handle; rocblas_create_handle(&handle); // Some rocsolver functions may trigger rocblas to load its GEMM kernels. // You can preload the kernels by explicitly invoking rocblas_initialize // (e.g., to exclude one-time initialization overhead from benchmarking). // preload rocBLAS GEMM kernels (optional) // rocblas_initialize(); // calculate the sizes of our arrays size_t size_A = lda * (size_t)N; // count of elements in matrix A size_t size_piv = (M < N) ? M : N; // count of Householder scalars // allocate memory on GPU double *dA, *dIpiv; hipMalloc((void**)&dA, sizeof(double)*size_A); hipMalloc((void**)&dIpiv, sizeof(double)*size_piv); // copy data to GPU hipMemcpy(dA, hA, sizeof(double)*size_A, hipMemcpyHostToDevice); // compute the QR factorization on the GPU // create the stream object hipStream_t stream; hipStreamCreate(&stream); rocblas_set_stream(handle, stream); // create graph management objects hipGraph_t graph; rocblas_int graph_ready = 0; hipGraphExec_t exec; for (int i = 0; i < ITER_COUNT; i++) { if (!graph_ready) { hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal); rocsolver_dgeqrf(handle, M, N, dA, lda, dIpiv); // returns immediately hipStreamEndCapture(stream, &graph); hipGraphInstantiate(&exec, graph, NULL, NULL, 0); hipGraphDestroy(graph); graph_ready = 1; } hipGraphLaunch(exec, stream); } // copy the results back to CPU double *hIpiv = (double*)malloc(sizeof(double)*size_piv); hipMemcpy(hA, dA, 
sizeof(double)*size_A, hipMemcpyDeviceToHost); // will block until the stream is completed hipMemcpy(hIpiv, dIpiv, sizeof(double)*size_piv, hipMemcpyDeviceToHost); // the results are now in hA and hIpiv // we can print some of the results if we want to see them printf("R = [\n"); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% .3f ", (i <= j) ? hA[i + j*lda] : 0); } printf(";\n"); } printf("]\n"); // clean up free(hIpiv); hipFree(dA); hipFree(dIpiv); free(hA); hipGraphExecDestroy(exec); rocblas_destroy_handle(handle); // order matters: the handle must be destroyed before the stream hipStreamDestroy(stream); } rocSOLVER-rocm-5.5.1/clients/samples/example_hmm.c000066400000000000000000000066421436600607200217570ustar00rootroot00000000000000#include // for hip functions #include // for all the rocsolver C interfaces and type declarations #include // for printf #include // for malloc // Example: Compute the QR Factorization of a matrix on the GPU // using unified memory (via hipMallocManaged) double* create_example_matrix(rocblas_int *M_out, rocblas_int *N_out, rocblas_int *lda_out) { // a *very* small example input; not a very efficient use of the API const double A_source[3][3] = { { 12, -51, 4}, { 6, 167, -68}, { -4, 24, -41} }; const rocblas_int M = 3; const rocblas_int N = 3; const rocblas_int lda = 3; *M_out = M; *N_out = N; *lda_out = lda; // note: rocsolver matrices must be stored in column major format, // i.e. entry (i,j) should be accessed by hA[i + j*lda] double* A; hipMallocManaged((void**)&A, sizeof(double)*lda*N, hipMemAttachGlobal); for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < N; ++j) { // copy A (2D array) into hA (1D array, column-major) A[i + j*lda] = A_source[i][j]; } } return A; } // We use rocsolver_dgeqrf to factor a real M-by-N matrix, A. 
// See https://rocsolver.readthedocs.io/en/latest/api_lapackfunc.html#c.rocsolver_dgeqrf int main() { rocblas_int M; // rows rocblas_int N; // cols rocblas_int lda; // leading dimension double* A = create_example_matrix(&M, &N, &lda); // input matrix on CPU // let's print the input matrix, just to see it printf("A = [\n"); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% .3f ", A[i + j*lda]); } printf(";\n"); } printf("]\n"); // initialization rocblas_handle handle; rocblas_create_handle(&handle); // check for enablement of managed memory int deviceID, hmm_enabled; hipGetDevice(&deviceID); hipDeviceGetAttribute(&hmm_enabled, hipDeviceAttributeManagedMemory, deviceID); if (!hmm_enabled) { printf("Managed memory not enabled on device %i\n", deviceID); rocblas_destroy_handle(handle); return 0; } // calculate the sizes of our arrays size_t size_piv = (M < N) ? M : N; // count of Householder scalars // allocate memory double *ipiv; hipMallocManaged((void**)&ipiv, sizeof(double)*size_piv, hipMemAttachGlobal); // determine workspace size size_t size_W; rocblas_start_device_memory_size_query(handle); rocsolver_dgetrf(handle, M, N, NULL, lda, NULL, NULL); rocblas_stop_device_memory_size_query(handle, &size_W); // create custom workspace double *work; hipMallocManaged((void**)&work, size_W, hipMemAttachGlobal); rocblas_set_workspace(handle, work, size_W); // compute the QR factorization on the GPU hipStream_t stream; rocblas_get_stream(handle, &stream); rocsolver_dgeqrf(handle, M, N, A, lda, ipiv); hipStreamSynchronize(stream); // the results are now in A and ipiv // we can print some of the results if we want to see them printf("R = [\n"); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% .3f ", (i <= j) ? 
A[i + j*lda] : 0); } printf(";\n"); } printf("]\n"); // clean up hipFree(A); hipFree(ipiv); hipFree(work); rocblas_destroy_handle(handle); } rocSOLVER-rocm-5.5.1/clients/samples/example_logging.cpp000066400000000000000000000057731436600607200231700ustar00rootroot00000000000000#include // for std::min #include // for hip functions #include // for all the rocsolver C interfaces and type declarations #include // for size_t, printf #include // Example: Compute the QR Factorization of a matrix on the GPU void get_example_matrix(std::vector& hA, rocblas_int& M, rocblas_int& N, rocblas_int& lda) { // a *very* small example input; not a very efficient use of the API const double A[3][3] = { { 12, -51, 4}, { 6, 167, -68}, { -4, 24, -41} }; M = 3; N = 3; lda = 3; // note: rocsolver matrices must be stored in column major format, // i.e. entry (i,j) should be accessed by hA[i + j*lda] hA.resize(size_t(lda) * N); for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < N; ++j) { // copy A (2D array) into hA (1D array, column-major) hA[i + j*lda] = A[i][j]; } } } // We use rocsolver_dgeqrf to factor a real M-by-N matrix, A. 
// See https://rocsolver.readthedocs.io/en/latest/api_lapackfunc.html#c.rocsolver_dgeqrf // and https://rocsolver.readthedocs.io/en/latest/userguide_logging.html int main() { rocblas_int M; // rows rocblas_int N; // cols rocblas_int lda; // leading dimension std::vector hA; // input matrix on CPU get_example_matrix(hA, M, N, lda); // initialization rocblas_handle handle; rocblas_create_handle(&handle); rocsolver_log_begin(); // calculate the sizes of our arrays size_t size_A = size_t(lda) * N; // count of elements in matrix A size_t size_piv = size_t(std::min(M, N)); // count of Householder scalars // allocate memory on GPU double *dA, *dIpiv; hipMalloc(&dA, sizeof(double)*size_A); hipMalloc(&dIpiv, sizeof(double)*size_piv); // copy data to GPU hipMemcpy(dA, hA.data(), sizeof(double)*size_A, hipMemcpyHostToDevice); // begin trace logging and profile logging (max depth = 4) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_trace | rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); rocsolver_log_set_max_levels(4); // compute the QR factorization on the GPU rocsolver_dgeqrf(handle, M, N, dA, lda, dIpiv); // stop logging, print profile results, and clear the profile data rocsolver_log_flush_profile(); rocsolver_log_restore_defaults(); // copy data to GPU hipMemcpy(dA, hA.data(), sizeof(double)*size_A, hipMemcpyHostToDevice); // begin bench logging and profile logging (max depth = 1) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_bench | rocblas_layer_mode_log_profile); // compute the QR factorization on the GPU rocsolver_dgeqrf(handle, M, N, dA, lda, dIpiv); // stop logging and print profile results rocsolver_log_write_profile(); rocsolver_log_restore_defaults(); // clean up hipFree(dA); hipFree(dIpiv); rocsolver_log_end(); rocblas_destroy_handle(handle); } rocSOLVER-rocm-5.5.1/clients/samples/example_strided_batched.c000066400000000000000000000072601436600607200243030ustar00rootroot00000000000000#include // for hip functions #include // for all 
the rocsolver C interfaces and type declarations #include // for printf #include // for malloc // Example: Compute the QR Factorizations of an array of matrices on the GPU double *create_example_matrices(rocblas_int *M_out, rocblas_int *N_out, rocblas_int *lda_out, rocblas_stride *strideA_out, rocblas_int *batch_count_out) { const double A[2][3][3] = { // First input matrix { { 12, -51, 4}, { 6, 167, -68}, { -4, 24, -41} }, // Second input matrix { { 3, -12, 11}, { 4, -46, -2}, { 0, 5, 15} } }; const rocblas_int M = 3; const rocblas_int N = 3; const rocblas_int lda = 3; const rocblas_stride strideA = lda * N; const rocblas_int batch_count = 2; *M_out = M; *N_out = N; *lda_out = lda; *strideA_out = strideA; *batch_count_out = batch_count; // allocate space for input matrix data on CPU double *hA = (double*)malloc(sizeof(double)*strideA*batch_count); // copy A (3D array) into hA (1D array, column-major) for (size_t b = 0; b < batch_count; ++b) for (size_t i = 0; i < M; ++i) for (size_t j = 0; j < N; ++j) hA[i + j*lda + b*strideA] = A[b][i][j]; return hA; } // Use rocsolver_dgeqrf_strided_batched to factor an array of real M-by-N matrices. int main() { rocblas_int M; // rows rocblas_int N; // cols rocblas_int lda; // leading dimension rocblas_stride strideA; // stride from start of one matrix to the next rocblas_int batch_count; // number of matricies double *hA = create_example_matrices(&M, &N, &lda, &strideA, &batch_count); // print the input matrices for (size_t b = 0; b < batch_count; ++b) { printf("A[%zu] = [\n", b); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% 4.f ", hA[i + j*lda + strideA*b]); } printf(";\n"); } printf("]\n"); } // initialization rocblas_handle handle; rocblas_create_handle(&handle); // preload rocBLAS GEMM kernels (optional) // rocblas_initialize(); // calculate the sizes of our arrays size_t size_A = strideA * (size_t)batch_count; // elements in array for matrices rocblas_stride strideP = (M < N) ? 
M : N; // stride of Householder scalar sets size_t size_piv = strideP * (size_t)batch_count; // elements in array for Householder scalars // allocate memory on GPU double *dA, *dIpiv; hipMalloc((void**)&dA, sizeof(double)*size_A); hipMalloc((void**)&dIpiv, sizeof(double)*size_piv); // copy data to GPU hipMemcpy(dA, hA, sizeof(double)*size_A, hipMemcpyHostToDevice); // compute the QR factorizations on the GPU rocsolver_dgeqrf_strided_batched(handle, M, N, dA, lda, strideA, dIpiv, strideP, batch_count); // copy the results back to CPU double *hIpiv = (double*)malloc(sizeof(double)*size_piv); // householder scalars on CPU hipMemcpy(hA, dA, sizeof(double)*size_A, hipMemcpyDeviceToHost); hipMemcpy(hIpiv, dIpiv, sizeof(double)*size_piv, hipMemcpyDeviceToHost); // the results are now in hA and hIpiv // print some of the results for (size_t b = 0; b < batch_count; ++b) { printf("R[%zu] = [\n", b); for (size_t i = 0; i < M; ++i) { printf(" "); for (size_t j = 0; j < N; ++j) { printf("% 4.f ", (i <= j) ? hA[i + j*lda + strideA*b] : 0); } printf(";\n"); } printf("]\n"); } // clean up free(hIpiv); hipFree(dA); hipFree(dIpiv); free(hA); rocblas_destroy_handle(handle); } rocSOLVER-rocm-5.5.1/cmake/000077500000000000000000000000001436600607200152625ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/cmake/armor-config.cmake000066400000000000000000000021761436600607200206550ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2020-2021 Advanced Micro Devices, Inc. # ######################################################################## # Enables increasingly expensive runtime correctness checks # 0 - Nothing # 1 - Inexpensive correctness checks (extra assertions, etc..) # Note: Some checks are added by the optimizer, so it can help to build # with optimizations enabled. e.g. 
-Og # 2 - Expensive correctness checks (debug iterators) macro(add_armor_flags target level) if(UNIX AND "${level}" GREATER "0") if("${level}" GREATER "1") # Building with std debug iterators is enabled by the defines below, but # requires building C++ dependencies with the same defines. target_compile_definitions(${target} PRIVATE _GLIBCXX_DEBUG ) endif() # Note that _FORTIFY_SOURCE does not work unless optimizations are enabled target_compile_definitions(${target} PRIVATE $<$>:_FORTIFY_SOURCE=1> _GLIBCXX_ASSERTIONS ROCSOLVER_VERIFY_ASSUMPTIONS ) endif() endmacro() rocSOLVER-rocm-5.5.1/cmake/get-rocm-cmake.cmake000066400000000000000000000032371436600607200210640ustar00rootroot00000000000000# This finds the rocm-cmake project, and installs it if not found # rocm-cmake contains common cmake code for rocm projects to help setup and install set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern) # By default, rocm software stack is expected at /opt/rocm # set environment variable ROCM_PATH to change location if(NOT ROCM_PATH) set(ROCM_PATH /opt/rocm) endif() find_package(ROCM 0.7.3 CONFIG QUIET PATHS ${ROCM_PATH}) if(NOT ROCM_FOUND) set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") set(rocm_cmake_url "https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip") set(rocm_cmake_path "${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}") set(rocm_cmake_archive "${rocm_cmake_path}.zip") file(DOWNLOAD "${rocm_cmake_url}" "${rocm_cmake_archive}" STATUS status LOG log) list(GET status 0 status_code) list(GET status 1 status_string) if(status_code EQUAL 0) message(STATUS "downloading... 
done") else() message(FATAL_ERROR "error: downloading\n'${rocm_cmake_url}' failed status_code: ${status_code} status_string: ${status_string} log: ${log}\n") endif() execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzvf "${rocm_cmake_archive}" WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}) execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake . WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} ) execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}) find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake ) endif() rocSOLVER-rocm-5.5.1/cmake/os-detection.cmake000066400000000000000000000015501436600607200206620ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2019-2021 Advanced Micro Devices, Inc. # ######################################################################## function(get_os_id OS_ID) set(_os_id "unknown") if(EXISTS "/etc/os-release") read_key("ID" _os_id) endif() if(_os_id STREQUAL "opensuse-leap") set(_os_id "sles") endif() set(${OS_ID} ${_os_id} PARENT_SCOPE) set(${OS_ID}_${_os_id} TRUE PARENT_SCOPE) endfunction() function(read_key KEYVALUE OUTPUT) # Finds the line with the keyvalue file(STRINGS /etc/os-release _keyvalue_line REGEX "^${KEYVALUE}=") # Remove keyvalue= string(REGEX REPLACE "^${KEYVALUE}=\"?(.*)" "\\1" _output "${_keyvalue_line}") # Remove trailing quote string(REGEX REPLACE "\"$" "" _output "${_output}") set(${OUTPUT} ${_output} PARENT_SCOPE) endfunction() rocSOLVER-rocm-5.5.1/cmake/util.cmake000066400000000000000000000031631436600607200172440ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2021 Advanced Micro Devices, Inc. 
# ######################################################################## # A helper function to prefix a source list of files with a common path # into a new list (non-destructive) function(prepend_path prefix source_list_of_files return_list_of_files) foreach(file ${${source_list_of_files}}) if(IS_ABSOLUTE ${file}) list(APPEND new_list ${file}) else() list(APPEND new_list ${prefix}/${file}) endif() endforeach() set(${return_list_of_files} ${new_list} PARENT_SCOPE) endfunction() # Search through the location properties to find one that is set function(get_imported_target_location result_variable imported_target) string(TOUPPER "${CMAKE_BUILD_TYPE}" config) set(properties_to_search "IMPORTED_LOCATION" "IMPORTED_LOCATION_${config}" "IMPORTED_LOCATION_DEBUG" "IMPORTED_LOCATION_RELEASE" "IMPORTED_LOCATION_RELWITHDEBINFO" "IMPORTED_LOCATION_MINSIZEREL" ) foreach(property ${properties_to_search}) get_target_property(location "${imported_target}" "${property}") if(location) set("${result_variable}" "${location}" PARENT_SCOPE) return() endif() endforeach() set("${result_variable}" "${result_variable}-NOTFOUND" PARENT_SCOPE) endfunction() # Define an opposite way of specifying an existing option. # This may be useful for compatibility. macro(option_opposite option opposite) if(DEFINED "${opposite}") if(${opposite}) set("${option}" OFF) else() set("${option}" ON) endif() endif() endmacro() rocSOLVER-rocm-5.5.1/common/000077500000000000000000000000001436600607200154725ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/common/CMakeLists.txt000066400000000000000000000031441436600607200202340ustar00rootroot00000000000000# ######################################################################## # Copyright (c) 2021 Advanced Micro Devices, Inc. 
# ######################################################################## add_library(rocsolver-common INTERFACE) target_include_directories(rocsolver-common INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include ) set(source_files common_host_helpers.cpp ) prepend_path("${CMAKE_CURRENT_SOURCE_DIR}/src/" source_files source_paths) target_sources(rocsolver-common INTERFACE ${source_paths}) target_compile_definitions(rocsolver-common INTERFACE __HIP_HCC_COMPAT_MODE__=1) target_compile_options(rocsolver-common INTERFACE -Wno-unused-result # TODO: address [[nodiscard]] warnings ) if(WIN32) target_compile_definitions(rocsolver-common INTERFACE WIN32_LEAN_AND_MEAN _CRT_SECURE_NO_WARNINGS NOMINMAX __HIP_ROCclr__=1 __HIP_PLATFORM_AMD__=1 ) target_compile_options(rocsolver-common INTERFACE -fms-extensions -fms-compatibility -Wno-ignored-attributes -Wno-unused-command-line-argument ) endif() if(WERROR) target_compile_options(rocsolver-common INTERFACE -Werror=vla -Werror=reorder -Werror=return-type ) endif() if(BUILD_ADDRESS_SANITIZER) target_compile_options(rocsolver-common INTERFACE -fsanitize=address -shared-libasan ) target_link_options(rocsolver-common INTERFACE -fsanitize=address -shared-libasan -fuse-ld=lld ) endif() if(BUILD_CODE_COVERAGE) target_compile_options(rocsolver-common INTERFACE -fprofile-arcs -ftest-coverage ) target_link_options(rocsolver-common INTERFACE --coverage) endif() rocSOLVER-rocm-5.5.1/common/include/000077500000000000000000000000001436600607200171155ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/common/include/common_host_helpers.hpp000066400000000000000000000267121436600607200237050ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include #include #include #include #include #include #include #include #include "fmt_rocblas_types.hpp" #include "rocblas_utility.hpp" /* * =========================================================================== * common location for functions that are used by both the rocSOLVER * library and rocSOLVER client code. * =========================================================================== */ /* =============================================================================================== */ /* Number properties functions. */ template constexpr double get_epsilon() { using S = decltype(std::real(T{})); return std::numeric_limits::epsilon(); } template constexpr double get_safemin() { using S = decltype(std::real(T{})); auto eps = get_epsilon(); auto s1 = std::numeric_limits::min(); auto s2 = 1 / std::numeric_limits::max(); if(s2 > s1) return s2 * (1 + eps); return s1; } /* =============================================================================================== */ /* Timing functions. */ /*! \brief CPU Timer(in microsecond): synchronize with the default device and * return wall time */ double get_time_us(); /*! \brief CPU Timer(in microsecond): synchronize with given queue/stream and * return wall time */ double get_time_us_sync(hipStream_t stream); /*! \brief CPU Timer(in microsecond): no GPU synchronization and return wall * time */ double get_time_us_no_sync(); /* =============================================================================================== */ /* Print functions. */ inline void pairs_to_string(std::string& str, const char* sep) { // do nothing } template void pairs_to_string(std::string& str, const char* sep, T1 arg1, T2 arg2, Ts... 
args) { str += fmt::format("{} {}", arg1, arg2); if(sizeof...(Ts) > 0) { str += sep; pairs_to_string(str, sep, args...); } } /** Set of helpers to print out data hosted in the CPU and/or the GPU **/ /***********************************************************************/ /*! \brief Print provided data into specified stream */ template void print_to_stream(std::ostream& os, const std::string name, const rocblas_int m, const rocblas_int n, T* A, const rocblas_int lda, const rocblas_fill uplo) { std::string s; bool empty = name.empty(); if(!empty) s += fmt::format("{}-by-{} matrix: {}\n", m, n, name); if(uplo == rocblas_fill_full) { // normal case for(int i = 0; i < m; i++) { if(!empty) s += " "; for(int j = 0; j < n; j++) { s += fmt::format("{}", A[j * lda + i]); if(j < n - 1) s += ", "; } s += '\n'; } } else { // symmetric case for(int i = 0; i < min(m, n); i++) { if(!empty) s += " "; for(int j = 0; j < min(m, n); j++) { if(uplo == rocblas_fill_upper) { if(i < j) s += fmt::format("{}", A[j * lda + i]); else s += fmt::format("{}", A[i * lda + j]); } else { if(i > j) s += fmt::format("{}", A[j * lda + i]); else s += fmt::format("{}", A[i * lda + j]); } if(j < n - 1) s += ", "; } s += '\n'; } } s += '\n'; os << s; os.flush(); } /*! \brief Print data from a normal or strided_batched array on the GPU to screen*/ template void print_device_matrix(std::ostream& os, const std::string name, const rocblas_int m, const rocblas_int n, T* A, const rocblas_int lda, const rocblas_stride stride = 1, const rocblas_int idx = 0, const rocblas_fill uplo = rocblas_fill_full) { std::vector hA(lda * n); hipMemcpy(hA.data(), A + idx * stride, sizeof(T) * lda * n, hipMemcpyDeviceToHost); print_to_stream(os, name, m, n, hA.data(), lda, uplo); } /*! 
\brief Print data from a batched array on the GPU to screen*/ template void print_device_matrix(std::ostream& os, const std::string name, const rocblas_int m, const rocblas_int n, T* const A[], const rocblas_int lda, const rocblas_stride stride = 1, const rocblas_int idx = 0, const rocblas_fill uplo = rocblas_fill_full) { std::vector hA(lda * n); T* AA[1]; hipMemcpy(AA, A + idx, sizeof(T*), hipMemcpyDeviceToHost); hipMemcpy(hA.data(), AA[0], sizeof(T) * lda * n, hipMemcpyDeviceToHost); print_to_stream(os, name, m, n, hA.data(), lda, uplo); } /*! \brief Print data from a normal or strided_batched array on the GPU to file*/ template void print_device_matrix(const std::string file, const rocblas_int m, const rocblas_int n, T* A, const rocblas_int lda, const rocblas_stride stride = 1, const rocblas_int idx = 0, const rocblas_fill uplo = rocblas_fill_full) { std::ofstream os(file); std::vector hA(lda * n); hipMemcpy(hA.data(), A + idx * stride, sizeof(T) * lda * n, hipMemcpyDeviceToHost); print_to_stream(os, "", m, n, hA.data(), lda, uplo); } /*! \brief Print data from a batched array on the GPU to file*/ template void print_device_matrix(const std::string file, const rocblas_int m, const rocblas_int n, T* const A[], const rocblas_int lda, const rocblas_stride stride = 1, const rocblas_int idx = 0, const rocblas_fill uplo = rocblas_fill_full) { std::ofstream os(file); std::vector hA(lda * n); T* AA[1]; hipMemcpy(AA, A + idx, sizeof(T*), hipMemcpyDeviceToHost); hipMemcpy(hA.data(), AA[0], sizeof(T) * lda * n, hipMemcpyDeviceToHost); print_to_stream(os, "", m, n, hA.data(), lda, uplo); } /*! 
\brief Print data from a normal or strided_batched array on the CPU to screen*/ template void print_host_matrix(std::ostream& os, const std::string name, const rocblas_int m, const rocblas_int n, T* A, const rocblas_int lda, const rocblas_stride stride = 1, const rocblas_int idx = 0, const rocblas_fill uplo = rocblas_fill_full) { print_to_stream(os, name, m, n, A + idx * stride, lda, uplo); } /*! \brief Print data from a batched array on the CPU to screen*/ template void print_host_matrix(std::ostream& os, const std::string name, const rocblas_int m, const rocblas_int n, T* const A[], const rocblas_int lda, const rocblas_stride stride = 1, const rocblas_int idx = 0, const rocblas_fill uplo = rocblas_fill_full) { print_to_stream(os, name, m, n, A[idx], lda, uplo); } /*! \brief Print data from a normal or strided_batched array on the CPU to file*/ template void print_host_matrix(const std::string file, const rocblas_int m, const rocblas_int n, T* A, const rocblas_int lda, const rocblas_stride stride = 1, const rocblas_int idx = 0, const rocblas_fill uplo = rocblas_fill_full) { std::ofstream os(file); print_to_stream(os, "", m, n, A + idx * stride, lda, uplo); } /*! \brief Print data from a batched array on the CPU to file*/ template void print_host_matrix(const std::string file, const rocblas_int m, const rocblas_int n, T* const A[], const rocblas_int lda, const rocblas_stride stride = 1, const rocblas_int idx = 0, const rocblas_fill uplo = rocblas_fill_full) { std::ofstream os(file); print_to_stream(os, "", m, n, A[idx], lda, uplo); } /*! 
\brief Debugging purpose, print out CPU and GPU result matrix */ /*******************************************************************/ template void print_host_matrix(std::ostream& os, const std::string name, const rocblas_int m, const rocblas_int n, T* CPU_result, T* GPU_result, const rocblas_int lda) { std::string s; bool empty = name.empty(); if(!empty) s += fmt::format("{}-by-{} matrix: {}\n", m, n, name); for(size_t j = 0; j < n; j++) { for(size_t i = 0; i < m; i++) { s += fmt::format("matrix row {}, col {}, CPU result={}, GPU result={}\n", i, j, CPU_result[j * lda + i], GPU_result[j * lda + i]); } } s += '\n'; os << s; os.flush(); } template void print_host_matrix(std::ostream& os, const std::string name, const rocblas_int m, const rocblas_int n, T* CPU_result, T* GPU_result, const rocblas_int lda, double error_tolerance) { std::string s; bool empty = name.empty(); if(!empty) s += fmt::format("{}-by-{} matrix: {}\n", m, n, name); for(size_t j = 0; j < n; j++) { for(size_t i = 0; i < m; i++) { T comp = (CPU_result[j * lda + i] - GPU_result[j * lda + i]) / CPU_result[j * lda + i]; if(std::abs(comp) > error_tolerance) s += fmt::format("matrix row {}, col {}, CPU result={}, GPU result={}\n", i, j, CPU_result[j * lda + i], GPU_result[j * lda + i]); } } s += '\n'; os << s; os.flush(); } rocSOLVER-rocm-5.5.1/common/include/fmt_rocblas_types.hpp000066400000000000000000000017261436600607200233530ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2021-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include #include #include /* The format function for user-defined types cannot be const before fmt v8.0 but must be const in fmt v8.1 if the type is used in a tuple. 
*/ #if FMT_VERSION < 80000 #define ROCSOLVER_FMT_CONST #else #define ROCSOLVER_FMT_CONST const #endif namespace fmt { template struct formatter> : formatter { template auto format(const rocblas_complex_num& value, FormatCtx& ctx) ROCSOLVER_FMT_CONST { formatter::format(value.real(), ctx); format_to(ctx.out(), "+"); formatter::format(value.imag(), ctx); format_to(ctx.out(), "*i"); return ctx.out(); } }; } rocSOLVER-rocm-5.5.1/common/include/rocblas_utility.hpp000066400000000000000000000343721436600607200230470ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2016-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include #include #include #include #include #include #include #include #pragma STDC CX_LIMITED_RANGE ON // half vectors typedef rocblas_half rocblas_half8 __attribute__((ext_vector_type(8))); typedef rocblas_half rocblas_half2 __attribute__((ext_vector_type(2))); #ifndef ROCSOLVER_CLIENTS_TEST extern "C" __device__ rocblas_half2 llvm_fma_v2f16(rocblas_half2, rocblas_half2, rocblas_half2) __asm("llvm.fma.v2f16"); __device__ inline rocblas_half2 rocblas_fmadd_half2(rocblas_half2 multiplier, rocblas_half2 multiplicand, rocblas_half2 addend) { return llvm_fma_v2f16(multiplier, multiplicand, addend); } // Conjugate a value. For most types, simply return argument; for // rocblas_float_complex and rocblas_double_complex, return std::conj(z) template , int> = 0> __device__ __host__ inline T conj(const T& z) { return z; } template , int> = 0> __device__ __host__ inline T conj(const T& z) { return std::conj(z); } // Load a scalar. If the argument is a pointer, dereference it; otherwise copy // it. Allows the same kernels to be used for host and device scalars. 
// For host scalars template __forceinline__ __device__ __host__ T load_scalar(T x) { return x; } // For device scalars template __forceinline__ __device__ __host__ T load_scalar(const T* xp) { return *xp; } // For rocblas_half2, we broadcast a fp16 across two halves template <> __forceinline__ __device__ __host__ rocblas_half2 load_scalar(const rocblas_half2* xp) { auto x = *reinterpret_cast(xp); return {x, x}; } // Load a batched scalar. This only works on the device. Used for batched // functions which may pass an array of scalars rather than a single scalar. // For device side array of scalars template __forceinline__ __device__ __host__ T load_scalar(T* x, rocblas_int idx, rocblas_int inc) { return x[idx * inc]; } // Overload for single scalar value template __forceinline__ __device__ __host__ T load_scalar(T x, rocblas_int idx, rocblas_int inc) { return x; } // Load a pointer from a batch. If the argument is a T**, use block to index it // and add the offset, if the argument is a T*, add block * stride to pointer // and add offset. // For device array of device pointers // For device pointers template __forceinline__ __device__ __host__ T* load_ptr_batch(T* p, rocblas_int block, ptrdiff_t offset, rocblas_stride stride) { return p + block * stride + offset; } // For device array of device pointers template __forceinline__ __device__ __host__ T* load_ptr_batch(T* const* p, rocblas_int block, ptrdiff_t offset, rocblas_stride stride) { return p[block] + offset; } template __forceinline__ __device__ __host__ T* load_ptr_batch(T** p, rocblas_int block, ptrdiff_t offset, rocblas_stride stride) { return p[block] + offset; } /* // Helper for batched functions with temporary memory, currently just trsm and // trsv. Copys addresses to array of pointers for batched versions. 
template ROCSOLVER_KERNEL void setup_batched_array_kernel(T* src, rocblas_stride src_stride, T* dst[]) { dst[hipBlockIdx_x] = src + hipBlockIdx_x * src_stride; } template void setup_batched_array(hipStream_t stream, T* src, rocblas_stride src_stride, T* dst[], rocblas_int batch_count) { dim3 grid(batch_count); dim3 threads(BLOCK); hipLaunchKernelGGL(setup_batched_array_kernel, grid, threads, 0, stream, src, src_stride, dst); } template ROCSOLVER_KERNEL void setup_device_pointer_array_kernel(T* src, rocblas_stride src_stride, T* dst[], rocblas_int batch_count) { ptrdiff_t tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; if(tid < batch_count) dst[tid] = src + tid * src_stride; } template void setup_device_pointer_array(hipStream_t stream, T* src, rocblas_stride src_stride, T* dst[], rocblas_int batch_count) { int NB = 256; dim3 grid((batch_count - 1) / NB + 1); dim3 threads(NB); hipLaunchKernelGGL(setup_device_pointer_array_kernel, grid, threads, 0, stream, src, src_stride, dst, batch_count); } */ #endif // ROCSOLVER_CLIENTS_TEST inline bool isAligned(const void* pointer, size_t byte_count) { return reinterpret_cast(pointer) % byte_count == 0; } // clang-format off // // return letter N,T,C in place of rocblas_operation enum // constexpr char rocblas_transpose_letter(rocblas_operation trans) // { // switch(trans) // { // case rocblas_operation_none: return 'N'; // case rocblas_operation_transpose: return 'T'; // case rocblas_operation_conjugate_transpose: return 'C'; // } // return ' '; // } // // return letter L, R, B in place of rocblas_side enum // constexpr char rocblas_side_letter(rocblas_side side) // { // switch(side) // { // case rocblas_side_left: return 'L'; // case rocblas_side_right: return 'R'; // case rocblas_side_both: return 'B'; // } // return ' '; // } // // return letter U, L, B in place of rocblas_fill enum // constexpr char rocblas_fill_letter(rocblas_fill fill) // { // switch(fill) // { // case rocblas_fill_upper: return 'U'; // case 
rocblas_fill_lower: return 'L'; // case rocblas_fill_full: return 'F'; // } // return ' '; // } // // return letter N, U in place of rocblas_diagonal enum // constexpr char rocblas_diag_letter(rocblas_diagonal diag) // { // switch(diag) // { // case rocblas_diagonal_non_unit: return 'N'; // case rocblas_diagonal_unit: return 'U'; // } // return ' '; // } // return precision string for rocblas_datatype constexpr const char* rocblas_datatype_string(rocblas_datatype type) { switch(type) { case rocblas_datatype_f16_r: return "f16_r"; case rocblas_datatype_f32_r: return "f32_r"; case rocblas_datatype_f64_r: return "f64_r"; case rocblas_datatype_f16_c: return "f16_c"; case rocblas_datatype_f32_c: return "f32_c"; case rocblas_datatype_f64_c: return "f64_c"; case rocblas_datatype_i8_r: return "i8_r"; case rocblas_datatype_u8_r: return "u8_r"; case rocblas_datatype_i32_r: return "i32_r"; case rocblas_datatype_u32_r: return "u32_r"; case rocblas_datatype_i8_c: return "i8_c"; case rocblas_datatype_u8_c: return "u8_c"; case rocblas_datatype_i32_c: return "i32_c"; case rocblas_datatype_u32_c: return "u32_c"; case rocblas_datatype_bf16_r: return "bf16_r"; case rocblas_datatype_bf16_c: return "bf16_c"; case rocblas_datatype_invalid: return "invalid"; } return "invalid"; } // return sizeof rocblas_datatype constexpr size_t rocblas_sizeof_datatype(rocblas_datatype type) { switch(type) { case rocblas_datatype_f16_r: return 2; case rocblas_datatype_f32_r: return 4; case rocblas_datatype_f64_r: return 8; case rocblas_datatype_f16_c: return 4; case rocblas_datatype_f32_c: return 8; case rocblas_datatype_f64_c: return 16; case rocblas_datatype_i8_r: return 1; case rocblas_datatype_u8_r: return 1; case rocblas_datatype_i32_r: return 4; case rocblas_datatype_u32_r: return 4; case rocblas_datatype_i8_c: return 2; case rocblas_datatype_u8_c: return 2; case rocblas_datatype_i32_c: return 8; case rocblas_datatype_u32_c: return 8; case rocblas_datatype_bf16_r: return 2; case 
rocblas_datatype_bf16_c: return 4; case rocblas_datatype_invalid: return 4; } return 0; } // return rocblas_datatype from type template static constexpr rocblas_datatype rocblas_datatype_from_type = rocblas_datatype_invalid; template <> static constexpr auto rocblas_datatype_from_type = rocblas_datatype_f16_r; template <> static constexpr auto rocblas_datatype_from_type = rocblas_datatype_f32_r; template <> static constexpr auto rocblas_datatype_from_type = rocblas_datatype_f64_r; template <> static constexpr auto rocblas_datatype_from_type = rocblas_datatype_f32_c; template <> static constexpr auto rocblas_datatype_from_type = rocblas_datatype_f64_c; template <> static constexpr auto rocblas_datatype_from_type = rocblas_datatype_i8_r; template <> static constexpr auto rocblas_datatype_from_type = rocblas_datatype_u8_r; template <> static constexpr auto rocblas_datatype_from_type = rocblas_datatype_i32_r; template <> static constexpr auto rocblas_datatype_from_type = rocblas_datatype_u32_r; template <> static constexpr auto rocblas_datatype_from_type = rocblas_datatype_bf16_r; // return precision string for data type template static constexpr char rocblas_precision_string [] = "invalid"; template <> static constexpr char rocblas_precision_string[] = "bf16_r"; template <> static constexpr char rocblas_precision_string[] = "f16_r"; template <> static constexpr char rocblas_precision_string[] = "f32_r"; template <> static constexpr char rocblas_precision_string[] = "f64_r"; template <> static constexpr char rocblas_precision_string[] = "i8_r"; template <> static constexpr char rocblas_precision_string[] = "u8_r"; template <> static constexpr char rocblas_precision_string[] = "i32_r"; template <> static constexpr char rocblas_precision_string[] = "u32_r"; template <> static constexpr char rocblas_precision_string[] = "f32_c"; template <> static constexpr char rocblas_precision_string[] = "f64_c"; #if 0 // Not implemented template <> static constexpr char 
rocblas_precision_string[] = "f16_c"; template <> static constexpr char rocblas_precision_string[] = "i8_c"; template <> static constexpr char rocblas_precision_string[] = "u8_c"; template <> static constexpr char rocblas_precision_string[] = "i32_c"; template <> static constexpr char rocblas_precision_string[] = "u32_c"; #endif // clang-format on /******************************************************************************* * \brief convert hipError_t to rocblas_status * TODO - enumerate library calls to hip runtime, enumerate possible errors from *those calls ******************************************************************************/ constexpr rocblas_status get_rocblas_status_for_hip_status(hipError_t status) { switch(status) { // success case hipSuccess: return rocblas_status_success; // internal hip memory allocation case hipErrorMemoryAllocation: case hipErrorLaunchOutOfResources: return rocblas_status_memory_error; // user-allocated hip memory case hipErrorInvalidDevicePointer: // hip memory return rocblas_status_invalid_pointer; // user-allocated device, stream, event case hipErrorInvalidDevice: case hipErrorInvalidResourceHandle: return rocblas_status_invalid_handle; // library using hip incorrectly case hipErrorInvalidValue: return rocblas_status_internal_error; // hip runtime failing case hipErrorNoDevice: // no hip devices case hipErrorUnknown: default: return rocblas_status_internal_error; } } // Absolute value template , int> = 0> __device__ __host__ inline T rocblas_abs(T x) { return x < 0 ? 
-x : x; } // For complex, we have defined a __device__ __host__ compatible std::abs template , int> = 0> __device__ __host__ inline auto rocblas_abs(T x) { return std::abs(x); } // rocblas_bfloat16 is handled specially __device__ __host__ inline rocblas_bfloat16 rocblas_abs(rocblas_bfloat16 x) { x.data &= 0x7fff; return x; } // rocblas_half __device__ __host__ inline rocblas_half rocblas_abs(rocblas_half x) { union { rocblas_half x; uint16_t data; } t = {x}; t.data &= 0x7fff; return t.x; } // Get base types from complex types. template struct rocblas_real_t_impl { using type = T; }; template struct rocblas_real_t_impl>> { using type = decltype(std::real(T{})); }; template struct rocblas_real_t_impl> { using type = T; }; template using real_t = typename rocblas_real_t_impl::type; // Output rocblas_half value inline std::ostream& operator<<(std::ostream& os, rocblas_half x) { return os << float(x); } // Convert the current C++ exception to rocblas_status // This allows extern "C" functions to return this function in a catch(...) // block while converting all C++ exceptions to an equivalent rocblas_status // here inline rocblas_status exception_to_rocblas_status(std::exception_ptr e = std::current_exception()) try { if(e) std::rethrow_exception(e); return rocblas_status_success; } catch(const rocblas_status& status) { return status; } catch(const std::bad_alloc&) { return rocblas_status_memory_error; } catch(...) { return rocblas_status_internal_error; } rocSOLVER-rocm-5.5.1/common/include/rocsolver_datatype2string.hpp000066400000000000000000000253431436600607200250570ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2018-2022 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #pragma once #include "rocblas/rocblas.h" #include "rocsolver/rocsolver.h" #include typedef enum rocblas_initialization_ : int { rocblas_initialization_random_int = 111, rocblas_initialization_trig_float = 222, rocblas_initialization_hpl = 333, } rocblas_initialization; // return char from type template static constexpr char rocblas2char_precision = '\0'; template <> static constexpr auto rocblas2char_precision = 's'; template <> static constexpr auto rocblas2char_precision = 'd'; template <> static constexpr auto rocblas2char_precision = 'c'; template <> static constexpr auto rocblas2char_precision = 'z'; /* ============================================================================================ */ /* Convert rocblas constants to lapack char. */ constexpr auto rocblas2char_operation(rocblas_operation value) { switch(value) { case rocblas_operation_none: return 'N'; case rocblas_operation_transpose: return 'T'; case rocblas_operation_conjugate_transpose: return 'C'; } return '\0'; } constexpr auto rocblas2char_fill(rocblas_fill value) { switch(value) { case rocblas_fill_upper: return 'U'; case rocblas_fill_lower: return 'L'; case rocblas_fill_full: return 'F'; } return '\0'; } constexpr auto rocblas2char_diagonal(rocblas_diagonal value) { switch(value) { case rocblas_diagonal_unit: return 'U'; case rocblas_diagonal_non_unit: return 'N'; } return '\0'; } constexpr auto rocblas2char_side(rocblas_side value) { switch(value) { case rocblas_side_left: return 'L'; case rocblas_side_right: return 'R'; case rocblas_side_both: return 'B'; } return '\0'; } constexpr auto rocblas2char_direct(rocblas_direct value) { switch(value) { case rocblas_forward_direction: return 'F'; case rocblas_backward_direction: return 'B'; } return '\0'; } constexpr auto rocblas2char_storev(rocblas_storev value) { switch(value) { case rocblas_column_wise: return 'C'; case rocblas_row_wise: return 'R'; } return '\0'; 
} constexpr auto rocblas2char_workmode(rocblas_workmode value) { switch(value) { case rocblas_outofplace: return 'O'; case rocblas_inplace: return 'I'; } return '\0'; } constexpr auto rocblas2char_svect(rocblas_svect value, bool use_V = false) { switch(value) { case rocblas_svect_all: return 'A'; case rocblas_svect_singular: return (use_V ? 'V' : 'S'); case rocblas_svect_overwrite: return 'O'; case rocblas_svect_none: return 'N'; } return '\0'; } constexpr auto rocblas2char_srange(rocblas_srange value) { switch(value) { case rocblas_srange_all: return 'A'; case rocblas_srange_value: return 'V'; case rocblas_srange_index: return 'I'; } return '\0'; } constexpr auto rocblas2char_evect(rocblas_evect value) { switch(value) { case rocblas_evect_original: return 'V'; case rocblas_evect_tridiagonal: return 'I'; case rocblas_evect_none: return 'N'; } return '\0'; } constexpr auto rocblas2char_eform(rocblas_eform value) { switch(value) { case rocblas_eform_ax: return '1'; case rocblas_eform_abx: return '2'; case rocblas_eform_bax: return '3'; } return '\0'; } constexpr auto rocblas2char_erange(rocblas_erange value) { switch(value) { case rocblas_erange_all: return 'A'; case rocblas_erange_value: return 'V'; case rocblas_erange_index: return 'I'; } return '\0'; } constexpr auto rocblas2char_eorder(rocblas_eorder value) { switch(value) { case rocblas_eorder_blocks: return 'B'; case rocblas_eorder_entire: return 'E'; } return '\0'; } constexpr auto rocblas2char_esort(rocblas_esort value) { switch(value) { case rocblas_esort_none: return 'N'; case rocblas_esort_ascending: return 'A'; } return '\0'; } // return precision string for rocblas_datatype constexpr auto rocblas2string_datatype(rocblas_datatype type) { switch(type) { case rocblas_datatype_f16_r: return "f16_r"; case rocblas_datatype_f32_r: return "f32_r"; case rocblas_datatype_f64_r: return "f64_r"; case rocblas_datatype_f16_c: return "f16_k"; case rocblas_datatype_f32_c: return "f32_c"; case rocblas_datatype_f64_c: 
return "f64_c"; case rocblas_datatype_i8_r: return "i8_r"; case rocblas_datatype_u8_r: return "u8_r"; case rocblas_datatype_i32_r: return "i32_r"; case rocblas_datatype_u32_r: return "u32_r"; case rocblas_datatype_i8_c: return "i8_c"; case rocblas_datatype_u8_c: return "u8_c"; case rocblas_datatype_i32_c: return "i32_c"; case rocblas_datatype_u32_c: return "u32_c"; case rocblas_datatype_bf16_r: return "bf16_r"; case rocblas_datatype_bf16_c: return "bf16_c"; case rocblas_datatype_invalid: return "invalid"; } return "invalid"; } constexpr auto rocblas2string_initialization(rocblas_initialization init) { switch(init) { case rocblas_initialization_random_int: return "rand_int"; case rocblas_initialization_trig_float: return "trig_float"; case rocblas_initialization_hpl: return "hpl"; } return "invalid"; } /* ============================================================================================ */ /* Convert lapack char constants to rocblas type. */ constexpr rocblas_operation char2rocblas_operation(char value) { switch(value) { case 'N': case 'n': return rocblas_operation_none; case 'T': case 't': return rocblas_operation_transpose; case 'C': case 'c': return rocblas_operation_conjugate_transpose; default: return static_cast(0); } } constexpr rocblas_fill char2rocblas_fill(char value) { switch(value) { case 'U': case 'u': return rocblas_fill_upper; case 'L': case 'l': return rocblas_fill_lower; default: return static_cast(0); } } constexpr rocblas_diagonal char2rocblas_diagonal(char value) { switch(value) { case 'U': case 'u': return rocblas_diagonal_unit; case 'N': case 'n': return rocblas_diagonal_non_unit; default: return static_cast(0); } } constexpr rocblas_side char2rocblas_side(char value) { switch(value) { case 'L': case 'l': return rocblas_side_left; case 'R': case 'r': return rocblas_side_right; default: return static_cast(0); } } constexpr rocblas_direct char2rocblas_direct(char value) { switch(value) { case 'F': return rocblas_forward_direction; case 
'B': return rocblas_backward_direction; default: return static_cast(0); } } constexpr rocblas_storev char2rocblas_storev(char value) { switch(value) { case 'C': return rocblas_column_wise; case 'R': return rocblas_row_wise; default: return static_cast(0); } } constexpr rocblas_workmode char2rocblas_workmode(char value) { switch(value) { case 'O': return rocblas_outofplace; case 'I': return rocblas_inplace; default: return static_cast(0); } } constexpr rocblas_svect char2rocblas_svect(char value) { switch(value) { case 'A': return rocblas_svect_all; case 'S': case 'V': return rocblas_svect_singular; case 'O': return rocblas_svect_overwrite; case 'N': return rocblas_svect_none; default: return static_cast(0); } } constexpr rocblas_srange char2rocblas_srange(char value) { switch(value) { case 'A': return rocblas_srange_all; case 'V': return rocblas_srange_value; case 'I': return rocblas_srange_index; default: return static_cast(-1); } } constexpr rocblas_evect char2rocblas_evect(char value) { switch(value) { case 'V': return rocblas_evect_original; case 'I': return rocblas_evect_tridiagonal; case 'N': return rocblas_evect_none; default: return static_cast(0); } } constexpr rocblas_eform char2rocblas_eform(char value) { switch(value) { case '1': return rocblas_eform_ax; case '2': return rocblas_eform_abx; case '3': return rocblas_eform_bax; default: return static_cast(0); } } constexpr rocblas_erange char2rocblas_erange(char value) { switch(value) { case 'A': return rocblas_erange_all; case 'V': return rocblas_erange_value; case 'I': return rocblas_erange_index; default: return static_cast(0); } } constexpr rocblas_eorder char2rocblas_eorder(char value) { switch(value) { case 'B': return rocblas_eorder_blocks; case 'E': return rocblas_eorder_entire; default: return static_cast(0); } } constexpr rocblas_esort char2rocblas_esort(char value) { switch(value) { case 'N': return rocblas_esort_none; case 'A': return rocblas_esort_ascending; default: return static_cast(0); } } 
// clang-format off inline rocblas_initialization string2rocblas_initialization(const std::string& value) { return value == "rand_int" ? rocblas_initialization_random_int : value == "trig_float" ? rocblas_initialization_trig_float : value == "hpl" ? rocblas_initialization_hpl : static_cast(0); } inline rocblas_datatype string2rocblas_datatype(const std::string& value) { return value == "f16_r" || value == "h" ? rocblas_datatype_f16_r : value == "f32_r" || value == "s" ? rocblas_datatype_f32_r : value == "f64_r" || value == "d" ? rocblas_datatype_f64_r : value == "bf16_r" ? rocblas_datatype_bf16_r : value == "f16_c" ? rocblas_datatype_f16_c : value == "f32_c" || value == "c" ? rocblas_datatype_f32_c : value == "f64_c" || value == "z" ? rocblas_datatype_f64_c : value == "bf16_c" ? rocblas_datatype_bf16_c : value == "i8_r" ? rocblas_datatype_i8_r : value == "i32_r" ? rocblas_datatype_i32_r : value == "i8_c" ? rocblas_datatype_i8_c : value == "i32_c" ? rocblas_datatype_i32_c : value == "u8_r" ? rocblas_datatype_u8_r : value == "u32_r" ? rocblas_datatype_u32_r : value == "u8_c" ? rocblas_datatype_u8_c : value == "u32_c" ? rocblas_datatype_u32_c : rocblas_datatype_invalid; } // clang-format on rocSOLVER-rocm-5.5.1/common/src/000077500000000000000000000000001436600607200162615ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/common/src/common_host_helpers.cpp000066400000000000000000000022051436600607200230330ustar00rootroot00000000000000/* ************************************************************************ * Copyright (c) 2020-2021 Advanced Micro Devices, Inc. 
* ************************************************************************ */ #include #include "common_host_helpers.hpp" /*********************************************************************** * timing functions * ***********************************************************************/ /* CPU Timer (in microseconds): no GPU synchronization */ double get_time_us_no_sync() { namespace sc = std::chrono; const sc::steady_clock::time_point t = sc::steady_clock::now(); return double(sc::duration_cast(t.time_since_epoch()).count()); } /* CPU Timer (in microseconds): synchronize with the default device and return wall time */ double get_time_us() { hipDeviceSynchronize(); return get_time_us_no_sync(); } /* CPU Timer (in microseconds): synchronize with given queue/stream and return wall time */ double get_time_us_sync(hipStream_t stream) { hipStreamSynchronize(stream); return get_time_us_no_sync(); } rocSOLVER-rocm-5.5.1/custom.properties000066400000000000000000000001401436600607200176250ustar00rootroot00000000000000booktitle=rocSOLVER API Guide spreadsheet.xml=docs/classification-map.xml document.locale=enusrocSOLVER-rocm-5.5.1/docs/000077500000000000000000000000001436600607200151325ustar00rootroot00000000000000rocSOLVER-rocm-5.5.1/docs/Dockerfile000066400000000000000000000001771436600607200171310ustar00rootroot00000000000000FROM readthedocs/build:latest USER root:root RUN apt-get install -qq doxygen RUN pip3 install breathe sphinx sphinx_rtd_theme rocSOLVER-rocm-5.5.1/docs/Doxyfile000066400000000000000000003223251436600607200166470ustar00rootroot00000000000000# Doxyfile 1.8.10 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] 
# For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all text # before the first occurrence of this tag. Doxygen uses libiconv (or the iconv # built into libc) for the transcoding. See http://www.gnu.org/software/libiconv # for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = "rocsolver" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = v0.1 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "implementation of LAPACK routines on the ROCm platform" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. 
PROJECT_LOGO = ./rocmlogo.png # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = docBin # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise cause # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. 
OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. 
INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used. # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful if your file system doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. 
JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. 
ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. 
In the latter case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibility issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = YES # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO.
CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen to replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = YES # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. 
using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = YES # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. 
At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. 
EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. If set to NO, these declarations will be # included in the documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. 
If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. 
SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. 
By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. 
If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. 
The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. 
# The default value is: NO. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS # then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but # at the end of the doxygen process doxygen will return with a non-zero status. # Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. WARN_AS_ERROR = YES #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = ../library/include/rocsolver ../library/src/include/ideal_sizes.hpp # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. 
Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: http://www.gnu.org/software/libiconv) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, # *.vhdl, *.ucf, *.qsf, *.as and *.js. FILE_PATTERNS = *.c \ *.cc \ *.cxx \ *.cpp \ *.c++ \ *.java \ *.ii \ *.ixx \ *.ipp \ *.i++ \ *.inl \ *.idl \ *.ddl \ *.odl \ *.h \ *.hh \ *.hxx \ *.hpp \ *.h++ \ *.cs \ *.d \ *.php \ *.php4 \ *.php5 \ *.phtml \ *.inc \ *.m \ *.markdown \ *.md \ *.mm \ *.dox \ *.py \ *.f90 \ *.f \ *.for \ *.tcl \ *.vhd \ *.vhdl \ *.ucf \ *.qsf \ *.as \ *.js # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. 
# The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. 
Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output.
USE_MDFILE_AS_MAINPAGE = ../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. 
Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see http://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance. This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # compiled with the --with-libclang option. # The default value is: NO.
CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. 
# The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML, the header file must include any scripts and style sheets # that doxygen needs; which ones are needed depends on the configuration options # used (e.g. the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page.
It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. 
Hue is specified as an angle on a colorwheel, see # http://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES.
HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries to 1 will produce a fully collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a fully expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: http://developer.apple.com/tools/xcode/), introduced with # OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle.
This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. 
# This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. 
The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace # (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. 
# This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). 
For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 1 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. 
# Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes have effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # http://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = YES # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # http://docs.mathjax.org/en/latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax.
The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from http://www.mathjax.org before deployment. # The default value is: http://cdn.mathjax.org/mathjax/latest. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use <access key> + S # (what the <access key> is depends on the OS and browser, but it is typically # <CTRL>, <ALT>/