libgpuarray-0.7.6/.appveyor.yml

version: '1.0.{build}'  # This number doesn't matter

pull_requests:
  do_not_increment_build_number: true

platform:
  - x64

image: Visual Studio 2015

clone_folder: C:\projects\libgpuarray

configuration:
  - Release

environment:
  BINSTAR_TOKEN:
    secure: 58KqJcKtfCBVCuIzpnkLm4XZLQqKq95Hs8Ly20HWaMSla67nusrp3y4sy6XzZOBQ
  matrix:
    - CONDA_LOC: "C:\\Miniconda-x64"
      PATCH_VS2008: "1"
    - CONDA_LOC: "C:\\Miniconda35-x64"
      PATCH_VS2008: "0"
    - CONDA_LOC: "C:\\Miniconda36-x64"
      PATCH_VS2008: "0"

install:
  # This breaks conda-build because of git
  - cmd: rmdir C:\cygwin /s /q
  - cmd: call %CONDA_LOC%\Scripts\activate.bat
  - cmd: set PYTHONUNBUFFERED=1
  - cmd: conda install -n root --yes conda conda-env conda-build anaconda-client
  # We borrow a trick from conda-forge to fix the VS2008 compiler
  - ps: |
      if($env:PATCH_VS2008 -eq '1') {
        cmd /c "conda config --append channels conda-forge 2>&1"
        cmd /c "conda install --yes vs2008_express_vc_python_patch 2>&1"
        cmd /c "call setup_x64 2>&1"
      }

build: off

test_script:
  - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i
  - cmd: echo %GPUARRAY_VERSION%
  - cmd: conda build conda
  - cmd: mkdir pkgs
  - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\pygpu* pkgs\ /Y
  - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* pkgs\ /Y
  - ps: |
      if($env:appveyor_repo_tag -eq 'True') {
        cmd /c "anaconda -t $env:BINSTAR_TOKEN upload --user=mila-udem pkgs/* 2>&1"
      }

artifacts:
  - path: pkgs/*
    name: "Conda Packages"

libgpuarray-0.7.6/.circleci/config.yml

version: 2
jobs:
  build_pkgs:
    docker:
      - image: joaander/conda-build:20170905
    steps:
      - checkout
      - run:
          name: "Checkout Merge Commit"
          command: |
            if [[ -n "${CIRCLE_PR_NUMBER}" ]]
            then
              git fetch -u origin "+refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge"
              git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge"
            fi
      - run:
          name: "Build Recipe"
          command: |
            export GPUARRAY_VERSION=`python -c 'import versioneer; print(versioneer.get_version())'`
            conda build --python 2.7 conda
            conda build --python 3.5 conda/pygpu
            conda build --python 3.6 conda/pygpu
      - run:
          name: "Upload Tagged Versions"
          command: |
            if [[ -n "${CIRCLE_TAG}" ]]
            then
              anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /opt/conda/conda-bld/linux-64/libgpuarray*
              anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /opt/conda/conda-bld/linux-64/pygpu*
            fi
      - store_artifacts:
          path: /opt/conda/conda-bld/linux-64

workflows:
  version: 2
  build_and_test:
    jobs:
      - build_pkgs:
          filters:
            tags:
              only: /.*/

libgpuarray-0.7.6/.clean

Build
build
Debug
Release
lib
__pycache__
.idea
.*.sw[po]
*~
*.pyc
*.pyd
*.pyo
*.egg-info
dist
setuptools*egg
setuptools.pth
distribute*egg
distribute*tar.gz
*.so
*.o
*.log

libgpuarray-0.7.6/.gitattributes

pygpu/_version.py export-subst
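Both CI configurations above follow the same recipe: read the package version from versioneer, build the conda recipes under conda/, and upload the resulting packages when the build is for a tag. A minimal sketch of reproducing that build locally, assuming conda-build and anaconda-client are installed in the active environment (the commands simply mirror the CircleCI steps; nothing beyond them is implied):

  # Sketch only: mirrors the CI steps above; run from the repository root.
  export GPUARRAY_VERSION=`python -c 'import versioneer; print(versioneer.get_version())'`
  conda build --python 2.7 conda          # build the recipe(s) under conda/, as CircleCI does
  conda build --python 3.6 conda/pygpu    # rebuild the pygpu recipe for another Python
  # Uploading (as the CI does for tagged builds) additionally needs an anaconda.org token:
  # anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /opt/conda/conda-bld/linux-64/libgpuarray*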
libgpuarray-0.7.6/.gitignore000066400000000000000000000004471326743622600160550ustar00rootroot00000000000000Build build Debug Release lib .idea .*.sw[po] *~ *.pyc *.pyd *.pyo *.egg-info MANIFEST dist setuptools*egg setuptools.pth distribute*egg distribute*tar.gz *.so *.o *.log doc/_build doc/_doxybuild pygpu/*.c pygpu/*.h pygpu/version.py src/gpuarray/abi_version.h src/private_config.h Makefile.conf libgpuarray-0.7.6/.jenkins-pr.sh000077500000000000000000000030561326743622600165610ustar00rootroot00000000000000#!/bin/bash # Script for Jenkins continuous integration testing of libgpuarray # Print commands as they are executed set -x # Anaconda python export PATH=/usr/local/miniconda2/bin:$PATH # CUDA export PATH=/usr/local/cuda/bin:$PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH # Can also set to "Debug", "Release" to go faster : ${GPUARRAY_CONFIG:="Release"} # Set these to " " to disable (empty doesn't work) : ${DEVICES_CUDA:="cuda"} # for multiple devices use "cuda0 cuda1" : ${DEVICES_OPENCL:=" "} git rev-parse HEAD # Build libgpuarray and run C tests mkdir build (cd build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} && make) # Test on different devices for dev in ${DEVICES_CUDA}; do echo "Testing libgpuarray for DEVICE=${dev}" (cd build && DEVICE=${dev} make test) done for dev in ${DEVICES_OPENCL}; do echo "Testing libgpuarray for DEVICE=${dev}" (cd build && DEVICE=${dev} make test) done export LD_LIBRARY_PATH=`pwd`/lib:${LD_LIBRARY_PATH} export LIBRARY_PATH=`pwd`/lib:${LIBRARY_PATH} export CPATH=`pwd`/src:${CPATH} # Build the pygpu modules python setup.py build_ext --inplace # Test it test=pygpu for dev in ${DEVICES_CUDA}; do echo "Testing pygpu for DEVICE=${dev}" DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests done for dev in ${DEVICES_OPENCL}; do echo "Testing pygpu for DEVICE=${dev}" DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests -e test_blas.py done libgpuarray-0.7.6/.jenkins_pr_mac.sh000077500000000000000000000032641326743622600174640ustar00rootroot00000000000000#!/bin/bash # Script for Jenkins continuous integration testing of libgpuarray on mac # Print commands as they are executed set -x # Set path for conda and cmake export PATH="/Users/jenkins/miniconda2/bin:/usr/local/bin:$PATH" # CUDA export PATH=/usr/local/cuda/bin:${PATH} export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:${DYLD_LIBRARY_PATH} export CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH} # Can also set to "Debug", "Release" to go faster : ${GPUARRAY_CONFIG:="Release"} # Set these to " " to disable (empty doesn't work) : ${DEVICES_CUDA:="cuda"} # for multiple devices use "cuda0 cuda1" : ${DEVICES_OPENCL:=" "} git rev-parse HEAD # Build libgpuarray and run C tests rm -rf build lib mkdir build (cd build && cmake .. 
-DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} && make) # Test on different devices for dev in ${DEVICES_CUDA}; do echo "Testing libgpuarray for DEVICE=${dev}" (cd build && DEVICE=${dev} make test) done for dev in ${DEVICES_OPENCL}; do echo "Testing libgpuarray for DEVICE=${dev}" (cd build && DEVICE=${dev} make test) done export PYTHONPATH=`pwd`/lib/python:$PYTHONPATH export DYLD_LIBRARY_PATH=`pwd`/lib:${DYLD_LIBRARY_PATH} export CPLUS_INCLUDE_PATH=`pwd`/src:${CPLUS_INCLUDE_PATH} # Build the pygpu modules python setup.py build_ext --inplace -I`pwd`/src -L`pwd`/lib # Test it test=pygpu_pr_mac for dev in ${DEVICES_CUDA}; do echo "Testing pygpu for DEVICE=${dev}" DEVICE=${dev} nosetests --with-xunit --xunit-file=${test}_${dev}tests.xml pygpu/tests done for dev in ${DEVICES_OPENCL}; do echo "Testing pygpu for DEVICE=${dev}" DEVICE=${dev} nosetests --with-xunit --xunit-file=${test}_${dev}tests.xml pygpu/tests -e test_blas.py done libgpuarray-0.7.6/.jenkins_pr_win.bat000066400000000000000000000024751326743622600176550ustar00rootroot00000000000000REM Set path for cuda, conda python and cmake REM Set conda python, cudnn, cmake path set PATH=%PATH%;C:\ProgramData\Miniconda2;C:\ProgramData\Miniconda2\Library\usr\bin;C:\ProgramData\Miniconda2\Library\bin;C:\ProgramData\Miniconda2\Scripts set PATH=%PATH%;%CUDNNPATH%\bin;C:\Program Files\CMake\bin REM Can also set to "Debug", "Release" to go faster set GPUARRAY_CONFIG="Release" REM Use spaces to seperate devices set DEVICES_CUDA=cuda set DEVICES_OPENCL= git rev-parse HEAD REM Clean up previous installs (to make sure no old files are left) rmdir %WORKSPACE%\lib /s/q mkdir %WORKSPACE%\lib rmdir build /s/q mkdir build REM Build libgpuarray and run C tests cd build cmake .. -DCMAKE_BUILD_TYPE=%GPUARRAY_CONFIG% -G "NMake Makefiles" nmake cd .. set PATH=%PATH%;%WORKSPACE%\lib REM Add conda gcc toolchain path set PATH=%PATH%;C:\ProgramData\Miniconda2\Library\mingw-w64\bin REM Build the pygpu modules python setup.py build_ext --inplace REM Test pygpu set test=pygpu for %%d in (%DEVICES_CUDA%) do ( echo "Testing pygpu for DEVICE=%%d" set DEVICE=%%d nosetests --with-xunit --xunit-file=%test%_%DEVICE%_tests.xml pygpu\tests ) for %%d in (%DEVICES_OPENCL%) do ( echo "Testing pygpu for DEVICE=%%d" set DEVICE=%%d nosetests --with-xunit --xunit-file=%test%_%DEVICE%_tests.xml pygpu\tests -e test_blas.py ) libgpuarray-0.7.6/.travis.yml000066400000000000000000000016071326743622600161750ustar00rootroot00000000000000language: c matrix: include: - os: osx compiler: clang before_install: - export PREFIX=$HOME/.local - brew update && brew install doxygen - export PYTHONUSERBASE=$PREFIX - pip2 install --user breathe sphinx==1.6.3 sphinx_rtd_theme cython numpy 'mako>=0.7' six - export PATH=$PATH:$PREFIX/bin - export CPATH=$CPATH:$PREFIX/include - export LIBRARY_PATH=$LIBRARY_PATH:$PREFIX/lib - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PREFIX/lib # Build with Debug and Release to flush out build problems script: - mkdir Debug - (cd Debug && cmake .. -DCMAKE_BUILD_TYPE=Debug && make) - mkdir Release - (cd Release && cmake .. 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX && make && make install) - python setup.py build_ext --inplace - (cd doc && make html) # Do not treat "shell_session_update: command not found" on MacOS as a failure after_script: set +e libgpuarray-0.7.6/CMakeLists.txt000066400000000000000000000021701326743622600166200ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.0) PROJECT(libgpuarray C) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") # -Wall is unbelieveably noisy with Visual Studio: # https://stackoverflow.com/q/4001736/3257826 if(MSVC) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3") else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unused-parameter -Werror=format-security -Wdeclaration-after-statement -std=gnu89") endif() enable_testing() execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_SOURCE_DIR}/lib) set(LIBRARY_OUTPUT_PATH ${CMAKE_SOURCE_DIR}/lib) set(CMAKE_OSX_ARCHITECTURES x86_64) set(CMAKE_C_VISIBILITY_PRESET hidden) set(CMAKE_VISIBILITY_INLINES_HIDDEN 1) # Make recent cmake not spam about stuff if(POLICY CMP0063) cmake_policy(SET CMP0063 OLD) endif() add_subdirectory(src) add_subdirectory(tests) # uninstall target configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY) add_custom_target(uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) libgpuarray-0.7.6/CMakeModules/000077500000000000000000000000001326743622600163715ustar00rootroot00000000000000libgpuarray-0.7.6/CMakeModules/FindCLBlast.cmake000066400000000000000000000022271326743622600214630ustar00rootroot00000000000000# - Try to find CLBlast # Once done this will define # # CLBLAST_FOUND - system has CLBlast # CLBLAST_INCLUDE_DIRS - location of CLBlast.h # CLBLAST_LIBRARIES - location of libCLBlast IF(CLBLAST_INCLUDE_DIRS) # Already in cache, be silent set (CLBLAST_FIND_QUIETLY TRUE) ENDIF (CLBLAST_INCLUDE_DIRS) FIND_PATH(CLBLAST_ROOT_DIR NAMES include/clblast_c.h HINTS /usr/local/ $ENV{CLBLAST_ROOT} DOC "CLBlast root directory.") FIND_PATH(_CLBLAST_INCLUDE_DIRS NAMES clblast_c.h HINTS ${CLBLAST_ROOT_DIR}/include DOC "CLBlast Include directory") FIND_LIBRARY(_CLBLAST_LIBRARY NAMES libclblast.so HINTS ${CLBLAST_ROOT_DIR}/lib ${CLBLAST_ROOT_DIR}/lib64 ${CLBLAST_ROOT_DIR}/lib32 DOC "CLBlast lib directory") SET(CLBLAST_INCLUDE_DIRS ${_CLBLAST_INCLUDE_DIRS}) SET(CLBLAST_LIBRARIES ${_CLBLAST_LIBRARY}) # handle the QUIETLY and REQUIRED arguments and set CLBLAST_FOUND to TRUE if # all listed variables are TRUE INCLUDE (FindPackageHandleStandardArgs) FIND_PACKAGE_HANDLE_STANDARD_ARGS(CLBLAST DEFAULT_MSG CLBLAST_LIBRARIES CLBLAST_INCLUDE_DIRS) MARK_AS_ADVANCED(CLBLAST_LIBRARIES CLBLAST_INCLUDE_DIRS) libgpuarray-0.7.6/CMakeModules/FindNCCL.cmake000066400000000000000000000013471326743622600207200ustar00rootroot00000000000000# Find the NCCL libraries # # The following variables are optionally searched for defaults # NCCL_ROOT_DIR: Base directory where all NCCL components are found # # The following are set after configuration is done: # NCCL_FOUND # NCCL_INCLUDE_DIR # NCCL_LIBRARY find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ENV CUDA_PATH ENV NCCL_ROOT_DIR PATH_SUFFIXES include) find_library(NCCL_LIBRARY NAMES nccl PATHS ENV CUDA_PATH ENV NCCL_ROOT_DIR PATH_SUFFIXES lib64 lib) include(FindPackageHandleStandardArgs) find_package_handle_standard_args( NCCL FOUND_VAR NCCL_FOUND REQUIRED_VARS NCCL_LIBRARY NCCL_INCLUDE_DIR) mark_as_advanced( 
NCCL_INCLUDE_DIR NCCL_LIBRARY) libgpuarray-0.7.6/CMakeModules/FindOpenCL.cmake000066400000000000000000000121111326743622600213100ustar00rootroot00000000000000#.rst: # FindOpenCL # ---------- # # Try to find OpenCL # # Once done this will define:: # # OpenCL_FOUND - True if OpenCL was found # OpenCL_INCLUDE_DIRS - include directories for OpenCL # OpenCL_LIBRARIES - link against this library to use OpenCL # OpenCL_VERSION_STRING - Highest supported OpenCL version (eg. 1.2) # OpenCL_VERSION_MAJOR - The major version of the OpenCL implementation # OpenCL_VERSION_MINOR - The minor version of the OpenCL implementation # # The module will also define two cache variables:: # # OpenCL_INCLUDE_DIR - the OpenCL include directory # OpenCL_LIBRARY - the path to the OpenCL library # #============================================================================= # Copyright 2014 Matthaeus G. Chajdas # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # * Neither the names of Kitware, Inc., the Insight Software Consortium, # nor the names of their contributors may be used to endorse or promote # products derived from this software without specific prior written # permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. function(_FIND_OPENCL_VERSION) include(CheckSymbolExists) include(CMakePushCheckState) set(CMAKE_REQUIRED_QUIET ${OpenCL_FIND_QUIETLY}) CMAKE_PUSH_CHECK_STATE() foreach(VERSION "2_0" "1_2" "1_1" "1_0") set(CMAKE_REQUIRED_INCLUDES "${OpenCL_INCLUDE_DIR}") if(APPLE) CHECK_SYMBOL_EXISTS( CL_VERSION_${VERSION} "${OpenCL_INCLUDE_DIR}/OpenCL/cl.h" OPENCL_VERSION_${VERSION}) else() CHECK_SYMBOL_EXISTS( CL_VERSION_${VERSION} "${OpenCL_INCLUDE_DIR}/CL/cl.h" OPENCL_VERSION_${VERSION}) endif() if(OPENCL_VERSION_${VERSION}) string(REPLACE "_" "." 
VERSION "${VERSION}") set(OpenCL_VERSION_STRING ${VERSION} PARENT_SCOPE) string(REGEX MATCHALL "[0-9]+" version_components "${VERSION}") list(GET version_components 0 major_version) list(GET version_components 1 minor_version) set(OpenCL_VERSION_MAJOR ${major_version} PARENT_SCOPE) set(OpenCL_VERSION_MINOR ${minor_version} PARENT_SCOPE) break() endif() endforeach() CMAKE_POP_CHECK_STATE() endfunction() find_path(OpenCL_INCLUDE_DIR NAMES CL/cl.h OpenCL/cl.h PATHS ENV "PROGRAMFILES(X86)" ENV AMDAPPSDKROOT ENV INTELOCLSDKROOT ENV NVSDKCOMPUTE_ROOT ENV CUDA_PATH ENV ATISTREAMSDKROOT PATH_SUFFIXES include OpenCL/common/inc "AMD APP/include") _FIND_OPENCL_VERSION() if(WIN32) if(CMAKE_SIZEOF_VOID_P EQUAL 4) find_library(OpenCL_LIBRARY NAMES OpenCL PATHS ENV "PROGRAMFILES(X86)" ENV AMDAPPSDKROOT ENV INTELOCLSDKROOT ENV CUDA_PATH ENV NVSDKCOMPUTE_ROOT ENV ATISTREAMSDKROOT PATH_SUFFIXES "AMD APP/lib/x86" lib/x86 lib/Win32 OpenCL/common/lib/Win32) elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) find_library(OpenCL_LIBRARY NAMES OpenCL PATHS ENV "PROGRAMFILES(X86)" ENV AMDAPPSDKROOT ENV INTELOCLSDKROOT ENV CUDA_PATH ENV NVSDKCOMPUTE_ROOT ENV ATISTREAMSDKROOT PATH_SUFFIXES "AMD APP/lib/x86_64" lib/x86_64 lib/x64 OpenCL/common/lib/x64) endif() else() find_library(OpenCL_LIBRARY NAMES OpenCL HINTS ${OpenCL_INCLUDE_DIR}/.. PATHS ENV AMDAPPSDKROOT ENV INTELOCLSDKROOT ENV NVSDKCOMPUTE_ROOT ENV CUDA_PATH ENV ATISTREAMSDKROOT PATH_SUFFIXES lib64 lib OpenCL/lib64 OpenCL/lib "AMD APP/lib64") endif() set(OpenCL_LIBRARIES ${OpenCL_LIBRARY}) set(OpenCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIR}) include(FindPackageHandleStandardArgs) find_package_handle_standard_args( OpenCL FOUND_VAR OpenCL_FOUND REQUIRED_VARS OpenCL_LIBRARY OpenCL_INCLUDE_DIR VERSION_VAR OpenCL_VERSION_STRING) mark_as_advanced( OpenCL_INCLUDE_DIR OpenCL_LIBRARY) libgpuarray-0.7.6/CMakeModules/FindclBLAS.cmake000066400000000000000000000022171326743622600212360ustar00rootroot00000000000000# - Try to find clBLAS # Once done this will define # # CLBLAS_FOUND - system has clBLAS # CLBLAS_INCLUDE_DIRS - location of clBLAS.h # CLBLAS_LIBRARIES - location of libclBLAS IF(CLBLAS_INCLUDE_DIRS) # Already in cache, be silent set (CLBLAS_FIND_QUIETLY TRUE) ENDIF (CLBLAS_INCLUDE_DIRS) FIND_PATH(CLBLAS_ROOT_DIR NAMES include/clBLAS.h HINTS /usr/local/ $ENV{CLBLAS_ROOT} DOC "clBLAS root directory.") FIND_PATH(_CLBLAS_INCLUDE_DIRS NAMES clBLAS.h HINTS ${CLBLAS_ROOT_DIR}/include DOC "clBLAS Include directory") FIND_LIBRARY(_CLBLAS_LIBRARY NAMES CLBLAS clBLAS HINTS ${CLBLAS_ROOT_DIR}/lib ${CLBLAS_ROOT_DIR}/lib64 ${CLBLAS_ROOT_DIR}/lib32 PATH_SUFFIXES import DOC "clBLAS lib directory") SET(CLBLAS_INCLUDE_DIRS ${_CLBLAS_INCLUDE_DIRS}) SET(CLBLAS_LIBRARIES ${_CLBLAS_LIBRARY}) # handle the QUIETLY and REQUIRED arguments and set CLBLAS_FOUND to TRUE if # all listed variables are TRUE INCLUDE (FindPackageHandleStandardArgs) FIND_PACKAGE_HANDLE_STANDARD_ARGS(CLBLAS DEFAULT_MSG CLBLAS_LIBRARIES CLBLAS_INCLUDE_DIRS) MARK_AS_ADVANCED(CLBLAS_LIBRARIES CLBLAS_INCLUDE_DIRS) libgpuarray-0.7.6/INSTALL000066400000000000000000000017271326743622600151200ustar00rootroot00000000000000This library should build without problems on Mac OS X (10.5+), Linux, and Windows. If you encounter a build problem, please report it along with a log of the build messages to abergeron@gmail.com. Requirements: - either an OpenCL runtime (with headers) or the CUDA toolkit - CMake [ https://cmake.org ] (to build) Run CMake on the CMakeList.txt file in src/ and build according to your platform. 
Set CMAKE_INSTALL_PREFIX to your desired path if you don't want the platform default. ('cmake -DCMAKE_INSTALL_PREFIX=' for unix-like platforms) Pay attention to the messages from CMake since the library might still build if no backends are found, but it won't be very useful. There are instruction for installation in the CMake file which make 'make install' work on unix-like platforms. I have no idea how to install on Windows. If you also want the python bindings, run 'python setup.py install' after building and installing the library which will install pygpu. libgpuarray-0.7.6/LICENSE000066400000000000000000000017301326743622600150660ustar00rootroot00000000000000All code in this package is covered by the following license and copyright unless specified otherwise in the file: Copyright (c) 2013 Arnaud Begeron Permission to use, copy, modify, and distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. The instances that have a different copyright or license are: src/gpuarray_strl.c (MIT, but different copyright) libgpuarray-0.7.6/MANIFEST.in000066400000000000000000000000601326743622600156120ustar00rootroot00000000000000include versioneer.py include pygpu/_version.py libgpuarray-0.7.6/Makefile000066400000000000000000000026301326743622600155210ustar00rootroot00000000000000rel: install-relc py -include Makefile.conf config: Makefile.conf Makefile.conf: @[ ! -f Makefile.conf ] && cp Makefile.conf.tmpl Makefile.conf && echo "\n\n** Adjust the values in Makefile.conf for your system **\n\n" && exit 1 debug: install-debugc py .PHONY: install-debugc py debug install-relc rel config Debug/Makefile: Makefile.conf mkdir -p Debug ifndef INSTALL_PREFIX (cd Debug && NUM_DEVS=${NUM_DEVS} DEV_NAMES=${DEV_NAMES} cmake .. -DCMAKE_BUILD_TYPE=Debug) else (cd Debug && NUM_DEVS=${NUM_DEVS} DEV_NAMES=${DEV_NAMES} cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=$(INSTALL_PREFIX)) endif debugc: Debug/Makefile (cd Debug && make) test-debugc: debugc ifndef DEVICE (cd Debug && make test) else (cd Debug && DEVICE=${DEVICE} make test) endif install-debugc: debugc (cd Debug && ${SUDO} make install) Release/Makefile: Makefile.conf mkdir -p Release ifndef INSTALL_PREFIX (cd Release && NUM_DEVS=${NUM_DEVS} DEV_NAMES=${DEV_NAMES} cmake .. -DCMAKE_BUILD_TYPE=Release) else (cd Release && NUM_DEVS=${NUM_DEVS} DEV_NAMES=${DEV_NAMES} cmake .. 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$(INSTALL_PREFIX)) endif relc: Release/Makefile (cd Release && make) test-relc: relc ifndef DEVICE (cd Release && make test) else (cd Release && DEVICE=${DEVICE} make test) endif install-relc: relc (cd Release && ${SUDO} make install) py: Makefile.conf python setup.py build_ext --inplace libgpuarray-0.7.6/Makefile.conf.tmpl000066400000000000000000000011531326743622600174170ustar00rootroot00000000000000# Set to empty if you don't need sudo SUDO=sudo # Set to the prefix of the install path e.g.: #INSTALL_PREFIX=~/.local # If not set it installs to /usr/local INSTALL_PREFIX= # Set to device that you want to use for single-gpu tests # Set to empty if you want to use default (opencl device) DEVICE= # Set to number of devices you want to use for multi-gpu tests # Set to empty if you want to use default (1 gpu) NUM_DEVS= # Set to devices you want to use for multi-gpu tests # Use a string of space separated device names, e.g. DEV_NAMES="cuda0 cuda1" # Set to empty if you want to use defaults (cuda device) DEV_NAMES= libgpuarray-0.7.6/README.txt000066400000000000000000000001021326743622600155470ustar00rootroot00000000000000The web site is at: http://deeplearning.net/software/libgpuarray/ libgpuarray-0.7.6/bin/000077500000000000000000000000001326743622600146305ustar00rootroot00000000000000libgpuarray-0.7.6/bin/gpuarray-cache000066400000000000000000000026271326743622600174550ustar00rootroot00000000000000#!/usr/bin/env python import os import sys def clean(max_size, path): content = [] for root, dirs, files in os.walk(path): for file in files: fpath = os.path.join(root, file) st = os.stat(fpath) content.append((st.st_atime, st.st_size, fpath)) content.sort() cur_size = 0 for _, size, path in content: cur_size += size if cur_size > max_size: os.remove(path) SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40, 'P': 1 << 50, 'E': 1 << 60, 'Z': 1 << 70, 'Y': 1 << 80} def get_size(s): i = 0 s = s.strip() if s[-1].upper() in SUFFIXES: num = s[:-1] suf = s[-1].upper() else: num = s suf = "" num = float(num) if suf != "": mult = SUFFIXES[suf] else: mult = 1 return int(num * mult) if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility') parser.add_argument('-s', '--max_size', help='Set the maximum size for pruning (in bytes with suffixes: K, M, G, ...)') args = parser.parse_args() path = os.environ.get('GPUARRAY_CACHE_PATH', None) if path is None: print("You need to set GPUARRAY_CACHE_PATH so that this programs knows which path to clean.") sys.exit(1) clean(get_size(args.max_size), path) libgpuarray-0.7.6/cmake_uninstall.cmake.in000066400000000000000000000020131326743622600206340ustar00rootroot00000000000000if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) string(REGEX REPLACE "\n" ";" files "${files}") foreach(file ${files}) message(STATUS "Uninstalling $ENV{DESTDIR}${file}") if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") exec_program( "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" OUTPUT_VARIABLE rm_out RETURN_VALUE rm_retval ) if(NOT "${rm_retval}" STREQUAL 0) message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") endif(NOT "${rm_retval}" STREQUAL 0) else(IS_SYMLINK 
"$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") message(STATUS "File $ENV{DESTDIR}${file} does not exist.") endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") endforeach(file) libgpuarray-0.7.6/conda/000077500000000000000000000000001326743622600151445ustar00rootroot00000000000000libgpuarray-0.7.6/conda/libgpuarray/000077500000000000000000000000001326743622600174655ustar00rootroot00000000000000libgpuarray-0.7.6/conda/libgpuarray/bld.bat000066400000000000000000000005621326743622600207210ustar00rootroot00000000000000cmake -G "%CMAKE_GENERATOR%" ^ -DCMAKE_PREFIX_PATH="%LIBRARY_PREFIX%" ^ -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^ -DCMAKE_C_FLAGS="-I%LIBRARY_PREFIX%\include" ^ "%SRC_DIR%" if errorlevel 1 exit 1 cmake --build . --config Release --target ALL_BUILD if errorlevel 1 exit 1 cmake --build . --config Release --target install if errorlevel 1 exit 1 libgpuarray-0.7.6/conda/libgpuarray/build.sh000066400000000000000000000004721326743622600211230ustar00rootroot00000000000000#!/bin/bash if [[ $(uname) == Darwin ]]; then cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_OSX_DEPLOYMENT_TARGET= else cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX fi cmake --build . --config Release --target all cmake --build . --config Release --target install libgpuarray-0.7.6/conda/libgpuarray/meta.yaml000066400000000000000000000012761326743622600213050ustar00rootroot00000000000000package: name: libgpuarray version: {{ environ.get('GPUARRAY_VERSION') }} source: path: ../../ build: number: 0 features: - vc9 # [win and py27] - vc10 # [win and py34] - vc14 # [win and (py35 or py36)] requirements: build: - cmake - mako - python # version doesn't matter here run: - vs2008_runtime [win and py27] - vs2010_runtime [win and py34] - vs2015_runtime [win and (py35 or py36)] about: home: http://github.com/Theano/libgpuarray license: ISC license_file: LICENSE summary: 'Library to manipulate arrays on GPU' doc_url: http://deeplearning.net/software/libgpuarray/ dev_url: http://github.com/Theano/libgpuarray libgpuarray-0.7.6/conda/pygpu/000077500000000000000000000000001326743622600163105ustar00rootroot00000000000000libgpuarray-0.7.6/conda/pygpu/bld.bat000066400000000000000000000002211326743622600175340ustar00rootroot00000000000000set LIB=%LIBRARY_LIB%;%LIB% set INCLUDE=%LIBRARY_INC%;%INCLUDE% %PYTHON% setup.py install --single-version-externally-managed --record=record.txtlibgpuarray-0.7.6/conda/pygpu/build.sh000066400000000000000000000002331326743622600177410ustar00rootroot00000000000000#!/bin/bash export CFLAGS=${CFLAGS}" -I${PREFIX}/include -L${PREFIX}/lib" $PYTHON setup.py install --single-version-externally-managed --record=record.txtlibgpuarray-0.7.6/conda/pygpu/meta.yaml000066400000000000000000000012601326743622600201210ustar00rootroot00000000000000{% set version = environ.get('GPUARRAY_VERSION') %} package: name: pygpu version: {{ version }} source: path: ../../ build: number: 0 detect_binary_files_with_prefix: False requirements: build: - python - cython >=0.25 - numpy 1.11 - mako - setuptools - libgpuarray =={{ version }} run: - python - {{ pin_compatible('numpy', '1.11') }} - mako - six - libgpuarray =={{ version }} about: home: http://github.com/Theano/libgpuarray license: ISC license_file: LICENSE summary: 'Library to manipulate arrays on GPU' doc_url: http://deeplearning.net/software/libgpuarray/ dev_url: http://github.com/Theano/libgpuarray 
libgpuarray-0.7.6/doc/000077500000000000000000000000001326743622600146255ustar00rootroot00000000000000libgpuarray-0.7.6/doc/Doxyfile000066400000000000000000002317701326743622600163450ustar00rootroot00000000000000# Doxyfile 1.8.3.1 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (" "). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # http://www.gnu.org/software/libiconv for the list of possible encodings. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or sequence of words) that should # identify the project. Note that if you do not use Doxywizard you need # to put quotes around the project name if it contains spaces. PROJECT_NAME = "libgpuarray" # The PROJECT_NUMBER tag can be used to enter a project or revision number. # This could be handy for archiving the generated documentation or # if some version control system is used. PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer # a quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "library for computing on GPUs" # With the PROJECT_LOGO tag one can specify an logo or icon that is # included in the documentation. The maximum height of the logo should not # exceed 55 pixels and the maximum width should not exceed 200 pixels. # Doxygen will copy the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. # If a relative path is entered, it will be relative to the location # where doxygen was started. If left blank the current directory will be used. OUTPUT_DIRECTORY = _doxybuild # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create # 4096 sub-directories (in 2 levels) under the output directory of each output # format and will distribute the generated files over these directories. # Enabling this option can be useful when feeding doxygen a huge amount of # source files, where putting all generated files in the same directory would # otherwise cause performance problems for the file system. CREATE_SUBDIRS = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. 
# The default language is English, other supported languages are: # Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, # Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, # Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English # messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, # Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, # Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will # include brief member descriptions after the members that are listed in # the file and class documentation (similar to JavaDoc). # Set to NO to disable this. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend # the brief description of a member or function before the detailed description. # Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator # that is used to form the text in various listings. Each string # in this list, if found as the leading text of the brief description, will be # stripped from the text and the result after processing the whole list, is # used as the annotated text. Otherwise, the brief description is used as-is. # If left blank, the following values are used ("$name" is automatically # replaced with the name of the entity): "The $name class" "The $name widget" # "The $name file" "is" "provides" "specifies" "contains" # "represents" "a" "an" "the" ABBREVIATE_BRIEF = # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # Doxygen will generate a detailed section even if there is only a brief # description. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full # path before files name in the file list and in the header files. If set # to NO the shortest path that makes the file name unique will be used. FULL_PATH_NAMES = YES # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag # can be used to strip a user-defined part of the path. Stripping is # only done if one of the specified strings matches the left-hand part of # the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the # path to strip. Note that you specify absolute paths here, but also # relative paths, which will be relative from the directory where doxygen is # started. STRIP_FROM_PATH = ../src # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of # the path mentioned in the documentation of a class, which tells # the reader which header file to include in order to use a class. # If left blank only the name of the header file containing the class # definition is used. Otherwise one should specify the include paths that # are normally passed to the compiler using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter # (but less readable) file names. 
This can be useful if your file system # doesn't support long names like on DOS, Mac, or CD-ROM. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen # will interpret the first line (until the first dot) of a JavaDoc-style # comment as the brief description. If set to NO, the JavaDoc # comments will behave just like regular Qt-style comments # (thus requiring an explicit @brief command for a brief description.) JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then Doxygen will # interpret the first line (until the first dot) of a Qt-style # comment as the brief description. If set to NO, the comments # will behave just like regular Qt-style comments (thus requiring # an explicit \brief command for a brief description.) QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen # treat a multi-line C++ special comment block (i.e. a block of //! or /// # comments) as a brief description. This used to be the default behaviour. # The new default is to treat a multi-line C++ comment block as a detailed # description. Set this tag to YES if you prefer the old behaviour instead. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES (the default) then an undocumented # member inherits the documentation from any documented member that it # re-implements. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce # a new page for each member. If set to NO, the documentation of a member will # be part of the file/class/namespace that contains it. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. # Doxygen uses this value to replace tabs by spaces in code fragments. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that acts # as commands in the documentation. An alias has the form "name=value". # For example adding "sideeffect=\par Side Effects:\n" will allow you to # put the command \sideeffect (or @sideeffect) in the documentation, which # will result in a user-defined paragraph with heading "Side Effects:". # You can put \n's in the value part of an alias to insert newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding # "class=itcl::class" will allow you to use the command class in the # itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C # sources only. Doxygen will then generate output that is more tailored for C. # For instance, some of the names that are used will be different. The list # of all members will be omitted, etc. OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java # sources only. Doxygen will then generate output that is more tailored for # Java. For instance, namespaces will be presented as packages, qualified # scopes will look different, etc. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources only. Doxygen will then generate output that is more tailored for # Fortran. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for # VHDL. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. 
Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, # and language is one of the parsers supported by doxygen: IDL, Java, # Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, # C++. For instance to make doxygen treat .inc files as Fortran files (default # is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note # that for custom extensions you also need to set FILE_PATTERNS otherwise the # files are not read by doxygen. EXTENSION_MAPPING = # If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all # comments according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you # can mix doxygen, HTML, and XML commands with Markdown formatting. # Disable only in case of backward compatibilities issues. MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented classes, # or namespaces to their corresponding documentation. Such a link can be # prevented in individual cases by by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should # set this tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); v.s. # func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. # Doxygen will parse them like normal C++ but will assume all classes use public # instead of private inheritance when no explicit protection keyword is present. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES (the # default) will make doxygen replace the get and set methods by a property in # the documentation. This will only work if the methods are indeed getting or # setting a simple type. If this is not the case, or you want to show the # methods anyway, you should set this option to NO. IDL_PROPERTY_SUPPORT = NO # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES, then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. DISTRIBUTE_GROUP_DOC = NO # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a # subgroup of that type (e.g. under the Public Functions section). Set it to # NO to prevent subgrouping. Alternatively, this can be done per class using # the \nosubgrouping command. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and # unions are shown inside the group in which they are included (e.g. 
using # @ingroup) instead of on a separate page (for HTML and Man pages) or # section (for LaTeX and RTF). INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and # unions with only public data fields will be shown inline in the documentation # of the scope in which they are defined (i.e. file, namespace, or group # documentation), provided this scope is documented. If set to NO (the default), # structs, classes, and unions are shown on a separate page (for HTML and Man # pages) or section (for LaTeX and RTF). INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum # is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically # be useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. TYPEDEF_HIDES_STRUCT = YES # Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be # set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given # their name and scope. Since this can be an expensive process and often the # same symbol appear multiple times in the code, doxygen keeps a cache of # pre-resolved symbols. If the cache is too small doxygen will become slower. # If the cache is too large, memory is wasted. The cache size is given by this # formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0, # corresponding to a cache size of 2^16 = 65536 symbols. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in # documentation are documented, even if no documentation was available. # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES all members with package or internal # scope will be included in the documentation. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES all static members of a file # will be included in the documentation. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) # defined locally in source files will be included in the documentation. # If set to NO only classes defined in header files are included. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. When set to YES local # methods, which are defined in the implementation section but not in # the interface are included in the documentation. # If set to NO (the default) only methods in the interface are included. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base # name of the file that contains the anonymous namespace. By default # anonymous namespaces are hidden. 
EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all # undocumented members of documented classes, files or namespaces. # If set to NO (the default) these members will be included in the # various overviews, but no documentation section is generated. # This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. # If set to NO (the default) these classes will be included in the various # overviews. This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all # friend (class|struct|union) declarations. # If set to NO (the default) these declarations will be included in the # documentation. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any # documentation blocks found inside the body of a function. # If set to NO (the default) these blocks will be appended to the # function's detailed documentation block. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation # that is typed after a \internal command is included. If the tag is set # to NO (the default) then the documentation will be excluded. # Set it to YES to include the internal documentation. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate # file names in lower-case letters. If set to YES upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen # will show members with their full class and namespace scopes in the # documentation. If set to YES the scope will be hidden. HIDE_SCOPE_NAMES = NO # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen # will put a list of the files that are included by a file in the documentation # of that file. SHOW_INCLUDE_FILES = YES # If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen # will list include files with double quotes in the documentation # rather than with sharp brackets. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES (the default) then a tag [inline] # is inserted in the documentation for inline members. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen # will sort the (detailed) documentation of file and class members # alphabetically by member name. If set to NO the members will appear in # declaration order. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the # brief documentation of file, namespace and class members alphabetically # by member name. If set to NO (the default) the members will appear in # declaration order. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen # will sort the (brief and detailed) documentation of class members so that # constructors and destructors are listed first. If set to NO (the default) # the constructors will appear in the respective orders defined by # SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. # This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO # and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. 
SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the # hierarchy of group names into alphabetical order. If set to NO (the default) # the group names will appear in their defined order. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be # sorted by fully-qualified names, including namespaces. If set to # NO (the default), the class list will be sorted only by class name, # not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the # alphabetical list. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to # do proper type resolution of all parameters of a function it will reject a # match between the prototype and the implementation of a member function even # if there is only one candidate or it is obvious which candidate to choose # by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen # will still accept a match between prototype and implementation in such cases. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or # disable (NO) the todo list. This list is created by putting \todo # commands in the documentation. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or # disable (NO) the test list. This list is created by putting \test # commands in the documentation. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or # disable (NO) the bug list. This list is created by putting \bug # commands in the documentation. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or # disable (NO) the deprecated list. This list is created by putting # \deprecated commands in the documentation. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional # documentation sections, marked by \if section-label ... \endif # and \cond section-label ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines # the initial value of a variable or macro consists of for it to appear in # the documentation. If the initializer consists of more lines than specified # here it will be hidden. Use a value of 0 to hide initializers completely. # The appearance of the initializer of individual variables and macros in the # documentation can be controlled using \showinitializer or \hideinitializer # command in the documentation regardless of this setting. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated # at the bottom of the documentation of classes and structs. If set to YES the # list will mention the files that were used to generate the documentation. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. # This will remove the Files entry from the Quick Index and from the # Folder Tree View (if specified). The default is YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the # Namespaces page. # This will remove the Namespaces entry from the Quick Index # and from the Folder Tree View (if specified). The default is YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). 
Doxygen will invoke the program by executing (via # popen()) the command , where is the value of # the FILE_VERSION_FILTER tag, and is the name of an input file # provided by doxygen. Whatever the program writes to standard output # is used as the file version. See the manual for examples. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. # You can optionally specify a file name after the option, if omitted # DoxygenLayout.xml will be used as the name of the layout file. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files # containing the references data. This must be a list of .bib files. The # .bib extension is automatically appended if omitted. Using this command # requires the bibtex tool to be installed. See also # http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style # of the bibliography can be controlled using LATEX_BIB_STYLE. To use this # feature you need bibtex and perl available in the search path. Do not use # file names with spaces, bibtex cannot handle them. CITE_BIB_FILES = #--------------------------------------------------------------------------- # configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated # by doxygen. Possible values are YES and NO. If left blank NO is used. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated by doxygen. Possible values are YES and NO. If left blank # NO is used. WARNINGS = YES # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings # for undocumented members. If EXTRACT_ALL is set to YES then this flag will # automatically be disabled. WARN_IF_UNDOCUMENTED = YES # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some # parameters in a documented function, or documenting parameters that # don't exist or using markup commands wrongly. WARN_IF_DOC_ERROR = YES # The WARN_NO_PARAMDOC option can be enabled to get warnings for # functions that are documented, but have no documentation for their parameters # or return value. If set to NO (the default) doxygen will only warn about # wrong or incomplete parameter documentation, but not about the absence of # documentation. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that # doxygen can produce. The string should contain the $file, $line, and $text # tags, which will be replaced by the file and line number from which the # warning originated and the warning text. Optionally the format may contain # $version, which will be replaced by the version of the file (if it could # be obtained via FILE_VERSION_FILTER) WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning # and error messages should be written. If left blank the output is written # to stderr. 
WARN_LOGFILE = #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag can be used to specify the files and/or directories that contain # documented source files. You may enter file names like "myfile.cpp" or # directories like "/usr/src/myproject". Separate the files or directories # with spaces. INPUT = ../src # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is # also the default input encoding. Doxygen uses libiconv (or the iconv built # into libc) for the transcoding. See http://www.gnu.org/software/libiconv for # the list of possible encodings. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank the following patterns are tested: # *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh # *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py # *.f90 *.f *.for *.vhd *.vhdl FILE_PATTERNS = *.h # The RECURSIVE tag can be used to turn specify whether or not subdirectories # should be searched for input files as well. Possible values are YES and NO. # If left blank NO is used. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = ../src/gpuarray/wincompat \ ../src/gpuarray/compat.h \ ../src/private_config.h # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. Note that the wildcards are matched # against the file with absolute path, so to exclude all test directories # for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or # directories that contain example code fragments that are included (see # the \include command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank all files are included. EXAMPLE_PATTERNS = # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude # commands irrespective of the value of the RECURSIVE tag. # Possible values are YES and NO. If left blank NO is used. 
EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or # directories that contain image that are included in the documentation (see # the \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command , where # is the value of the INPUT_FILTER tag, and is the name of an # input file. Doxygen will then use the output that the filter program writes # to standard output. # If FILTER_PATTERNS is specified, this tag will be # ignored. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. # Doxygen will compare the file name with each pattern and apply the # filter if there is a match. # The filters are a list of the form: # pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further # info on how filters are used. If FILTER_PATTERNS is empty or if # non of the patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will be used to filter the input files when producing source # files to browse (i.e. when SOURCE_BROWSER is set to YES). FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) # and it is also possible to disable source filtering for a specific pattern # using *.ext= (so without naming a filter). This option only has effect when # FILTER_SOURCE_FILES is enabled. FILTER_SOURCE_PATTERNS = # If the USE_MD_FILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page (index.html). # This can be useful if you have a project on for instance GitHub and want reuse # the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will # be generated. Documented entities will be cross-referenced with these sources. # Note: To get rid of all source code in the generated output, make sure also # VERBATIM_HEADERS is set to NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body # of functions and classes directly in the documentation. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct # doxygen to hide any special comment blocks from generated source code # fragments. Normal C, C++ and Fortran comments will always remain visible. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES # then for each documented function all documented # functions referencing it will be listed. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES # then for each documented function all documented entities # called/used by that function will be listed. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES (the default) # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from # functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will # link to the source code. # Otherwise they will link to the documentation. 
REFERENCES_LINK_SOURCE = YES # If the USE_HTAGS tag is set to YES then the references to source code # will point to the HTML generated by the htags(1) tool instead of doxygen # built-in source browser. The htags tool is part of GNU's global source # tagging system (see http://www.gnu.org/software/global/global.html). You # will need version 4.8.6 or higher. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen # will generate a verbatim copy of the header file for each class for # which an include is specified. Set to NO to disable this. VERBATIM_HEADERS = YES #--------------------------------------------------------------------------- # configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index # of all compounds will be generated. Enable this if the project # contains a lot of classes, structs, unions or interfaces. ALPHABETICAL_INDEX = YES # If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then # the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns # in which this list will be split (can be a number in the range [1..20]) COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that # should be ignored while generating the index headers. IGNORE_PREFIX = #--------------------------------------------------------------------------- # configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES (the default) Doxygen will # generate HTML output. GENERATE_HTML = NO # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `html' will be used as the default path. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for # each generated HTML page (for example: .htm,.php,.asp). If it is left blank # doxygen will generate files with .html extension. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a personal HTML header for # each generated HTML page. If it is left blank doxygen will generate a # standard header. Note that when using a custom header you are responsible # for the proper inclusion of any scripts and style sheets that doxygen # needs, which is dependent on the configuration options used. # It is advised to generate a default header using "doxygen -w html # header.html footer.html stylesheet.css YourConfigFile" and then modify # that header. Note that the header is subject to change so you typically # have to redo this when upgrading to a newer version of doxygen or when # changing the value of configuration settings such as GENERATE_TREEVIEW! HTML_HEADER = # The HTML_FOOTER tag can be used to specify a personal HTML footer for # each generated HTML page. If it is left blank doxygen will generate a # standard footer. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading # style sheet that is used by each HTML page. It can be used to # fine-tune the look of the HTML output. If left blank doxygen will # generate a default style sheet. 
Note that it is recommended to use # HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this # tag will in the future become obsolete. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify an additional # user-defined cascading style sheet that is included after the standard # style sheets created by doxygen. Using this option one can overrule # certain style aspects. This is preferred over using HTML_STYLESHEET # since it does not replace the standard style sheet and is therefor more # robust against future updates. Doxygen will copy the style sheet file to # the output directory. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that # the files will be copied as-is; there are no commands or markers available. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. # Doxygen will adjust the colors in the style sheet and background images # according to this color. Hue is specified as an angle on a colorwheel, # see http://en.wikipedia.org/wiki/Hue for more information. # For instance the value 0 represents red, 60 is yellow, 120 is green, # 180 is cyan, 240 is blue, 300 purple, and 360 is red again. # The allowed range is 0 to 359. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of # the colors in the HTML output. For a value of 0 the output will use # grayscales only. A value of 255 will produce the most vivid colors. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to # the luminance component of the colors in the HTML output. Values below # 100 gradually make the output lighter, whereas values above 100 make # the output darker. The value divided by 100 is the actual gamma applied, # so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, # and 100 does not change the gamma. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting # this to NO can help when comparing the output of multiple runs. HTML_TIMESTAMP = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of # entries shown in the various tree structured indices initially; the user # can expand and collapse entries dynamically later on. Doxygen will expand # the tree to such a level that at most the specified number of entries are # visible (unless a fully collapsed tree already exceeds this amount). # So setting the number of entries 1 will produce a full collapsed tree by # default. 0 is a special value representing an infinite number of entries # and will result in a full expanded tree by default. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files # will be generated that can be used as input for Apple's Xcode 3 # integrated development environment, introduced with OSX 10.5 (Leopard). 
# To create a documentation set, doxygen will generate a Makefile in the # HTML output directory. Running make will produce the docset in that # directory and running "make install" will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find # it at startup. # See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. GENERATE_DOCSET = NO # When GENERATE_DOCSET tag is set to YES, this tag determines the name of the # feed. A documentation feed provides an umbrella under which multiple # documentation sets from a single provider (such as a company or product suite) # can be grouped. DOCSET_FEEDNAME = "Doxygen generated docs" # When GENERATE_DOCSET tag is set to YES, this tag specifies a string that # should uniquely identify the documentation set bundle. This should be a # reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen # will append .docset to the name. DOCSET_BUNDLE_ID = org.doxygen.Project # When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely # identify the documentation publisher. This should be a reverse domain-name # style string, e.g. com.mycompany.MyDocSet.documentation. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compiled HTML help file (.chm) # of the generated HTML documentation. GENERATE_HTMLHELP = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can # be used to specify the file name of the resulting .chm file. You # can add a path in front of the file if the result should not be # written to the html output directory. CHM_FILE = # If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can # be used to specify the location (absolute path including file name) of # the HTML help compiler (hhc.exe). If non-empty doxygen will try to run # the HTML help compiler on the generated index.hhp. HHC_LOCATION = # If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag # controls if a separate .chi index file is generated (YES) or that # it should be included in the master .chm file (NO). GENERATE_CHI = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING # is used to encode HtmlHelp index (hhk), content (hhc) and project file # content. CHM_INDEX_ENCODING = # If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag # controls whether a binary table of contents is generated (YES) or a # normal table of contents (NO) in the .chm file. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members # to the contents of the HTML help documentation and to the tree view. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated # that can be used as input for Qt's qhelpgenerator to generate a # Qt Compressed Help (.qch) of the generated HTML documentation. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can # be used to specify the file name of the resulting .qch file. # The path specified is relative to the HTML output folder. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating # Qt Help Project output. 
For more information please see # http://doc.trolltech.com/qthelpproject.html#namespace QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#virtual-folders QHP_VIRTUAL_FOLDER = doc # If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to # add. For more information please see # http://doc.trolltech.com/qthelpproject.html#custom-filters QHP_CUST_FILTER_NAME = # The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see # # Qt Help Project / Custom Filters. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's # filter section matches. # # Qt Help Project / Filter Attributes. QHP_SECT_FILTER_ATTRS = # If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can # be used to specify the location of Qt's qhelpgenerator. # If non-empty doxygen will try to run qhelpgenerator on the generated # .qhp file. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files # will be generated, which together with the HTML files, form an Eclipse help # plugin. To install this plugin and make it available under the help contents # menu in Eclipse, the contents of the directory containing the HTML and XML # files needs to be copied into the plugins directory of eclipse. The name of # the directory within the plugins directory should be the same as # the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before # the help appears. GENERATE_ECLIPSEHELP = NO # A unique identifier for the eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have # this name. ECLIPSE_DOC_ID = org.doxygen.Project # The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) # at top of each HTML page. The value NO (the default) enables the index and # the value YES disables it. Since the tabs have the same information as the # navigation tree you can set this option to NO if you already set # GENERATE_TREEVIEW to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. # If the tag value is set to YES, a side panel will be generated # containing a tree-like index structure (just like the one that # is generated for HTML Help). For this to work a browser that supports # JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). # Windows users are probably better off using the HTML help feature. # Since the tree basically has the same information as the tab index you # could consider to set DISABLE_INDEX to NO when enabling this option. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values # (range [0,1..20]) that doxygen will group on one line in the generated HTML # documentation. Note that a value of 0 will completely suppress the enum # values from appearing in the overview section. ENUM_VALUES_PER_LINE = 4 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be # used to set the initial width (in pixels) of the frame in which the tree # is shown. TREEVIEW_WIDTH = 250 # When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open # links to external symbols imported via tag files in a separate window. 
EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of Latex formulas included # as images in the HTML documentation. The default is 10. Note that # when you change the font size after a successful doxygen run you need # to manually remove any form_*.png images from the HTML output directory # to force them to be regenerated. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are # not supported properly for IE 6.0, but are supported on all modern browsers. # Note that when changing this option you need to delete any form_*.png files # in the HTML output before the changes have effect. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax # (see http://www.mathjax.org) which uses client side Javascript for the # rendering instead of using prerendered bitmaps. Use this if you do not # have LaTeX installed or if you want to formulas look prettier in the HTML # output. When enabled you may also need to install MathJax separately and # configure the path to it using the MATHJAX_RELPATH option. USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # thA MathJax output. Supported types are HTML-CSS, NativeMML (i.e. MathML) and # SVG. The default value is HTML-CSS, which is slower, but has the best # compatibility. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the # HTML output directory using the MATHJAX_RELPATH option. The destination # directory should contain the MathJax.js script. For instance, if the mathjax # directory is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to # the MathJax Content Delivery Network so you can quickly see the result without # installing MathJax. # However, it is strongly recommended to install a local # copy of MathJax from http://www.mathjax.org before deployment. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or MathJax extension # names that should be enabled during MathJax rendering. MATHJAX_EXTENSIONS = # When the SEARCHENGINE tag is enabled doxygen will generate a search box # for the HTML output. The underlying search engine uses javascript # and DHTML and should work on any modern browser. Note that when using # HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets # (GENERATE_DOCSET) there is already a search function so this one should # typically be disabled. For large projects the javascript based search engine # can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. SEARCHENGINE = YES # When the SERVER_BASED_SEARCH tag is enabled the search engine will be # implemented using a web server instead of a web client using Javascript. # There are two flavours of web server based search depending on the # EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for # searching and an index file used by the script. When EXTERNAL_SEARCH is # enabled the indexing and searching needs to be provided by external tools. # See the manual for details. SERVER_BASED_SEARCH = NO # When EXTERNAL_SEARCH is enabled doxygen will no longer generate the PHP # script for searching. Instead the search results are written to an XML file # which needs to be processed by an external indexer. 
Doxygen will invoke an # external search engine pointed to by the SEARCHENGINE_URL option to obtain # the search results. Doxygen ships with an example indexer (doxyindexer) and # search engine (doxysearch.cgi) which are based on the open source search engine # library Xapian. See the manual for configuration details. EXTERNAL_SEARCH = NO # The SEARCHENGINE_URL should point to a search engine hosted by a web server # which will returned the search results when EXTERNAL_SEARCH is enabled. # Doxygen ships with an example search engine (doxysearch) which is based on # the open source search engine library Xapian. See the manual for configuration # details. SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the # SEARCHDATA_FILE tag the name of this file can be specified. SEARCHDATA_FILE = searchdata.xml # When SERVER_BASED_SEARCH AND EXTERNAL_SEARCH are both enabled the # EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is # useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple # projects and redirect the results back to the right project. EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are # all added to the same external search index. Each project needs to have a # unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id # of to a relative location where the documentation can be found. # The format is: EXTRA_SEARCH_MAPPINGS = id1=loc1 id2=loc2 ... EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # configuration options related to the LaTeX output #--------------------------------------------------------------------------- # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will # generate Latex output. GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `latex' will be used as the default path. LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. If left blank `latex' will be used as the default command name. # Note that when enabling USE_PDFLATEX this option is only used for # generating bitmaps for formulas in the HTML output, but not in the # Makefile that is written to the output directory. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to # generate index for LaTeX. If left blank `makeindex' will be used as the # default command name. MAKEINDEX_CMD_NAME = makeindex # If the COMPACT_LATEX tag is set to YES Doxygen generates more compact # LaTeX documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_LATEX = NO # The PAPER_TYPE tag can be used to set the paper type that is used # by the printer. Possible values are: a4, letter, legal and # executive. If left blank a4wide will be used. PAPER_TYPE = a4 # The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX # packages that should be included in the LaTeX output. EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for # the generated latex document. The header should contain everything until # the first chapter. 
If it is left blank doxygen will generate a # standard header. Notice: only use this tag if you know what you are doing! LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for # the generated latex document. The footer should contain everything after # the last chapter. If it is left blank doxygen will generate a # standard footer. Notice: only use this tag if you know what you are doing! LATEX_FOOTER = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated # is prepared for conversion to pdf (using ps2pdf). The pdf file will # contain links (just like the HTML output) instead of page references # This makes the output suitable for online browsing using a pdf viewer. PDF_HYPERLINKS = YES # If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of # plain latex in the generated Makefile. Set this option to YES to get a # higher quality PDF documentation. USE_PDFLATEX = YES # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. # command to the generated LaTeX files. This will instruct LaTeX to keep # running if errors occur, instead of asking the user for help. # This option is also used when generating formulas in HTML. LATEX_BATCHMODE = NO # If LATEX_HIDE_INDICES is set to YES then doxygen will not # include the index chapters (such as File Index, Compound Index, etc.) # in the output. LATEX_HIDE_INDICES = NO # If LATEX_SOURCE_CODE is set to YES then doxygen will include # source code with syntax highlighting in the LaTeX output. # Note that which sources are shown also depends on other settings # such as SOURCE_BROWSER. LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See # http://en.wikipedia.org/wiki/BibTeX for more info. LATEX_BIB_STYLE = plain #--------------------------------------------------------------------------- # configuration options related to the RTF output #--------------------------------------------------------------------------- # If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output # The RTF output is optimized for Word 97 and may not look very pretty with # other RTF readers or editors. GENERATE_RTF = NO # The RTF_OUTPUT tag is used to specify where the RTF docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `rtf' will be used as the default path. RTF_OUTPUT = rtf # If the COMPACT_RTF tag is set to YES Doxygen generates more compact # RTF documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_RTF = NO # If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated # will contain hyperlink fields. The RTF file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using WORD or other # programs which support those fields. # Note: wordpad (write) and others do not support links. RTF_HYPERLINKS = NO # Load style sheet definitions from file. Syntax is similar to doxygen's # config file, i.e. a series of assignments. You only have to provide # replacements, missing definitions are set to their default value. RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an rtf document. # Syntax is similar to doxygen's config file. 
RTF_EXTENSIONS_FILE = #--------------------------------------------------------------------------- # configuration options related to the man page output #--------------------------------------------------------------------------- # If the GENERATE_MAN tag is set to YES (the default) Doxygen will # generate man pages GENERATE_MAN = NO # The MAN_OUTPUT tag is used to specify where the man pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `man' will be used as the default path. MAN_OUTPUT = man # The MAN_EXTENSION tag determines the extension that is added to # the generated man pages (default is the subroutine's section .3) MAN_EXTENSION = .3 # If the MAN_LINKS tag is set to YES and Doxygen generates man output, # then it will generate one additional man file for each entity # documented in the real man page(s). These additional files # only source the real man page, but without them the man command # would be unable to find the correct page. The default is NO. MAN_LINKS = NO #--------------------------------------------------------------------------- # configuration options related to the XML output #--------------------------------------------------------------------------- # If the GENERATE_XML tag is set to YES Doxygen will # generate an XML file that captures the structure of # the code including all documentation. GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `xml' will be used as the default path. XML_OUTPUT = xml # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that # enabling this will significantly increase the size of the XML output. XML_PROGRAMLISTING = YES #--------------------------------------------------------------------------- # configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will # generate an AutoGen Definitions (see autogen.sf.net) file # that captures the structure of the code including all # documentation. Note that this feature is still experimental # and incomplete at the moment. GENERATE_AUTOGEN_DEF = NO #--------------------------------------------------------------------------- # configuration options related to the Perl module output #--------------------------------------------------------------------------- # If the GENERATE_PERLMOD tag is set to YES Doxygen will # generate a Perl module file that captures the structure of # the code including all documentation. Note that this # feature is still experimental and incomplete at the # moment. GENERATE_PERLMOD = NO # If the PERLMOD_LATEX tag is set to YES Doxygen will generate # the necessary Makefile rules, Perl scripts and LaTeX code to be able # to generate PDF and DVI output from the Perl module output. PERLMOD_LATEX = NO # If the PERLMOD_PRETTY tag is set to YES the Perl module output will be # nicely formatted so it can be parsed by a human reader. # This is useful # if you want to understand what is going on. # On the other hand, if this # tag is set to NO the size of the Perl module output will be much smaller # and Perl will parse it just the same. 
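#---------------------------------------------------------------------------
# Editor's aside (not part of the stock Doxygen template): the XML output
# switched on above (GENERATE_XML = YES, XML_OUTPUT = xml) is what connects
# this Doxyfile to the Sphinx build. doc/Makefile runs doxygen first (its
# "doxy" target), and the .rst stubs under doc/c_api/file then pull the
# extracted declarations back in through "doxygenfile" directives,
# presumably via the Breathe extension. A minimal sketch of that round
# trip, assuming it is run from the doc/ directory:
#
#   doxygen Doxyfile                     # writes the XML descriptions
#   sphinx-build -b html . _build/html   # stubs read them via doxygenfile
#
# The exact XML location also depends on OUTPUT_DIRECTORY, which is set
# earlier in this file, so treat the two commands as illustrative only.
#---------------------------------------------------------------------------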
PERLMOD_PRETTY = YES # The names of the make variables in the generated doxyrules.make file # are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. # This is useful so different doxyrules.make files included by the same # Makefile don't overwrite each other's variables. PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor #--------------------------------------------------------------------------- # If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will # evaluate all C-preprocessor directives found in the sources and include # files. ENABLE_PREPROCESSING = YES # If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro # names in the source code. If set to NO (the default) only conditional # compilation will be performed. Macro expansion can be done in a controlled # way by setting EXPAND_ONLY_PREDEF to YES. MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES # then the macro expansion is limited to the macros specified with the # PREDEFINED and EXPAND_AS_DEFINED tags. EXPAND_ONLY_PREDEF = YES # If the SEARCH_INCLUDES tag is set to YES (the default) the includes files # pointed to by INCLUDE_PATH will be searched when a #include is found. SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by # the preprocessor. INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the # directories. If left blank, the patterns specified with FILE_PATTERNS will # be used. INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that # are defined before the preprocessor is started (similar to the -D option of # gcc). The argument of the tag is a list of macros of the form: name # or name=definition (no spaces). If the definition and the = are # omitted =1 is assumed. To prevent a macro definition from being # undefined via #undef or recursively expanded use the := operator # instead of the = operator. PREDEFINED = "GPUARRAY_PUBLIC=" "GPUARRAY_LOCAL=/** \private */" # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then # this tag can be used to specify a list of macro names that should be expanded. # The macro definition that is found in the sources will be used. # Use the PREDEFINED tag if you want to use a different macro definition that # overrules the definition found in the source code. EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then # doxygen's preprocessor will remove all references to function-like macros # that are alone on a line, have an all uppercase name, and do not end with a # semicolon, because these will confuse the parser if not removed. SKIP_FUNCTION_MACROS = YES #--------------------------------------------------------------------------- # Configuration::additions related to external references #--------------------------------------------------------------------------- # The TAGFILES option can be used to specify one or more tagfiles. For each # tag file the location of the external documentation should be added. The # format of a tag file without this location is as follows: # # TAGFILES = file1 file2 ... 
# Adding location for the tag files is done as follows: # # TAGFILES = file1=loc1 "file2 = loc2" ... # where "loc1" and "loc2" can be relative or absolute paths # or URLs. Note that each tag file must have a unique name (where the name does # NOT include the path). If a tag file is not located in the directory in which # doxygen is run, you must also specify the path to the tagfile here. TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create # a tag file that is based on the input files it reads. GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES all external classes will be listed # in the class index. If set to NO only the inherited external classes # will be listed. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed # in the modules index. If set to NO, only the current project's groups will # be listed. EXTERNAL_GROUPS = YES # The PERL_PATH should be the absolute path and name of the perl script # interpreter (i.e. the result of `which perl'). PERL_PATH = /usr/bin/perl #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- # If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will # generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base # or super classes. Setting the tag to NO turns the diagrams off. Note that # this option also works with HAVE_DOT disabled, but it is recommended to # install and use dot, since it yields more powerful graphs. CLASS_DIAGRAMS = YES # You can define message sequence charts within doxygen comments using the \msc # command. Doxygen will then run the mscgen tool (see # http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the # documentation. The MSCGEN_PATH tag allows you to specify the directory where # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. MSCGEN_PATH = # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz, a graph visualization # toolkit from AT&T and Lucent Bell Labs. The other options in this section # have no effect if this option is set to NO (the default) HAVE_DOT = NO # The DOT_NUM_THREADS specifies the number of dot invocations doxygen is # allowed to run in parallel. When set to 0 (the default) doxygen will # base this on the number of processors available in the system. You can set it # explicitly to a value larger than 0 to get control over the balance # between CPU load and processing speed. DOT_NUM_THREADS = 0 # By default doxygen will use the Helvetica font for all dot files that # doxygen generates. When you want a differently looking font you can specify # the font name using DOT_FONTNAME. You need to make sure dot is able to find # the font, which can be done by putting it in a standard location or by setting # the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the # directory containing the font. DOT_FONTNAME = Helvetica # The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. # The default size is 10pt. DOT_FONTSIZE = 10 # By default doxygen will tell dot to use the Helvetica font. 
# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to # set the path where dot can find it. DOT_FONTPATH = # If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect inheritance relations. Setting this tag to YES will force the # CLASS_DIAGRAMS tag to NO. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect implementation dependencies (inheritance, containment, and # class references variables) of the class with other documented classes. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen # will generate a graph for groups, showing the direct groups dependencies GROUP_GRAPHS = YES # If the UML_LOOK tag is set to YES doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling # Language. UML_LOOK = NO # If the UML_LOOK tag is enabled, the fields and methods are shown inside # the class node. If there are many fields or methods and many nodes the # graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS # threshold limits the number of items for each type to make the size more # managable. Set this to 0 for no limit. Note that the threshold may be # exceeded by 50% before the limit is enforced. UML_LIMIT_NUM_FIELDS = 10 # If set to YES, the inheritance and collaboration graphs will show the # relations between templates and their instances. TEMPLATE_RELATIONS = NO # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT # tags are set to YES then doxygen will generate a graph for each documented # file showing the direct and indirect include dependencies of the file with # other documented files. INCLUDE_GRAPH = YES # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and # HAVE_DOT tags are set to YES then doxygen will generate a graph for each # documented header file showing the documented files that directly or # indirectly include this file. INCLUDED_BY_GRAPH = YES # If the CALL_GRAPH and HAVE_DOT options are set to YES then # doxygen will generate a call dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable call graphs # for selected functions only using the \callgraph command. CALL_GRAPH = NO # If the CALLER_GRAPH and HAVE_DOT tags are set to YES then # doxygen will generate a caller dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable caller # graphs for selected functions only using the \callergraph command. CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen # will generate a graphical hierarchy of all classes instead of a textual one. GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES # then doxygen will show the dependencies a directory has on other directories # in a graphical way. The dependency relations are determined by the #include # relations between the files in the directories. DIRECTORY_GRAPH = YES # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. Possible values are svg, png, jpg, or gif. 
# If left blank png will be used. If you choose svg you need to set # HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible in IE 9+ (other browsers do not have this requirement). DOT_IMAGE_FORMAT = png # If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to # enable generation of interactive SVG images that allow zooming and panning. # Note that this requires a modern browser other than Internet Explorer. # Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you # need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible. Older versions of IE do not have SVG support. INTERACTIVE_SVG = NO # The tag DOT_PATH can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the # \dotfile command). DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the # \mscfile command). MSCFILE_DIRS = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of # nodes that will be shown in the graph. If the number of nodes in a graph # becomes larger than this value, doxygen will truncate the graph, which is # visualized by representing a node as a red box. Note that doxygen if the # number of direct children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note # that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. DOT_GRAPH_MAX_NODES = 50 # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the # graphs generated by dot. A depth value of 3 means that only nodes reachable # from the root by following a path via at most 3 edges will be shown. Nodes # that lay further from the root node will be omitted. Note that setting this # option to 1 or 2 may greatly reduce the computation time needed for large # code bases. Also note that the size of a graph can be further restricted by # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. MAX_DOT_GRAPH_DEPTH = 0 # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent # background. This is disabled by default, because dot on Windows does not # seem to support this out of the box. Warning: Depending on the platform used, # enabling this option may lead to badly anti-aliased labels on the edges of # a graph (i.e. they become hard to read). DOT_TRANSPARENT = NO # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) # support this, this feature is disabled by default. DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will # generate a legend page explaining the meaning of the various boxes and # arrows in the dot generated graphs. GENERATE_LEGEND = YES # If the DOT_CLEANUP tag is set to YES (the default) Doxygen will # remove the intermediate dot files that are used to generate # the various graphs. 
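#---------------------------------------------------------------------------
# Editor's aside (illustrative only, not part of the stock template): the
# PREDEFINED entries in the preprocessor section above exist to keep the
# library's export macros out of the documented prototypes. Assuming a
# header declared something like the two hypothetical lines
#
#   GPUARRAY_PUBLIC int example_public_call(void);
#   GPUARRAY_LOCAL  int example_internal_helper(void);
#
# then, with MACRO_EXPANSION = YES and EXPAND_ONLY_PREDEF = YES, doxygen
# expands only the names listed in PREDEFINED: GPUARRAY_PUBLIC expands to
# nothing and disappears from the prototype, while GPUARRAY_LOCAL expands
# to "/** \private */" so the symbol is treated as private and is normally
# kept out of the generated documentation. Both prototypes are invented for
# illustration; see the real headers under ../src for actual declarations.
#---------------------------------------------------------------------------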
DOT_CLEANUP = YES libgpuarray-0.7.6/doc/Makefile000066400000000000000000000132211326743622600162640ustar00rootroot00000000000000# Makefile for Sphinx documentation # DOXYGEN = doxygen # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . .PHONY: help clean doxy html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: -rm -rf $(BUILDDIR)/* doxy: ($(DOXYGEN) Doxyfile || /Applications/Doxygen.app/Contents/Resources/doxygen Doxyfile) html: doxy $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: doxy $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: doxy $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: doxy $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: doxy $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: doxy $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: doxy $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/gpuarray.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/gpuarray.qhc" devhelp: doxy $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." 
@echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/gpuarray" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/gpuarray" @echo "# devhelp" epub: doxy $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: doxy $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: doxy $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: doxy $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: doxy $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: doxy $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: doxy $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: doxy $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." libgpuarray-0.7.6/doc/_static/000077500000000000000000000000001326743622600162535ustar00rootroot00000000000000libgpuarray-0.7.6/doc/_static/.git_marker000066400000000000000000000000711326743622600203760ustar00rootroot00000000000000This file is there so that git will create the directory.libgpuarray-0.7.6/doc/_static/fix_rtd.css000066400000000000000000000003771326743622600204330ustar00rootroot00000000000000/* work around https://github.com/snide/sphinx_rtd_theme/issues/149 */ .rst-content table.field-list .field-body { padding-top: 8px; } .rst-versions-up { cursor: pointer; display: inline; } .wy-side-nav-search>div.version { color: white; }libgpuarray-0.7.6/doc/_static/version_switch.js000066400000000000000000000076721326743622600216730ustar00rootroot00000000000000// Create version selector for documentation top bar. (function() { var url = window.location.href; var base_dir = 'libgpuarray'; // directory containing doc // Default libgpuarray version: release and development. 
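// Editor's note (added for clarity): versions_dir maps the label shown in
// the switcher to the directory component used in that version's URLs. The
// entries below are only a fallback; when the docs are served over HTTP,
// the document-ready handler at the bottom of this file fetches
// libgpuarray_versions/versions.json from the server and merges it into
// this map before rebuilding the switcher, so that file is expected to
// carry the same label-to-directory shape, e.g. (hypothetical contents):
//   {"release": "libgpuarray", "dev": "libgpuarray_versions/dev",
//    "0.6": "libgpuarray_versions/0.6"}
// The "0.6" entry above is invented purely for illustration.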
var versions_dir = {"release": "libgpuarray", "dev": "libgpuarray_versions/dev"}; // If doc is run locally if (url.startsWith('file')) { base_dir = 'html'; versions_dir = {"local":"html", "test":"test"}; } var root_url = url.substring(0, url.search('/' + base_dir)) + '/'; // Regular expression to find libgpuarray version directory in URL. var version_regex = new RegExp("\\/" + base_dir + "(_versions\\/)?([_a-zA-Z.0-9]*)\\/"); // Get current version var current_version = url.match(version_regex)[0] current_version = current_version.substring(1, current_version.length - 1) // Add current version in case versions.json is unavailable if (current_version != "libgpuarray" && current_version != "html") { ver = current_version.replace("libgpuarray_versions/", "") versions_dir[ver] = current_version } function build_vswitch() { // Build HTML string for version selector, based on ReadTheDocs theme's versions.html var vlabel = current_version.replace("libgpuarray_versions/", ""); if (vlabel == 'libgpuarray') { vlabel = 'release'; } var vswitch = ['
']; vswitch.push(''); vswitch.push(''); vswitch.push('v: ', vlabel, ' '); vswitch.push(''); vswitch.push(''); vswitch.push('
'); vswitch.push('
'); vswitch.push('
Versions
'); for (var version in versions_dir) { var new_url = url.replace(url.match(version_regex)[0], '/' + versions_dir[version] + '/'); vswitch.push('
', version, '
'); } vswitch.push('
'); // vswitch.push('
'); // vswitch.push('
Downloads
'); // var pdf_url = root_url + current_version + "/libgpuarray.pdf" // vswitch.push('
', 'PDF', '
'); // vswitch.push('
'); vswitch.push('
'); vswitch.push('
On GitHub
'); var git_master = "https://github.com/Theano/libgpuarray" vswitch.push('
', 'Fork me', '
'); vswitch.push('
'); vswitch.push('
'); vswitch.push('
'); return vswitch.join(''); } function build_vswitch_up() { // Build HTML string for version selector, based on ReadTheDocs theme's versions.html var vlabel = current_version.replace("libgpuarray_versions/", ""); if (vlabel == 'libgpuarray') { vlabel = 'release'; } var vswitch = ['
']; vswitch.push(''); vswitch.push(vlabel); vswitch.push(''); vswitch.push(''); vswitch.push('
'); return vswitch.join(''); } // Create HTML for version switcher and assign to placeholder in layout.html. $(document).ready(function() { // Build default switcher $('.version_switcher_placeholder').html(build_vswitch()); $('.version_switcher_placeholder_up').html(build_vswitch_up()); // Check server for other doc versions and update switcher. if (url.startsWith('http')) { $.getJSON(root_url + 'libgpuarray_versions/versions.json', function(data){ $.each(data, function(version, dir) { versions_dir[version] = dir; }); $('.version_switcher_placeholder').html(build_vswitch()); $('.version_switcher_placeholder_up').html(build_vswitch_up()); }); } }); })(); libgpuarray-0.7.6/doc/_templates/000077500000000000000000000000001326743622600167625ustar00rootroot00000000000000libgpuarray-0.7.6/doc/_templates/layout.html000066400000000000000000000025021326743622600211640ustar00rootroot00000000000000{% extends "!layout.html" %} {% block footer %} {{ super() }} {% endblock %} libgpuarray-0.7.6/doc/c_api.rst000066400000000000000000000001341326743622600164300ustar00rootroot00000000000000C library reference =================== .. toctree:: c_api/grouplist c_api/filelist libgpuarray-0.7.6/doc/c_api/000077500000000000000000000000001326743622600157005ustar00rootroot00000000000000libgpuarray-0.7.6/doc/c_api/file/000077500000000000000000000000001326743622600166175ustar00rootroot00000000000000libgpuarray-0.7.6/doc/c_api/file/abi__version_8h.rst000066400000000000000000000001061326743622600224040ustar00rootroot00000000000000File abi_version.h ================== .. doxygenfile:: abi_version.h libgpuarray-0.7.6/doc/c_api/file/array_8h.rst000066400000000000000000000000641326743622600210660ustar00rootroot00000000000000File array.h ============ .. doxygenfile:: array.h libgpuarray-0.7.6/doc/c_api/file/blas_8h.rst000066400000000000000000000000611326743622600206660ustar00rootroot00000000000000File blas.h =========== .. doxygenfile:: blas.h libgpuarray-0.7.6/doc/c_api/file/buffer_8h.rst000066400000000000000000000000671326743622600212240ustar00rootroot00000000000000File buffer.h ============= .. doxygenfile:: buffer.h libgpuarray-0.7.6/doc/c_api/file/buffer__blas_8h.rst000066400000000000000000000001061326743622600223560ustar00rootroot00000000000000File buffer_blas.h ================== .. doxygenfile:: buffer_blas.h libgpuarray-0.7.6/doc/c_api/file/buffer__collectives_8h.rst000066400000000000000000000001331326743622600237510ustar00rootroot00000000000000File buffer_collectives.h ========================= .. doxygenfile:: buffer_collectives.h libgpuarray-0.7.6/doc/c_api/file/cache_8h.rst000066400000000000000000000000641326743622600210130ustar00rootroot00000000000000File cache.h ============ .. doxygenfile:: cache.h libgpuarray-0.7.6/doc/c_api/file/collectives_8h.rst000066400000000000000000000001061326743622600222610ustar00rootroot00000000000000File collectives.h ================== .. doxygenfile:: collectives.h libgpuarray-0.7.6/doc/c_api/file/config_8h.rst000066400000000000000000000000671326743622600212200ustar00rootroot00000000000000File config.h ============= .. doxygenfile:: config.h libgpuarray-0.7.6/doc/c_api/file/dyn__load_8h.rst000066400000000000000000000000751326743622600217020ustar00rootroot00000000000000File dyn_load.h =============== .. doxygenfile:: dyn_load.h libgpuarray-0.7.6/doc/c_api/file/elemwise_8h.rst000066400000000000000000000000751326743622600215640ustar00rootroot00000000000000File elemwise.h =============== .. 
doxygenfile:: elemwise.h libgpuarray-0.7.6/doc/c_api/file/error_8h.rst000066400000000000000000000000751326743622600211030ustar00rootroot00000000000000File error.h ============ .. doxygenfile:: gpuarray/error.h libgpuarray-0.7.6/doc/c_api/file/ext__cuda_8h.rst000066400000000000000000000000751326743622600217050ustar00rootroot00000000000000File ext_cuda.h =============== .. doxygenfile:: ext_cuda.h libgpuarray-0.7.6/doc/c_api/file/extension_8h.rst000066400000000000000000000001001326743622600217530ustar00rootroot00000000000000File extension.h ================ .. doxygenfile:: extension.h libgpuarray-0.7.6/doc/c_api/file/integerfactoring_8h.rst000066400000000000000000000001251326743622600233000ustar00rootroot00000000000000File integerfactoring.h ======================= .. doxygenfile:: integerfactoring.h libgpuarray-0.7.6/doc/c_api/file/kernel_8h.rst000066400000000000000000000000671326743622600212330ustar00rootroot00000000000000File kernel.h ============= .. doxygenfile:: kernel.h libgpuarray-0.7.6/doc/c_api/file/libclblas_8h.rst000066400000000000000000000001001326743622600216660ustar00rootroot00000000000000File libclblas.h ================ .. doxygenfile:: libclblas.h libgpuarray-0.7.6/doc/c_api/file/libclblast_8h.rst000066400000000000000000000001031326743622600220550ustar00rootroot00000000000000File libclblast.h ================= .. doxygenfile:: libclblast.h libgpuarray-0.7.6/doc/c_api/file/libcublas_8h.rst000066400000000000000000000001001326743622600216770ustar00rootroot00000000000000File libcublas.h ================ .. doxygenfile:: libcublas.h libgpuarray-0.7.6/doc/c_api/file/libcuda_8h.rst000066400000000000000000000000721326743622600213520ustar00rootroot00000000000000File libcuda.h ============== .. doxygenfile:: libcuda.h libgpuarray-0.7.6/doc/c_api/file/libnccl_8h.rst000066400000000000000000000000721326743622600213550ustar00rootroot00000000000000File libnccl.h ============== .. doxygenfile:: libnccl.h libgpuarray-0.7.6/doc/c_api/file/libnvrtc_8h.rst000066400000000000000000000000751326743622600215750ustar00rootroot00000000000000File libnvrtc.h =============== .. doxygenfile:: libnvrtc.h libgpuarray-0.7.6/doc/c_api/file/libopencl_8h.rst000066400000000000000000000001001326743622600217060ustar00rootroot00000000000000File libopencl.h ================ .. doxygenfile:: libopencl.h libgpuarray-0.7.6/doc/c_api/file/private_8h.rst000066400000000000000000000000721326743622600214210ustar00rootroot00000000000000File private.h ============== .. doxygenfile:: private.h libgpuarray-0.7.6/doc/c_api/file/private__cuda_8h.rst000066400000000000000000000001111326743622600225460ustar00rootroot00000000000000File private_cuda.h =================== .. doxygenfile:: private_cuda.h libgpuarray-0.7.6/doc/c_api/file/private__opencl_8h.rst000066400000000000000000000001171326743622600231200ustar00rootroot00000000000000File private_opencl.h ===================== .. doxygenfile:: private_opencl.h libgpuarray-0.7.6/doc/c_api/file/strb_8h.rst000066400000000000000000000000611326743622600207170ustar00rootroot00000000000000File strb.h =========== .. doxygenfile:: strb.h libgpuarray-0.7.6/doc/c_api/file/types_8h.rst000066400000000000000000000000641326743622600211140ustar00rootroot00000000000000File types.h ============ .. doxygenfile:: types.h libgpuarray-0.7.6/doc/c_api/file/util_8h.rst000066400000000000000000000000611326743622600207220ustar00rootroot00000000000000File util.h =========== .. 
doxygenfile:: util.h libgpuarray-0.7.6/doc/c_api/file/xxhash_8h.rst000066400000000000000000000000671326743622600212560ustar00rootroot00000000000000File xxhash.h ============= .. doxygenfile:: xxhash.h libgpuarray-0.7.6/doc/c_api/filelist.rst000066400000000000000000000000671326743622600202500ustar00rootroot00000000000000File list ========= .. toctree:: :glob: file/* libgpuarray-0.7.6/doc/c_api/group/000077500000000000000000000000001326743622600170345ustar00rootroot00000000000000libgpuarray-0.7.6/doc/c_api/group/group__aflags.rst000066400000000000000000000001011326743622600223660ustar00rootroot00000000000000Group aflags ============ .. doxygengroup:: aflags :no-link: libgpuarray-0.7.6/doc/c_api/group/group__alloc__flags.rst000066400000000000000000000001201326743622600235370ustar00rootroot00000000000000Group alloc_flags ================= .. doxygengroup:: alloc_flags :no-link: libgpuarray-0.7.6/doc/c_api/group/group__eflags.rst000066400000000000000000000001011326743622600223720ustar00rootroot00000000000000Group eflags ============ .. doxygengroup:: eflags :no-link: libgpuarray-0.7.6/doc/c_api/group/group__elem__call__flags.rst000066400000000000000000000001341326743622600245260ustar00rootroot00000000000000Group elem_call_flags ===================== .. doxygengroup:: elem_call_flags :no-link: libgpuarray-0.7.6/doc/c_api/group/group__elem__flags.rst000066400000000000000000000001151326743622600233730ustar00rootroot00000000000000Group elem_flags ================ .. doxygengroup:: elem_flags :no-link: libgpuarray-0.7.6/doc/c_api/group/group__props.rst000066400000000000000000000000761326743622600223070ustar00rootroot00000000000000Group props =========== .. doxygengroup:: props :no-link: libgpuarray-0.7.6/doc/c_api/grouplist.rst000066400000000000000000000000721326743622600204610ustar00rootroot00000000000000Group list ========== .. toctree:: :glob: group/* libgpuarray-0.7.6/doc/conf.py000066400000000000000000000227621326743622600161350ustar00rootroot00000000000000# -*- Coding: utf-8 -*- # # gpuarray documentation build configuration file, created by # sphinx-quickstart on Wed Nov 21 16:23:37 2012. # # This file is execfile()d with the current directory set to its containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import sys, os # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.abspath('..')) import versioneer # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.todo', 'sphinx.ext.napoleon', # 'sphinx.ext.linkcode', 'breathe'] todo_include_todos = True napoleon_google_docstring = False napoleon_include_special_with_doc = False # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. source_encoding = 'utf-8-sig' # The master toctree document. 
master_doc = 'index' # General information about the project. project = u'gpuarray' copyright = u'2012--2017, Arnaud Bergeron' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # We need this hokey-pokey because versioneer needs the current # directory to be the root of the project to work. _curpath = os.getcwd() os.chdir(os.path.dirname(os.path.dirname(__file__))) # The full version, including alpha/beta/rc tags. release = versioneer.get_version() # The short X.Y version. version = '.'.join(release.split('.')[:2]) os.chdir(_curpath) del _curpath # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. #language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_dirs = ['_build', 'scripts'] # The reST default role (used for this markup: `text`) to use for all documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. #show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. #modindex_common_prefix = [] # -- Options for HTML output --------------------------------------------------- if os.environ.get('READTHEDOCS') != 'True': try: import sphinx_rtd_theme except ImportError: pass else: html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] html_theme = 'sphinx_rtd_theme' def setup(app): app.add_stylesheet('fix_rtd.css') # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. #html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. #html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # A shorter title for the navigation bar. Default is the same as html_title. #html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. #html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. #html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. html_last_updated_fmt = '%b %d, %Y' # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. 
#html_additional_pages = {} # If false, no module index is generated. #html_domain_indices = True # If false, no index is generated. #html_use_index = True # If true, the index is split into individual pages for each letter. #html_split_index = False # If true, links to the reST sources are added to the pages. #html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. #html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. #html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. #html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = None # Output file base name for HTML help builder. htmlhelp_basename = 'gpuarraydoc' # Options for the linkcode extension # ---------------------------------- # Resolve function # This function is used to populate the (source) links in the API # XXX: This is broken for now since it doesn't work for cython modules def linkcode_resolve(domain, info): def find_source(): obj = sys.modules[info['module']] for part in info['fullname'].split('.'): obj = getattr(obj, part) import inspect import os fn = inspect.getsourcefile(obj) fn = os.path.relpath(fn, start=os.path.dirname(pygpu.__file__)) source, lineno = inspect.getsourcelines(obj) return fn, lineno, lineno + len(source) - 1 if domain != 'py' or not info['module']: return None try: filename = 'libgpuarray/pygpu/%s#L%d-L%d' % find_source() except Exception: filename = info['module'].replace('.', '/') + '.py' import subprocess tag = subprocess.Popen(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE, universal_newlines=True).communicate()[0][:-1] return "https://github.com/Theano/libgpuarray/blob/%s/%s" % (tag, filename) # -- Options for LaTeX output -------------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). #'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). #'pointsize': '10pt', # Additional stuff for the LaTeX preamble. #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', 'gpuarray.tex', u'libgpuarray Documentation', u'Arnaud Bergeron', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. #latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. #latex_use_parts = False # If true, show page references after internal links. #latex_show_pagerefs = False # If true, show URL addresses after external links. #latex_show_urls = False # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. #latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ ('index', 'gpuarray', u'libgpuarray Documentation', [u'Arnaud Bergeron'], 1) ] # If true, show URL addresses after external links. #man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ # Grouping the document tree into Texinfo files. 
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ('index', 'gpuarray', u'libgpuarray Documentation', u'Arnaud Bergeron', 'gpuarray', 'One line description of project.', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. #texinfo_appendices = [] # If false, no module index is generated. #texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. #texinfo_show_urls = 'footnote' breathe_projects = { "gpuarray": "_doxybuild/xml/", } breathe_default_project = "gpuarray" breathe_domain_by_extension = { "h": "c", "c": "c", } libgpuarray-0.7.6/doc/index.rst000066400000000000000000000007441326743622600164730ustar00rootroot00000000000000.. libgpuarray documentation master file, created by sphinx-quickstart on Wed Nov 21 16:23:37 2012. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to libgpuarray's documentation! ======================================= Contents: .. toctree:: :maxdepth: 2 why installation pyapi/pygpu c_api Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` libgpuarray-0.7.6/doc/installation.rst000066400000000000000000000167271326743622600200750ustar00rootroot00000000000000Installation ============ The library is routinely tested on linux and, less frequently, on Windows and Mac OS X. The OS most frequently tested are: - Debian 6 - Ubuntu 16.04 - macOS 10.12 - Windows 7 It should also work on any decently recent OS not listed here. If you get an error during the build on your favorite OS, please report it and we will attempt to fix it. Conda ----- The easiest way to install libgpuarray is with conda:: conda install pygpu This will also install the libgpuarray package automatically. This should work on Linux, Mac OS and Windows. This is also available in packages in conda-forge. They could be more up to date:: conda install -c conda-forge pygpu Build Requirements ------------------ - cmake >= 3.0 (cmake_). - a c99-compliant compiler (or MSVC if on windows). - (optional) libcheck (check_) to run the C tests. - (optional) python (python_) for the python bindings. - (optional) mako (mako_) for development or running the python bindings. - (optional) Cython >= 0.25 (cython_) for the python bindings. - (optional) nosetests (nosetests_) to run the python tests. Run Requirements ---------------- No matter what was available at build time, this library comes with dynamic loaders for the following libraries. You don't need to have any of this available, but you won't be able to use associated functionality. * For CUDA: - CUDA (cuda_) version 7.0 or more, with the appropriate driver - (optional) NCCL (nccl_) for the collectives interface * For OpenCL: - OpenCL version 1.2 or more - (optional) clBLAS (clblas_) or CLBlast (clblast_) for blas functionality .. note:: The OpenCL that comes with OS X is fundamentally broken and doesn't work with some of the kernels in the library. You can use it at your own risk, but don't report problems with it we can't fix them. Download -------- :: git clone https://github.com/Theano/libgpuarray.git cd libgpuarray Step-by-step install: system library (as admin) ----------------------------------------------- extract/clone the source to For libgpuarray: :: cd mkdir Build cd Build # you can pass -DCMAKE_INSTALL_PREFIX=/path/to/somewhere to install to an alternate location cmake .. 
-DCMAKE_BUILD_TYPE=Release # or Debug if you are investigating a crash make make install cd .. For pygpu: :: # This must be done after libgpuarray is installed as per instructions above. python setup.py build python setup.py install If you installed libgpuarray in a path that isn't a default one, you will need to specify where it is. Replace the first line by something like this: :: python setup.py build_ext -L $MY_PREFIX/lib -I $MY_PREFIX/include If installed globally under Linux (in /usr/local), you might have to run: .. code-block:: bash $ sudo ldconfig to make the linker know that there are new libraries available. You can also reboot the machine to do that. Step-by-step install: user library ---------------------------------- If you can not or do not want to install it for every user of that computer, you can install them in your home directory like this: :: cd rm -rf ~/.local/lib/libgpuarray* ~/.local/include/gpuarray ~/.local/lib/python*/site-packages/pygpu* rm -rf build Build mkdir Build cd Build cmake .. -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_BUILD_TYPE=Release make make install DEVICE="" make test cd .. # Run the following export and add them in your ~/.bashrc file export CPATH=$CPATH:~/.local/include export LIBRARY_PATH=$LIBRARY_PATH:~/.local/lib export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.local/lib python setup.py build python setup.py install --user cd DEVICE="" python -c "import pygpu;pygpu.test()" Change ``DEVICE=""`` to the GPU device you want to use for testing. Mac-specific instructions ------------------------- The only supported compiler is the clang version that comes with Xcode. Select the appropriate version of Xcode for you version of macOS. It might be possible to use a version of gcc built using Homebrew or MacPorts, but this is untested and unsupported. If on OS X 10.11 or macOS 10.12 and later and using the system python, you will have to use a virtualenv to use the python module. This is due to a restriction of the new SIP feature about loading libraries. It appears that on some versions, /usr/local is not in the default compiler paths so you might need to add ``-L /usr/local/lib -I /usr/local/include`` to the ``setup.py build`` command or export the paths like for a custom path install. Windows-specific instructions ----------------------------- If you are not comfortable with the command line, you can use the cmake-gui application to perform the config phase. It will generate a Visual Studio solution file for the version installed. To build the project open this file (.sln) and run the "Build All" command after selecting the appropriate build type. If you prefer a command-line approach, cmake is available as a console program with the same options as the Unix variant. You can select the nmake builder by passing ``-G "NMake Makefiles"`` to cmake. There is no standard install location on Windows, but you can specify a custom location by passing ``-DCMAKE_INSTALL_PREFIX=%LIBDIR%`` to cmake. You can then install using ``cmake --build . --target install`` after ``nmake``. If you don't have Visual Studio installed, you can get the free `Visual Studio Community edition `_, which has compilation tools for python 3.5 and up. For python 2.7, install `Microsoft Visual C++ Compiler for Python 2.7 `_. .. warning:: While you may get the library to compile using cygwin, this is not recommended nor supported. Running Tests ------------- .. warning:: In its current state, the C test suite is woefully incomplete. It will test very basic functionality, but nothing else. 
It is strongly recommended to run the python test suite to ensure everything is ok even if you intend on just using the C library. To run the C tests, enter the build directory (the one where you ran cmake), select a target device by exporting DEVICE (or GPUARRAY_TEST_DEVICE) and run 'make test'. If you get an error message similar to this one: :: Running tests... Test project /Users/anakha/ext/gpuarray/Debug No tests were found!!! This means either you don't have check installed or it wasn't found by the cmake detection script. To run the python tests, install pygpu, then **move outside** its directory and run this command: :: DEVICE="" python -c "import pygpu;pygpu.test()" See the documentation for :py:meth:`pygpu.gpuarray.init` for more details on the syntax of the device name. The test script prints the device name of the chosen device so that you can confirm which device it is running on. .. note:: AMD GPUs tend to have really uninformative names, generally being only the codename of the architecture the GPU belongs to (e.g. 'Tahiti'). .. _cmake: https://cmake.org/ .. _clblas: https://github.com/clMathLibraries/clBLAS .. _clblast: https://github.com/CNugteren/CLBlast .. _cuda: https://developer.nvidia.com/category/zone/cuda-zone .. _nccl: https://github.com/NVIDIA/nccl .. _check: http://check.sourceforge.net/ .. _python: https://python.org/ .. _cython: http://cython.org/ .. _nosetests: https://nose.readthedocs.org/en/latest/ .. _mako: http://www.makotemplates.org/ libgpuarray-0.7.6/doc/make.bat000066400000000000000000000117541326743622600162420ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=_build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . set I18NSPHINXOPTS=%SPHINXOPTS% . if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. 
The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\gpuarray.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\gpuarray.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) :end libgpuarray-0.7.6/doc/pyapi/000077500000000000000000000000001326743622600157475ustar00rootroot00000000000000libgpuarray-0.7.6/doc/pyapi/pygpu.rst000066400000000000000000000017451326743622600176540ustar00rootroot00000000000000pygpu package ============= pygpu.gpuarray module --------------------- .. automodule:: pygpu.gpuarray :members: :undoc-members: pygpu.elemwise module --------------------- .. automodule:: pygpu.elemwise :members: :undoc-members: pygpu.operations module ----------------------- .. automodule:: pygpu.operations :members: :undoc-members: pygpu.reduction module ---------------------- .. 
automodule:: pygpu.reduction :members: :undoc-members: pygpu.blas module ----------------- .. automodule:: pygpu.blas :members: :undoc-members: pygpu.collectives module ------------------------ .. automodule:: pygpu.collectives :members: :undoc-members: pygpu.dtypes module ------------------- .. automodule:: pygpu.dtypes :members: :undoc-members: pygpu.tools module ------------------ .. automodule:: pygpu.tools :members: :undoc-members: Module contents --------------- .. automodule:: pygpu :members: :undoc-members: libgpuarray-0.7.6/doc/why.rst000066400000000000000000000041721326743622600161720ustar00rootroot00000000000000Goal ==== Make a common GPU ndarray(n dimensions array) that can be reused by all projects that is as future proof as possible, while keeping it easy to use for simple need/quick test. Motivation ---------- * Currently there are at least 6 different GPU arrays in python * CudaNdarray(Theano), GPUArray(pycuda), CUDAMatrix(cudamat), GPUArray(pyopencl), Clyther, Copperhead, ... * There are even more if we include other languages. * They are incompatible * None have the same properties and interface * All of them are a subset of NumPy.ndarray on the GPU! Design Goals ------------ * Have a n dimensional array. * Otherwise, not all project can reuse it. And you never know when you will need more dimensions. * Support many data types (int, float, double). * Otherwise, we are limited in what we can do with it. * Support strided view, c and f memory layout * This lowers memory usage and memory copies. A scarce resource on GPU. * You never know which memory layout is the best for your future need. * Be compatible with CUDA and OpenCL * You never know the future. Also, this make it possible to support other future language. * Make it easy to support just a subset of the feature. * If you just want to test something that support only CUDA and c contiguous matrices, it will stay easy as without libgpuarray. * There is functionality to make the same code work and compile with both CUDA and OpenCL. You don't need to use them. * Have the base object in C to allow collaboration with more projects. * We want people from C, C++, ruby, R, ... all use the same base GPU ndarray. * Have a python binding separate from the c code. * Support mixed back-end OpenCL/CUDA in the same binary. * But still keep it easy to use only one. * This would allow an easier transition to a new platform if the need come. * Support dynamic compilation * This allow optimization at run time based on the shapes for example. * You don't need to use this. In the end, we need a NumPy ndarray on the GPU! There is a restriction that does not allow us to reuse that object directly, but you will find it very similar. libgpuarray-0.7.6/make.bat000077500000000000000000000004251326743622600154710ustar00rootroot00000000000000REM This helps repetitive builds on windows REM It needs the compiler you want to use to be available in the shell REM and it will build a release version del bld mkdir bld cd bld cmake .. -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release cmake --build . --config Release cd .. libgpuarray-0.7.6/pygpu/000077500000000000000000000000001326743622600152245ustar00rootroot00000000000000libgpuarray-0.7.6/pygpu/__init__.py000066400000000000000000000014301326743622600173330ustar00rootroot00000000000000def get_include(): import os.path p = os.path.dirname(__file__) assert os.path.exists(os.path.join(p, 'gpuarray_api.h')) return p from . 
import gpuarray, elemwise, reduction from .gpuarray import (init, set_default_context, get_default_context, array, zeros, empty, asarray, ascontiguousarray, asfortranarray, register_dtype) from .operations import (split, array_split, hsplit, vsplit, dsplit, concatenate, hstack, vstack, dstack) from ._array import ndgpuarray from ._version import get_versions __version__ = get_versions()['version'] del get_versions def test(): from . import tests from .tests import main if hasattr(main, "NoseTester"): main.NoseTester(package=tests).test() libgpuarray-0.7.6/pygpu/_array.py000066400000000000000000000241201326743622600170520ustar00rootroot00000000000000from __future__ import division import numpy as np from .elemwise import elemwise1, elemwise2, ielemwise2, compare, arg, GpuElemwise, as_argument from .reduction import reduce1 from .dtypes import dtype_to_ctype, get_np_obj, get_common_dtype from . import gpuarray class ndgpuarray(gpuarray.GpuArray): """ Extension class for gpuarray.GpuArray to add numpy mathematical operations between arrays. These operations are all performed on the GPU but this is not the most efficient way since it will involve the creation of temporaries (just like numpy) for all intermediate results. This class may help transition code from numpy to pygpu by acting more like a drop-in replacement for numpy.ndarray than the raw GpuArray class. """ # add def __add__(self, other): return elemwise2(self, '+', other, self, broadcast=True) def __radd__(self, other): return elemwise2(other, '+', self, self, broadcast=True) def __iadd__(self, other): return ielemwise2(self, '+', other, broadcast=True) # sub def __sub__(self, other): return elemwise2(self, '-', other, self, broadcast=True) def __rsub__(self, other): return elemwise2(other, '-', self, self, broadcast=True) def __isub__(self, other): return ielemwise2(self, '-', other, broadcast=True) # mul def __mul__(self, other): return elemwise2(self, '*', other, self, broadcast=True) def __rmul__(self, other): return elemwise2(other, '*', self, self, broadcast=True) def __imul__(self, other): return ielemwise2(self, '*', other, broadcast=True) # div def __div__(self, other): return elemwise2(self, '/', other, self, broadcast=True) def __rdiv__(self, other): return elemwise2(other, '/', self, self, broadcast=True) def __idiv__(self, other): return ielemwise2(self, '/', other, broadcast=True) # truediv def __truediv__(self, other): np1 = get_np_obj(self) np2 = get_np_obj(other) res = (np1.__truediv__(np2)).dtype return elemwise2(self, '/', other, self, odtype=res, broadcast=True) def __rtruediv__(self, other): np1 = get_np_obj(self) np2 = get_np_obj(other) res = (np2.__truediv__(np1)).dtype return elemwise2(other, '/', self, self, odtype=res, broadcast=True) def __itruediv__(self, other): np2 = get_np_obj(other) kw = {'broadcast': True} if self.dtype == np.float32 or np2.dtype == np.float32: kw['op_tmpl'] = "a = (float)a / (float)b" if self.dtype == np.float64 or np2.dtype == np.float64: kw['op_tmpl'] = "a = (double)a / (double)b" return ielemwise2(self, '/', other, **kw) # floordiv def __floordiv__(self, other): out_dtype = get_common_dtype(self, other, True) kw = {'broadcast': True} if out_dtype.kind == 'f': kw['op_tmpl'] = "res = floor((%(out_t)s)a / (%(out_t)s)b)" return elemwise2(self, '/', other, self, odtype=out_dtype, **kw) def __rfloordiv__(self, other): out_dtype = get_common_dtype(other, self, True) kw = {'broadcast': True} if out_dtype.kind == 'f': kw['op_tmpl'] = "res = floor((%(out_t)s)a / (%(out_t)s)b)" return 
elemwise2(other, '/', self, self, odtype=out_dtype, **kw) def __ifloordiv__(self, other): out_dtype = self.dtype kw = {'broadcast': True} if out_dtype == np.float32: kw['op_tmpl'] = "a = floor((float)a / (float)b)" if out_dtype == np.float64: kw['op_tmpl'] = "a = floor((double)a / (double)b)" return ielemwise2(self, '/', other, **kw) # mod def __mod__(self, other): out_dtype = get_common_dtype(self, other, True) kw = {'broadcast': True} if out_dtype.kind == 'f': kw['op_tmpl'] = "res = fmod((%(out_t)s)a, (%(out_t)s)b)" return elemwise2(self, '%', other, self, odtype=out_dtype, **kw) def __rmod__(self, other): out_dtype = get_common_dtype(other, self, True) kw = {'broadcast': True} if out_dtype.kind == 'f': kw['op_tmpl'] = "res = fmod((%(out_t)s)a, (%(out_t)s)b)" return elemwise2(other, '%', self, self, odtype=out_dtype, **kw) def __imod__(self, other): out_dtype = get_common_dtype(self, other, self.dtype == np.float64) kw = {'broadcast': True} if out_dtype == np.float32: kw['op_tmpl'] = "a = fmod((float)a, (float)b)" if out_dtype == np.float64: kw['op_tmpl'] = "a = fmod((double)a, (double)b)" return ielemwise2(self, '%', other, **kw) # divmod def __divmod__(self, other): if not isinstance(other, gpuarray.GpuArray): other = np.asarray(other) odtype = get_common_dtype(self, other, True) a_arg = as_argument(self, 'a', read=True) b_arg = as_argument(other, 'b', read=True) args = [arg('div', odtype, write=True), arg('mod', odtype, write=True), a_arg, b_arg] div = self._empty_like_me(dtype=odtype) mod = self._empty_like_me(dtype=odtype) if odtype.kind == 'f': tmpl = ("div = floor((%(out_t)s)a / (%(out_t)s)b)," "mod = fmod((%(out_t)s)a, (%(out_t)s)b)") else: tmpl = ("div = (%(out_t)s)a / (%(out_t)s)b," "mod = a %% b") ksrc = tmpl % {'out_t': dtype_to_ctype(odtype)} k = GpuElemwise(self.context, ksrc, args) k(div, mod, self, other, broadcast=True) return (div, mod) def __rdivmod__(self, other): if not isinstance(other, gpuarray.GpuArray): other = np.asarray(other) odtype = get_common_dtype(other, self, True) a_arg = as_argument(other, 'a', read=True) b_arg = as_argument(self, 'b', read=True) args = [arg('div', odtype, write=True), arg('mod', odtype, write=True), a_arg, b_arg] div = self._empty_like_me(dtype=odtype) mod = self._empty_like_me(dtype=odtype) if odtype.kind == 'f': tmpl = ("div = floor((%(out_t)s)a / (%(out_t)s)b)," "mod = fmod((%(out_t)s)a, (%(out_t)s)b)") else: tmpl = ("div = (%(out_t)s)a / (%(out_t)s)b," "mod = a %% b") ksrc = tmpl % {'out_t': dtype_to_ctype(odtype)} k = GpuElemwise(self.context, ksrc, args) k(div, mod, other, self, broadcast=True) return (div, mod) def __neg__(self): return elemwise1(self, '-') def __pos__(self): return elemwise1(self, '+') def __abs__(self): if self.dtype.kind == 'u': return self.copy() if self.dtype.kind == 'f': oper = "res = fabs(a)" elif self.dtype.itemsize < 4: # cuda 5.5 finds the c++ stdlib definition if we don't cast here. 
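# Casting the smaller integer types up to int keeps abs() resolving to the
# integer overload instead of the C++ stdlib definition mentioned above.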
oper = "res = abs((int)a)" else: oper = "res = abs(a)" return elemwise1(self, None, oper=oper) # richcmp def __lt__(self, other): return compare(self, '<', other, broadcast=True) def __le__(self, other): return compare(self, '<=', other, broadcast=True) def __eq__(self, other): return compare(self, '==', other, broadcast=True) def __ne__(self, other): return compare(self, '!=', other, broadcast=True) def __ge__(self, other): return compare(self, '>=', other, broadcast=True) def __gt__(self, other): return compare(self, '>', other, broadcast=True) # misc other things @property def T(self): if self.ndim < 2: return self return self.transpose() """ Since these functions are untested (thus probably wrong), we disable them. def clip(self, a_min, a_max, out=None): oper=('res = a > %(max)s ? %(max)s : ' '(a < %(min)s ? %(min)s : a)' % dict(min=a_min, max=a_max)) return elemwise1(self, '', oper=oper, out=out) def fill(self, value): self[...] = value """ # reductions def all(self, axis=None, out=None): if self.ndim == 0: return self.copy() return reduce1(self, '&&', '1', np.dtype('bool'), axis=axis, out=out) def any(self, axis=None, out=None): if self.ndim == 0: return self.copy() return reduce1(self, '||', '0', np.dtype('bool'), axis=axis, out=out) def prod(self, axis=None, dtype=None, out=None): if dtype is None: dtype = self.dtype # we only upcast integers that are smaller than the plaform default if dtype.kind == 'i': di = np.dtype('int') if di.itemsize > dtype.itemsize: dtype = di if dtype.kind == 'u': di = np.dtype('uint') if di.itemsize > dtype.itemsize: dtype = di return reduce1(self, '*', '1', dtype, axis=axis, out=out) # def max(self, axis=None, out=None); # nd = self.ndim # if nd == 0: # return self.copy() # idx = (0,) * nd # n = str(self.__getitem__(idx).__array__()) # return reduce1(self, '', n, self.dtype, axis=axis, out=out, # oper='max(a, b)') # def min(self, axis=None, out=None): # nd = self.ndim # if nd == 0: # return self.copy() # idx = (0,) * nd # n = str(self.__getitem__(idx).__array__()) # return reduce1(self, '', n, self.dtype, axis=axis, out=out, # oper='min(a, b)') def sum(self, axis=None, dtype=None, out=None): if dtype is None: dtype = self.dtype # we only upcast integers that are smaller than the plaform default if dtype.kind == 'i': di = np.dtype('int') if di.itemsize > dtype.itemsize: dtype = di if dtype.kind == 'u': di = np.dtype('uint') if di.itemsize > dtype.itemsize: dtype = di return reduce1(self, '+', '0', dtype, axis=axis, out=out) libgpuarray-0.7.6/pygpu/_elemwise.pyx000066400000000000000000000157021326743622600177440ustar00rootroot00000000000000from pygpu.gpuarray import GpuArrayException from pygpu.gpuarray cimport (gpucontext, GA_NO_ERROR, get_typecode, typecode_to_dtype, GpuContext, GpuArray, get_exc, gpuarray_get_elsize) from pygpu.gpuarray cimport (GA_BUFFER, GA_SIZE, GA_SSIZE, GA_ULONG, GA_LONG, GA_UINT, GA_INT, GA_USHORT, GA_SHORT, GA_UBYTE, GA_BYTE, GA_DOUBLE, GA_FLOAT) from libc.string cimport memset, memcpy, strdup from libc.stdlib cimport malloc, calloc, free cdef bytes to_bytes(s): if isinstance(s, bytes): return s if isinstance(s, unicode): return (s).encode('ascii') raise TypeError("Can't convert to bytes") cdef extern from "gpuarray/elemwise.h": ctypedef struct _GpuElemwise "GpuElemwise": pass ctypedef struct gpuelemwise_arg: const char *name int typecode int flags cdef int GE_SCALAR cdef int GE_READ cdef int GE_WRITE _GpuElemwise *GpuElemwise_new(gpucontext *ctx, const char *preamble, const char *expr, unsigned int n, gpuelemwise_arg *args, 
unsigned int nd, int flags) void GpuElemwise_free(_GpuElemwise *ge) int GpuElemwise_call(_GpuElemwise *ge, void **args, int flags) cdef int GE_NOADDR64 cdef int GE_CONVERT_F16 cdef int GE_BROADCAST cdef int GE_NOCOLLAPSE cdef int GE_PADSHAPE cdef class arg: cdef gpuelemwise_arg a def __cinit__(self): memset(&self.a, 0, sizeof(gpuelemwise_arg)) def __init__(self, name, type, read=False, write=False, scalar=False): # Make sure to clear previous storage # __init__ may be called more than once free(self.a.name) self.a.name = strdup(to_bytes(name)) if self.a.name is NULL: raise MemoryError self.a.typecode = get_typecode(type) self.a.flags = 0 if read: self.a.flags |= GE_READ if write: self.a.flags |= GE_WRITE if scalar: self.a.flags |= GE_SCALAR if self.a.flags == 0: raise ValueError('no flags specified for arg %s' % (name,)) def __dealloc__(self): free(self.a.name) property name: def __get__(self): return self.a.name.decode('ascii') property type: def __get__(self): return typecode_to_dtype(self.a.typecode) property read: def __get__(self): return self.a.flags & GE_READ property write: def __get__(self): return self.a.flags & GE_WRITE property scalar: def __get__(self): return self.a.flags & GE_SCALAR cdef class GpuElemwise: cdef _GpuElemwise *ge cdef int *types cdef void **callbuf cdef unsigned int n def __cinit__(self, GpuContext ctx, expr, args, unsigned int nd=0, preamble=b"", bint convert_f16=False): cdef gpuelemwise_arg *_args; cdef unsigned int i cdef arg aa self.ge = NULL self.types = NULL self.callbuf = NULL preamble = to_bytes(preamble) expr = to_bytes(expr) self.n = len(args) self.types = calloc(self.n, sizeof(int)) if self.types is NULL: raise MemoryError self.callbuf = calloc(self.n, sizeof(void *)) if self.callbuf == NULL: raise MemoryError _args = calloc(self.n, sizeof(gpuelemwise_arg)); if _args is NULL: raise MemoryError try: for i in range(self.n): if not isinstance(args[i], arg): raise TypeError("args must be an iterable of arg") aa = args[i] memcpy(&_args[i], &aa.a, sizeof(gpuelemwise_arg)) if aa.a.flags & GE_SCALAR: self.types[i] = aa.a.typecode self.callbuf[i] = malloc(gpuarray_get_elsize(aa.a.typecode)) if self.callbuf[i] is NULL: raise MemoryError else: self.types[i] = GA_BUFFER self.ge = GpuElemwise_new(ctx.ctx, preamble, expr, self.n, _args, nd, GE_CONVERT_F16 if convert_f16 else 0) finally: free(_args) if self.ge is NULL: raise GpuArrayException("Could not initialize C GpuElemwise instance") def __dealloc__(self): cdef unsigned int i if self.ge is not NULL: GpuElemwise_free(self.ge) self.ge = NULL for i in range(self.n): if self.types[i] != GA_BUFFER: free(self.callbuf[i]) free(self.callbuf) free(self.types) cdef _setarg(self, unsigned int index, object o): cdef int typecode typecode = self.types[index] if typecode == GA_BUFFER: if not isinstance(o, GpuArray): raise TypeError, "expected a GpuArray" self.callbuf[index] = &(o).ga elif typecode == GA_SIZE: (self.callbuf[index])[0] = o elif typecode == GA_SSIZE: (self.callbuf[index])[0] = o elif typecode == GA_FLOAT: (self.callbuf[index])[0] = o elif typecode == GA_DOUBLE: (self.callbuf[index])[0] = o elif typecode == GA_BYTE: (self.callbuf[index])[0] = o elif typecode == GA_UBYTE: (self.callbuf[index])[0] = o elif typecode == GA_SHORT: (self.callbuf[index])[0] = o elif typecode == GA_USHORT: (self.callbuf[index])[0] = o elif typecode == GA_INT: (self.callbuf[index])[0] = o elif typecode == GA_UINT: (self.callbuf[index])[0] = o elif typecode == GA_LONG: (self.callbuf[index])[0] = o elif typecode == GA_ULONG: 
(self.callbuf[index])[0] = o else: raise ValueError("Bad typecode in _setarg: %d " "(please report this, it is a bug)" % (typecode,)) def __call__(self, *args, **kwargs): cdef unsigned int i cdef int err cdef int flags flags = 0 if kwargs.pop('broadcast', True): flags |= GE_BROADCAST if kwargs.pop('padshape', True): flags |= GE_PADSHAPE if len(kwargs) != 0: raise TypeError("Unknown keyword argument: %s" % list(kwargs.keys())[0]) for i, arg in enumerate(args): self._setarg(i, arg) err = GpuElemwise_call(self.ge, self.callbuf, flags) if err != GA_NO_ERROR: raise get_exc(err)("Could not call GpuElemwise") libgpuarray-0.7.6/pygpu/_version.py000066400000000000000000000441041326743622600174250ustar00rootroot00000000000000 # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. Generated by # versioneer-0.18 (https://github.com/warner/python-versioneer) """Git implementation of _version.py.""" import errno import os import re import subprocess import sys def get_keywords(): """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). git_refnames = " (tag: v0.7.6)" git_full = "f036aef3a425560161de362f390d238f4e7c1721" git_date = "2018-04-23 16:05:42 -0400" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_config(): """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "pep440" cfg.tag_prefix = "v" cfg.parentdir_prefix = "libgpuarray-" cfg.versionfile_source = "pygpu/_version.py" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY = {} HANDLERS = {} def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = subprocess.Popen([c] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None)) break except EnvironmentError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None, None stdout = p.communicate()[0].strip() if sys.version_info[0] >= 3: stdout = stdout.decode() if p.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) return None, 
p.returncode return stdout, p.returncode def versions_from_parentdir(parentdir_prefix, root, verbose): """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %s but none started with prefix %s" % (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords = {} try: f = open(versionfile_abs, "r") for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) f.close() except EnvironmentError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") date = keywords.get("date") if date is not None: # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = set([r.strip() for r in refnames.strip("()").split(",")]) # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". 
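# e.g. from refs {'v0.7.6', 'HEAD', 'master'} only 'v0.7.6' survives the
# digit filter below.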
tags = set([r for r in refs if re.search(r'\d', r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] if verbose: print("picking %s" % r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%s*" % tag_prefix], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
pieces["error"] = ("unable to parse git-describe output: '%s'" % describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" % (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def plus_or_dot(pieces): """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_pre(pieces): """TAG[.post.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 0.post.devDISTANCE """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += ".post.dev%d" % pieces["distance"] else: # exception #1 rendered = "0.post.dev%d" % pieces["distance"] return rendered def render_pep440_post(pieces): """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%s" % pieces["short"] else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%s" % pieces["short"] return rendered def render_pep440_old(pieces): """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Eexceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces): """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces): """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%s'" % style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} def get_versions(): """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which # case we can only use expanded keywords. cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
for i in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", "date": None} try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) return render(pieces, cfg.style) except NotThisMethod: pass try: if cfg.parentdir_prefix: return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) except NotThisMethod: pass return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None} libgpuarray-0.7.6/pygpu/basic.py000066400000000000000000000035701326743622600166640ustar00rootroot00000000000000from string import Template from .gpuarray import GpuArray, GpuKernel, SIZE def _generate_kernel(ctx, cols, upper=True): tmpl = Template(""" #include "cluda.h" KERNEL void extract_tri(GLOBAL_MEM ga_float *a, ga_size a_off, ga_uint N) { a = (GLOBAL_MEM ga_float *)(((GLOBAL_MEM char *)a) + a_off); unsigned int idx = GID_1 * LDIM_0 * GDIM_0 + GID_0 * LDIM_0 + LID_0; unsigned int ix = idx/${cols}; unsigned int iy = idx%${cols}; if (idx < N) { if (ix ${le} iy) a[idx] = 0.0; } } """) if upper: le = '>' else: le = '<' src = tmpl.substitute(cols=cols, le=le) spec = [GpuArray, SIZE, 'uint32'] k = GpuKernel(src, "extract_tri", spec, context=ctx) return k def triu(A, inplace=True): if A.ndim != 2: raise ValueError("triu only works for 2d arrays") if A.flags.c_contiguous is A.flags.f_contiguous is False: raise ValueError("triu only works for contiguous arrays") if not inplace: A = A.copy() if A.flags['F_CONTIGUOUS']: upper = False cols = A.shape[0] else: upper = True cols = A.shape[1] k = _generate_kernel(A.context, cols, upper) k(A, A.offset, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) return A def tril(A, inplace=True): if A.ndim != 2: raise ValueError("tril only works for 2d arrays") if A.flags.c_contiguous is A.flags.f_contiguous is False: raise ValueError("tril only works for contiguous arrays") if not inplace: A = A.copy() if A.flags['F_CONTIGUOUS']: upper = True cols = A.shape[0] else: upper = False cols = A.shape[1] k = _generate_kernel(A.context, cols, upper) k(A, A.offset, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) return A libgpuarray-0.7.6/pygpu/blas.pyx000066400000000000000000000161031326743622600167100ustar00rootroot00000000000000from pygpu.gpuarray import GpuArrayException from pygpu.gpuarray cimport (_GpuArray, GpuArray, GA_NO_ERROR, GpuArray_error, pygpu_copy, pygpu_empty, pygpu_zeros, GA_ANY_ORDER, GA_F_ORDER, GpuArray_ISONESEGMENT) cdef extern from "gpuarray/buffer_blas.h": ctypedef enum cb_transpose: cb_no_trans, cb_trans, cb_conj_trans cdef extern from "gpuarray/blas.h": int GpuArray_rdot(_GpuArray *X, _GpuArray *Y, _GpuArray *Z, int nocopy) int GpuArray_rgemv(cb_transpose transA, double alpha, _GpuArray *A, _GpuArray *X, double beta, _GpuArray *Y, int nocopy) int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, _GpuArray *A, _GpuArray *B, double beta, _GpuArray *C, int nocopy) int GpuArray_rger(double alpha, _GpuArray *X, _GpuArray *Y, _GpuArray *A, int nocopy) int GpuArray_rgemmBatch_3d( cb_transpose transA, cb_transpose transB, double alpha, _GpuArray *A, _GpuArray *B, double beta, _GpuArray *C, int nocopy) cdef api int pygpu_blas_rdot(GpuArray X, GpuArray Y, GpuArray Z, bint nocopy) except -1: cdef int err err = GpuArray_rdot(&X.ga, &Y.ga, &Z.ga, nocopy) if err != GA_NO_ERROR: raise GpuArrayException(GpuArray_error(&X.ga, err), err) return 0 
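# ---------------------------------------------------------------------------
# A minimal usage sketch for the triangular helpers in pygpu/basic.py above
# and the Python-level BLAS wrappers (dot, gemv, gemm) defined further down
# in this file.  It assumes a standard pygpu install: the top-level
# pygpu.init and pygpu.gpuarray.array helpers, numpy round-tripping via
# np.asarray and an available 'cuda0' device are assumptions here, and the
# test data is made up.
import numpy as np
import pygpu
from pygpu import basic, blas

ctx = pygpu.init('cuda0')                    # or e.g. 'opencl0:0'
pygpu.gpuarray.set_default_context(ctx)

A = pygpu.gpuarray.array(np.random.rand(4, 4).astype('float32'),
                         context=ctx)
U = basic.triu(A, inplace=False)             # zero the strictly lower part
Lo = basic.tril(A, inplace=False)            # zero the strictly upper part

x = pygpu.gpuarray.array(np.ones(4, dtype='float32'), context=ctx)
y = blas.gemv(1.0, A, x)                     # y = A @ x
C = blas.gemm(1.0, A, A, 0.0, trans_b=True)  # C = A @ A.T
s = blas.dot(x, y)                           # 0-d GpuArray holding x . y

print(np.asarray(C))                         # copy the result back to host
# ---------------------------------------------------------------------------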
cdef api int pygpu_blas_rgemv(cb_transpose transA, double alpha, GpuArray A, GpuArray X, double beta, GpuArray Y, bint nocopy) except -1: cdef int err err = GpuArray_rgemv(transA, alpha, &A.ga, &X.ga, beta, &Y.ga, nocopy); if err != GA_NO_ERROR: raise GpuArrayException(GpuArray_error(&A.ga, err), err) return 0 cdef api int pygpu_blas_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuArray A, GpuArray B, double beta, GpuArray C, bint nocopy) except -1: cdef int err err = GpuArray_rgemm(transA, transB, alpha, &A.ga, &B.ga, beta, &C.ga, nocopy); if err != GA_NO_ERROR: raise GpuArrayException(GpuArray_error(&A.ga, err), err) return 0 cdef api int pygpu_blas_rger(double alpha, GpuArray X, GpuArray Y, GpuArray A, bint nocopy) except -1: cdef int err err = GpuArray_rger(alpha, &X.ga, &Y.ga, &A.ga, nocopy); if err != GA_NO_ERROR: raise GpuArrayException(GpuArray_error(&X.ga, err), err) return 0 cdef api int pygpu_blas_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alpha, GpuArray A, GpuArray B, double beta, GpuArray C, bint nocopy) except -1: cdef int err err = GpuArray_rgemmBatch_3d(transA, transB, alpha, &A.ga, &B.ga, beta, &C.ga, nocopy) if err != GA_NO_ERROR: raise GpuArrayException(GpuArray_error(&A.ga, err), err) return 0 def dot(GpuArray X, GpuArray Y, GpuArray Z=None, overwrite_z=False): """dot(X, Y, Z=None, overwrite_z=False) """ if Z is None: Z = pygpu_empty(0, NULL, X.typecode, GA_ANY_ORDER, X.context, None) overwrite_z = True if not overwrite_z: Z = pygpu_copy(Z, GA_ANY_ORDER) pygpu_blas_rdot(X, Y, Z, 0) return Z def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, GpuArray Y=None, trans_a=False, overwrite_y=False): """gemv(alpha, A, X, beta=0.0, Y=None, trans_a=False, overwrite_y=False) """ cdef cb_transpose transA cdef size_t Yshp if trans_a: transA = cb_trans else: transA = cb_no_trans if A.ga.nd != 2: raise TypeError("A is not a matrix") if transA == cb_no_trans: Yshp = A.ga.dimensions[0] else: Yshp = A.ga.dimensions[1] if Y is None: if beta != 0.0: raise ValueError("Y not provided and beta != 0") Y = pygpu_empty(1, &Yshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) overwrite_y = True if not overwrite_y: Y = pygpu_copy(Y, GA_ANY_ORDER) pygpu_blas_rgemv(transA, alpha, A, X, beta, Y, 0) return Y def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None, trans_a=False, trans_b=False, overwrite_c=False): """gemm(alpha, A, B, beta, C=None, trans_a=False, trans_b=False, overwrite_c=False) """ cdef cb_transpose transA cdef cb_transpose transB cdef size_t[2] Cshp if trans_a: transA = cb_trans else: transA = cb_no_trans if trans_b: transB = cb_trans else: transB = cb_no_trans if A.ga.nd != 2: raise TypeError("A is not a matrix") if B.ga.nd != 2: raise TypeError("B is not a matrix") if transA == cb_no_trans: Cshp[0] = A.ga.dimensions[0] else: Cshp[0] = A.ga.dimensions[1] if transB == cb_no_trans: Cshp[1] = B.ga.dimensions[1] else: Cshp[1] = B.ga.dimensions[0] if C is None: if beta != 0.0: raise ValueError("C not provided and beta != 0") C = pygpu_empty(2, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) overwrite_c = True if not overwrite_c: C = pygpu_copy(C, GA_ANY_ORDER) pygpu_blas_rgemm(transA, transB, alpha, A, B, beta, C, 0) return C def ger(double alpha, GpuArray X, GpuArray Y, GpuArray A=None, overwrite_a=False): """ger(alpha, X, Y, A=None, overwrite_a=False) """ cdef size_t[2] Ashp if A is None: Ashp[0] = X.ga.dimensions[0]; Ashp[1] = Y.ga.dimensions[0]; A = pygpu_zeros(2, Ashp, X.ga.typecode, GA_ANY_ORDER, X.context, 
None) overwrite_a = True if not overwrite_a: A = pygpu_copy(A, GA_ANY_ORDER) pygpu_blas_rger(alpha, X, Y, A, 0) return A def gemmBatch_3d(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None, trans_a=False, trans_b=False, overwrite_c=False): """gemmBatch_3d(alpha, A, B, beta, C=None, trans_a=False, trans_b=False, overwrite_c=False) """ cdef cb_transpose transA cdef cb_transpose transB cdef size_t[3] Cshp if trans_a: transA = cb_trans else: transA = cb_no_trans if trans_b: transB = cb_trans else: transB = cb_no_trans if A.ga.nd != 3: raise TypeError("A is not a batch of matrices") if B.ga.nd != 3: raise TypeError("B is not a batch of matrices") Cshp[0] = A.ga.dimensions[0] if transA == cb_no_trans: Cshp[1] = A.ga.dimensions[1] else: Cshp[1] = A.ga.dimensions[2] if transB == cb_no_trans: Cshp[2] = B.ga.dimensions[2] else: Cshp[2] = B.ga.dimensions[1] if C is None: if beta != 0.0: raise ValueError("C not provided and beta != 0") C = pygpu_empty(3, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) elif not overwrite_c: C = pygpu_copy(C, GA_ANY_ORDER) pygpu_blas_rgemmBatch_3d(transA, transB, alpha, A, B, beta, C, 0) return C libgpuarray-0.7.6/pygpu/collectives.pxd000066400000000000000000000056731326743622600202700ustar00rootroot00000000000000from pygpu.gpuarray cimport (gpucontext, GpuContext, _GpuArray, GpuArray) cdef extern from "gpuarray/buffer_collectives.h": ctypedef struct gpucomm: pass enum gpucomm_reduce_ops: GA_SUM, GA_PROD, GA_MAX, GA_MIN enum: GA_COMM_ID_BYTES ctypedef struct gpucommCliqueId: char[GA_COMM_ID_BYTES] internal int gpucomm_new(gpucomm** comm, gpucontext* ctx, gpucommCliqueId comm_id, int ndev, int rank) void gpucomm_free(gpucomm* comm) gpucontext* gpucomm_context(gpucomm* comm) int gpucomm_gen_clique_id(gpucontext* ctx, gpucommCliqueId* comm_id) int gpucomm_get_count(gpucomm* comm, int* gpucount) int gpucomm_get_rank(gpucomm* comm, int* rank) cdef extern from "gpuarray/collectives.h" nogil: int GpuArray_reduce_from(const _GpuArray* src, int opcode, int root, gpucomm* comm) int GpuArray_reduce(const _GpuArray* src, _GpuArray* dest, int opcode, int root, gpucomm* comm) int GpuArray_all_reduce(const _GpuArray* src, _GpuArray* dest, int opcode, gpucomm* comm) int GpuArray_reduce_scatter(const _GpuArray* src, _GpuArray* dest, int opcode, gpucomm* comm) int GpuArray_broadcast(_GpuArray* array, int root, gpucomm* comm) int GpuArray_all_gather(const _GpuArray* src, _GpuArray* dest, gpucomm* comm) cdef api class GpuCommCliqueId [type PyGpuCliqueIdType, object PyGpuCliqueIdObject]: cdef gpucommCliqueId c_comm_id cdef readonly GpuContext context cdef api class GpuComm [type PyGpuCommType, object PyGpuCommObject]: cdef gpucomm* c cdef object __weakref__ cdef int to_reduce_opcode(op) except -1 cdef gpucontext* comm_context(GpuComm comm) except NULL cdef int comm_generate_id(gpucontext* ctx, gpucommCliqueId* comm_id) except -1 cdef int comm_get_count(GpuComm comm, int* gpucount) except -1 cdef int comm_get_rank(GpuComm comm, int* gpurank) except -1 cdef int comm_reduce_from(GpuComm comm, GpuArray src, int opcode, int root) except -1 cdef int comm_reduce(GpuComm comm, GpuArray src, GpuArray dest, int opcode, int root) except -1 cdef int comm_all_reduce(GpuComm comm, GpuArray src, GpuArray dest, int opcode) except -1 cdef int comm_reduce_scatter(GpuComm comm, GpuArray src, GpuArray dest, int opcode) except -1 cdef int comm_broadcast(GpuComm comm, GpuArray arr, int root) except -1 cdef int comm_all_gather(GpuComm comm, GpuArray src, GpuArray dest) except -1 cdef api: 
GpuArray pygpu_make_reduced(GpuComm comm, GpuArray src, int opcode) GpuArray pygpu_make_all_reduced(GpuComm comm, GpuArray src, int opcode) GpuArray pygpu_make_reduce_scattered(GpuComm comm, GpuArray src, int opcode) GpuArray pygpu_make_all_gathered(GpuComm comm, GpuArray src, unsigned int nd_up) libgpuarray-0.7.6/pygpu/collectives.pyx000066400000000000000000000401371326743622600203070ustar00rootroot00000000000000from libc.stdlib cimport malloc, calloc, free from libc.string cimport memcmp from cpython cimport Py_buffer, Py_INCREF, Py_DECREF from cpython.buffer cimport PyBUF_FORMAT, PyBUF_ND, PyBUF_STRIDES from pygpu.gpuarray cimport (gpucontext, GpuContext, _GpuArray, GpuArray, ensure_context, GA_NO_ERROR, get_exc, gpucontext_error, GpuArray_IS_C_CONTIGUOUS, GA_C_ORDER, GA_F_ORDER, GA_ANY_ORDER, pygpu_empty_like, pygpu_empty, memcpy) from pygpu.gpuarray import GpuArrayException COMM_ID_BYTES = GA_COMM_ID_BYTES cdef class GpuCommCliqueId: """GpuCommCliqueId(context=None, comm_id=None) Represents a unique id shared among :class:`GpuComm` communicators which participate in a multi-gpu clique. Parameters ---------- context: GpuContext Reference to which gpu this GpuCommCliqueId object belongs. comm_id: bytes Existing unique id to be passed in this object. """ def __cinit__(self, GpuContext context=None, unsigned char[:] comm_id=None): self.context = ensure_context(context) if comm_id is None: comm_generate_id(self.context.ctx, &self.c_comm_id) def __init__(self, GpuContext context=None, unsigned char[:] comm_id=None): if comm_id is not None: self.comm_id = comm_id def __richcmp__(this, that, int op): if type(this) != type(that): raise TypeError, "Cannot compare %s with %s" % (type(this), type(that)) cdef int res cdef GpuCommCliqueId a a = this cdef GpuCommCliqueId b b = that res = memcmp(a.c_comm_id.internal, b.c_comm_id.internal, GA_COMM_ID_BYTES) if op == 0: return res < 0 elif op == 1: return res <= 0 elif op == 2: return res == 0 elif op == 3: return res != 0 elif op == 4: return res > 0 else: return res >= 0 def __hash__(self): return hash(self.__class__.__name__) ^ hash(self.c_comm_id.internal[:GA_COMM_ID_BYTES]) def __reduce__(self): raise RuntimeError, "Cannot pickle %s object" % self.__class__.__name__ property comm_id: "Unique clique id to be used by each :class:`GpuComm` in a group of devices" def __get__(self): cdef bytearray res res = self.c_comm_id.internal[:GA_COMM_ID_BYTES] return res def __set__(self, unsigned char[:] cid): cdef int length length = cid.shape[0] if length < GA_COMM_ID_BYTES: raise ValueError, "GpuComm clique id must have length %d bytes" % (GA_COMM_ID_BYTES) memcpy(self.c_comm_id.internal, &cid[0], GA_COMM_ID_BYTES) cdef class GpuComm: """GpuComm(cid, ndev, rank) Represents a communicator which participates in a multi-gpu clique. It is used to invoke collective operations to gpus inside its clique. Parameters ---------- cid: GpuCommCliqueId Unique id shared among participating communicators. ndev: int Number of communicators inside the clique. rank: int User-defined rank of this communicator inside the clique. It influences order of collective operations. 
""" def __dealloc__(self): gpucomm_free(self.c) def __cinit__(self, GpuCommCliqueId cid not None, int ndev, int rank): cdef int err err = gpucomm_new(&self.c, cid.context.ctx, cid.c_comm_id, ndev, rank) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(cid.context.ctx, err) def __reduce__(self): raise RuntimeError, "Cannot pickle %s object" % self.__class__.__name__ property count: "Total number of communicators inside the clique" def __get__(self): cdef int gpucount comm_get_count(self, &gpucount) return gpucount property rank: "User-defined rank of this communicator inside the clique" def __get__(self): cdef int gpurank comm_get_rank(self, &gpurank) return gpurank def reduce(self, GpuArray src not None, op, GpuArray dest=None, int root=-1): """ reduce(self, src, op, dest=None, root=-1) Reduce collective operation for ranks in a communicator world. Parameters ---------- src: GpuArray Array to be reduced. op: str Key indicating operation type. dest: GpuArray Array to collect reduce operation result. root: int Rank in GpuComm which will collect result. Notes ----- * `root` is necessary when invoking from a non-root rank. Root caller does not need to provide `root` argument. * Not providing `dest` argument for a root caller will result in creating a new compatible :class:`GpuArray` and returning result in it. """ cdef int srank if dest is None: if root != -1: comm_get_rank(self, &srank) if root == srank: return pygpu_make_reduced(self, src, to_reduce_opcode(op)) comm_reduce_from(self, src, to_reduce_opcode(op), root) return else: return pygpu_make_reduced(self, src, to_reduce_opcode(op)) if root == -1: comm_get_rank(self, &root) comm_reduce(self, src, dest, to_reduce_opcode(op), root) def all_reduce(self, GpuArray src not None, op, GpuArray dest=None): """ all_reduce(self, src, op, dest=None) AllReduce collective operation for ranks in a communicator world. Parameters ---------- src: GpuArray Array to be reduced. op: str Key indicating operation type. dest: GpuArray Array to collect reduce operation result. Notes ----- * Not providing `dest` argument for a caller will result in creating a new compatible :class:`GpuArray` and returning result in it. """ if dest is None: return pygpu_make_all_reduced(self, src, to_reduce_opcode(op)) comm_all_reduce(self, src, dest, to_reduce_opcode(op)) def reduce_scatter(self, GpuArray src not None, op, GpuArray dest=None): """ reduce_scatter(self, src, op, dest=None) ReduceScatter collective operation for ranks in a communicator world. Parameters ---------- src: GpuArray Array to be reduced. op: str Key indicating operation type. dest: GpuArray Array to collect reduce operation scattered result. Notes ----- * Not providing `dest` argument for a caller will result in creating a new compatible :class:`GpuArray` and returning result in it. """ if dest is None: return pygpu_make_reduce_scattered(self, src, to_reduce_opcode(op)) comm_reduce_scatter(self, src, dest, to_reduce_opcode(op)) def broadcast(self, GpuArray array not None, int root=-1): """ broadcast(self, array, root=-1) Broadcast collective operation for ranks in a communicator world. Parameters ---------- array: GpuArray Array to be reduced. root: int Rank in `GpuComm` which broadcasts its `array`. Notes ----- * `root` is necessary when invoking from a non-root rank. Root caller does not need to provide `root` argument. 
""" if root == -1: comm_get_rank(self, &root) comm_broadcast(self, array, root) def all_gather(self, GpuArray src not None, GpuArray dest=None, unsigned int nd_up=1): """ all_gather(self, src, dest=None, nd_up=1) AllGather collective operation for ranks in a communicator world. Parameters ---------- src: GpuArray Array to be gathered. dest: GpuArray Array to receive all gathered arrays from ranks in `GpuComm`. nd_up: int Used when creating result array. Indicates how many extra dimensions user wants result to have. Default is 1, which means that the result will store each rank's gathered array in one extra new dimension. Notes ----- * Providing `nd_up` == 0 means that gathered arrays will be appended to the dimension with the largest stride. """ if dest is None: return pygpu_make_all_gathered(self, src, nd_up) comm_all_gather(self, src, dest) cdef dict TO_RED_OP = { '+': GA_SUM, "sum": GA_SUM, "add": GA_SUM, '*': GA_PROD, "prod": GA_PROD, "product": GA_PROD, "mul": GA_PROD, "max": GA_MAX, "maximum": GA_MAX, "min": GA_MIN, "minimum": GA_MIN, } cdef int to_reduce_opcode(op) except -1: res = TO_RED_OP.get(op.lower()) if res is not None: return res raise ValueError, "Invalid reduce operation: %s" % (str(op)) cdef gpucontext* comm_context(GpuComm comm) except NULL: cdef gpucontext* res res = gpucomm_context(comm.c) if res is NULL: raise GpuArrayException, "Invalid communicator or destroyed context" return res cdef int comm_generate_id(gpucontext* ctx, gpucommCliqueId* comm_id) except -1: cdef int err err = gpucomm_gen_clique_id(ctx, comm_id) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(ctx, err) cdef int comm_get_count(GpuComm comm, int* gpucount) except -1: cdef int err err = gpucomm_get_count(comm.c, gpucount) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(comm_context(comm), err) cdef int comm_get_rank(GpuComm comm, int* gpurank) except -1: cdef int err err = gpucomm_get_rank(comm.c, gpurank) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(comm_context(comm), err) cdef int comm_reduce_from(GpuComm comm, GpuArray src, int opcode, int root) except -1: cdef int err err = GpuArray_reduce_from(&src.ga, opcode, root, comm.c) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(comm_context(comm), err) cdef int comm_reduce(GpuComm comm, GpuArray src, GpuArray dest, int opcode, int root) except -1: cdef int err err = GpuArray_reduce(&src.ga, &dest.ga, opcode, root, comm.c) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(comm_context(comm), err) cdef int comm_all_reduce(GpuComm comm, GpuArray src, GpuArray dest, int opcode) except -1: cdef int err err = GpuArray_all_reduce(&src.ga, &dest.ga, opcode, comm.c) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(comm_context(comm), err) cdef int comm_reduce_scatter(GpuComm comm, GpuArray src, GpuArray dest, int opcode) except -1: cdef int err err = GpuArray_reduce_scatter(&src.ga, &dest.ga, opcode, comm.c) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(comm_context(comm), err) cdef int comm_broadcast(GpuComm comm, GpuArray arr, int root) except -1: cdef int err err = GpuArray_broadcast(&arr.ga, root, comm.c) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(comm_context(comm), err) cdef int comm_all_gather(GpuComm comm, GpuArray src, GpuArray dest) except -1: cdef int err err = GpuArray_all_gather(&src.ga, &dest.ga, comm.c) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(comm_context(comm), err) cdef api GpuArray pygpu_make_reduced(GpuComm comm, GpuArray 
src, int opcode): cdef GpuArray res res = pygpu_empty_like(src, GA_ANY_ORDER, -1) cdef int rank comm_get_rank(comm, &rank) comm_reduce(comm, src, res, opcode, rank) return res cdef api GpuArray pygpu_make_all_reduced(GpuComm comm, GpuArray src, int opcode): cdef GpuArray res res = pygpu_empty_like(src, GA_ANY_ORDER, -1) comm_all_reduce(comm, src, res, opcode) return res cdef api GpuArray pygpu_make_reduce_scattered(GpuComm comm, GpuArray src, int opcode): if src.ga.nd < 1: raise TypeError, "Source GpuArray must have number of dimensions >= 1" cdef GpuArray res cdef int gpucount cdef bint is_c_cont cdef unsigned int nd cdef size_t chosen_dim_size cdef size_t* dims cdef unsigned int j comm_get_count(comm, &gpucount) is_c_cont = GpuArray_IS_C_CONTIGUOUS(&src.ga) nd = src.ga.nd dims = calloc(nd, sizeof(size_t)) if dims == NULL: raise MemoryError, "Could not allocate dims" try: if is_c_cont: # Smallest in index dimension has the largest stride if src.ga.dimensions[0] % gpucount == 0: chosen_dim_size = src.ga.dimensions[0] / gpucount if chosen_dim_size != 1: dims[0] = chosen_dim_size for j in range(1, nd): dims[j] = src.ga.dimensions[j] else: for j in range(nd - 1): dims[j] = src.ga.dimensions[1 + j] nd -= 1 else: raise TypeError, "Source GpuArray cannot be split in %d c-contiguous arrays" % (gpucount) else: # Largest in index dimension has the largest stride if src.ga.dimensions[nd - 1] % gpucount == 0: chosen_dim_size = src.ga.dimensions[nd - 1] / gpucount for j in range(nd - 1): dims[j] = src.ga.dimensions[j] if chosen_dim_size != 1: dims[nd - 1] = chosen_dim_size else: nd -= 1 else: raise TypeError, "Source GpuArray cannot be split in %d f-contiguous arrays" % (gpucount) res = pygpu_empty(nd, dims, src.ga.typecode, GA_C_ORDER if is_c_cont else GA_F_ORDER, src.context, type(src)) comm_reduce_scatter(comm, src, res, opcode) finally: free(dims) return res cdef api GpuArray pygpu_make_all_gathered(GpuComm comm, GpuArray src, unsigned int nd_up): if src.ga.nd < 1: raise TypeError, "Source GpuArray must have number of dimensions >= 1" cdef GpuArray res cdef int gpucount cdef bint is_c_cont cdef unsigned int nd cdef size_t* dims cdef unsigned int j comm_get_count(comm, &gpucount) is_c_cont = GpuArray_IS_C_CONTIGUOUS(&src.ga) nd = src.ga.nd + nd_up dims = calloc(nd, sizeof(size_t)) if dims == NULL: raise MemoryError, "Could not allocate dims" try: if is_c_cont: # Smallest in index dimension has the largest stride if nd_up == 0: dims[0] = gpucount * src.ga.dimensions[0] for j in range(1, nd): dims[j] = src.ga.dimensions[j] else: dims[0] = gpucount for j in range(1, nd_up): dims[j] = 1 for j in range(src.ga.nd): dims[nd_up + j] = src.ga.dimensions[j] else: # Largest in index dimension has the largest stride if nd_up == 0: dims[nd - 1] = gpucount * src.ga.dimensions[nd - 1] for j in range(nd - 1): dims[j] = src.ga.dimensions[j] else: dims[nd - 1] = gpucount for j in range(nd_up - 1): dims[src.ga.nd + j] = 1 for j in range(src.ga.nd): dims[j] = src.ga.dimensions[j] res = pygpu_empty(nd, dims, src.ga.typecode, GA_C_ORDER if is_c_cont else GA_F_ORDER, src.context, type(src)) comm_all_gather(comm, src, res) finally: free(dims) return res libgpuarray-0.7.6/pygpu/dtypes.py000066400000000000000000000134411326743622600171110ustar00rootroot00000000000000"""Type mapping helpers.""" from __future__ import division import numpy as np from . 
import gpuarray __copyright__ = "Copyright (C) 2011 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ # {{{ registry NAME_TO_DTYPE = {} def register_dtype(dtype, c_names): """ Associate a numpy dtype with its C equivalents. Will register `dtype` for use with the gpuarray module. If the c_names argument is a list then the first element of that list is taken as the primary association and will be used for generated C code. The other types will be mapped to the provided dtype when going in the other direction. Parameters ---------- dtype: numpy.dtype or string type to associate c_names: str or list list of C type names """ if isinstance(c_names, str): c_names = [c_names] dtype = np.dtype(dtype) # register if not already there try: gpuarray.dtype_to_ctype(dtype) except ValueError: gpuarray.register_dtype(dtype, c_names[0]) for nm in c_names: if nm in NAME_TO_DTYPE and NAME_TO_DTYPE[nm] != dtype: raise RuntimeError("name '%s' already registered" % nm) NAME_TO_DTYPE[nm] = dtype def _fill_dtype_registry(): register_dtype(np.bool, ["ga_bool", "bool"]) register_dtype(np.int8, ["ga_byte", "char", "signed char"]) register_dtype(np.uint8, ["ga_ubyte", "unsigned char"]) register_dtype(np.int16, ["ga_short", "short", "signed short", "signed short int", "short signed int"]) register_dtype(np.uint16, ["ga_ushort", "unsigned short", "unsigned short int", "short unsigned int"]) register_dtype(np.int32, ["ga_int", "int", "signed int"]) register_dtype(np.uint32, ["ga_uint", "unsigned", "unsigned int"]) register_dtype(np.int64, ["ga_long", "long int", "signed long int", "long signed int"]) register_dtype(np.uint64, ["ga_ulong", "unsigned long", "unsigned long int", "long unsigned int"]) register_dtype(np.intp, ["ga_ssize", "ssize_t"]) register_dtype(np.uintp, ["ga_size", "size_t"]) register_dtype(np.float32, ["ga_float", "float"]) register_dtype(np.float64, ["ga_double", "double"]) # }}} # {{{ dtype -> ctype def dtype_to_ctype(dtype): """ Return the C type that corresponds to `dtype`. 
Parameters ---------- dtype: data type a numpy dtype """ if dtype is None: raise ValueError("dtype may not be None") dtype = np.dtype(dtype) return gpuarray.dtype_to_ctype(dtype) # }}} # {{{ c declarator parsing def parse_c_arg_backend(c_arg, scalar_arg_class, vec_arg_class): c_arg = c_arg.replace("const", "").replace("volatile", "") # process and remove declarator import re decl_re = re.compile(r"(\**)\s*([_a-zA-Z0-9]+)(\s*\[[ 0-9]*\])*\s*$") decl_match = decl_re.search(c_arg) if decl_match is None: raise ValueError("couldn't parse C declarator '%s'" % c_arg) name = decl_match.group(2) if decl_match.group(1) or decl_match.group(3) is not None: arg_class = vec_arg_class else: arg_class = scalar_arg_class tp = c_arg[:decl_match.start()] tp = " ".join(tp.split()) try: dtype = NAME_TO_DTYPE[tp] except KeyError: raise ValueError("unknown type '%s'" % tp) return arg_class(dtype, name) # }}} def get_np_obj(obj): """ Returns a numpy object of the same dtype and comportement as the source suitable for output dtype determination. This is used since the casting rules of numpy are rather obscure and the best way to imitate them is to try an operation ans see what it does. """ if isinstance(obj, np.ndarray) and obj.shape == (): return obj try: return np.ones(1, dtype=obj.dtype) except AttributeError: return np.asarray(obj) def get_common_dtype(obj1, obj2, allow_double): """ Returns the proper output type for a numpy operation involving the two provided objects. This may not be suitable for certain obscure numpy operations. If `allow_double` is False, a return type of float64 will be forced to float32 and complex128 will be forced to complex64. """ # Yes, numpy behaves differently depending on whether # we're dealing with arrays or scalars. np1 = get_np_obj(obj1) np2 = get_np_obj(obj2) result = (np1 + np2).dtype if not allow_double: if result == np.float64: result = np.dtype(np.float32) elif result == np.complex128: result = np.dtype(np.complex64) return result def upcast(*args): a = np.array([0], dtype=args[0]) for t in args[1:]: a = a + np.array([0], dtype=t) return a.dtype # vim: foldmethod=marker libgpuarray-0.7.6/pygpu/elemwise.py000066400000000000000000000062001326743622600174060ustar00rootroot00000000000000import numpy from .dtypes import dtype_to_ctype, get_common_dtype from . 
import gpuarray from ._elemwise import GpuElemwise, arg __all__ = ['GpuElemwise', 'arg', 'as_argument', 'elemwise1', 'elemwise2', 'ielemwise2', 'compare'] def _dtype(o): if hasattr(o, 'dtype'): return o.dtype return numpy.asarray(o).dtype def as_argument(o, name, read=False, write=False): if (not read) and (not write): raise ValueError('argument is neither read not write') return arg(name, _dtype(o), scalar=not isinstance(o, gpuarray.GpuArray), read=read, write=write) def elemwise1(a, op, oper=None, op_tmpl="res = %(op)sa", out=None, convert_f16=True): args = (as_argument(a, 'res', write=True), as_argument(a, 'a', read=True)) if out is None: res = a._empty_like_me() else: res = out if oper is None: oper = op_tmpl % {'op': op} k = GpuElemwise(a.context, oper, args, convert_f16=convert_f16) k(res, a) return res def elemwise2(a, op, b, ary, odtype=None, oper=None, op_tmpl="res = (%(out_t)s)a %(op)s (%(out_t)s)b", broadcast=False, convert_f16=True): ndim_extend = True if not isinstance(a, gpuarray.GpuArray): a = numpy.asarray(a) ndim_extend = False if not isinstance(b, gpuarray.GpuArray): b = numpy.asarray(b) ndim_extend = False if odtype is None: odtype = get_common_dtype(a, b, True) a_arg = as_argument(a, 'a', read=True) b_arg = as_argument(b, 'b', read=True) args = [arg('res', odtype, write=True), a_arg, b_arg] if ndim_extend: if a.ndim != b.ndim: nd = max(a.ndim, b.ndim) if a.ndim < nd: a = a.reshape(((1,) * (nd - a.ndim)) + a.shape) if b.ndim < nd: b = b.reshape(((1,) * (nd - b.ndim)) + b.shape) out_shape = tuple(max(sa, sb) for sa, sb in zip(a.shape, b.shape)) res = gpuarray.empty(out_shape, dtype=odtype, context=ary.context, cls=ary.__class__) else: res = ary._empty_like_me(dtype=odtype) if oper is None: if convert_f16 and odtype == 'float16': odtype = numpy.dtype('float32') oper = op_tmpl % {'op': op, 'out_t': dtype_to_ctype(odtype)} k = GpuElemwise(ary.context, oper, args, convert_f16=convert_f16) k(res, a, b, broadcast=broadcast) return res def ielemwise2(a, op, b, oper=None, op_tmpl="a = a %(op)s b", broadcast=False, convert_f16=True): if not isinstance(b, gpuarray.GpuArray): b = numpy.asarray(b) a_arg = as_argument(a, 'a', read=True, write=True) b_arg = as_argument(b, 'b', read=True) args = [a_arg, b_arg] if oper is None: oper = op_tmpl % {'op': op} k = GpuElemwise(a.context, oper, args, convert_f16=convert_f16) k(a, b, broadcast=broadcast) return a def compare(a, op, b, broadcast=False, convert_f16=True): return elemwise2(a, op, b, a, odtype=numpy.dtype('bool'), op_tmpl="res = (a %(op)s b)", broadcast=broadcast, convert_f16=convert_f16) libgpuarray-0.7.6/pygpu/gpuarray.pxd000066400000000000000000000324361326743622600176030ustar00rootroot00000000000000cimport libc # This is used in a hack to silence some over-eager warnings. 
cdef extern from *: ctypedef object slice_object "PySliceObject *" cdef extern from "stdlib.h": void *memcpy(void *dst, void *src, size_t n) void *memset(void *b, int c, size_t sz) cimport numpy as np cdef extern from "numpy/arrayobject.h": object _PyArray_Empty "PyArray_Empty" (int, np.npy_intp *, np.dtype, int) cdef object PyArray_Empty(int a, np.npy_intp *b, np.dtype c, int d) cdef extern from "Python.h": int PySlice_GetIndicesEx(object slice, Py_ssize_t length, Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step, Py_ssize_t *slicelength) except -1 cdef extern from "gpuarray/config.h": int GPUARRAY_API_VERSION int GPUARRAY_ABI_VERSION cdef extern from "gpuarray/types.h": ctypedef struct gpuarray_type: const char *cluda_name size_t size size_t align int typecode enum GPUARRAY_TYPES: GA_BUFFER, GA_BOOL, GA_BYTE, GA_UBYTE, GA_SHORT, GA_USHORT, GA_INT, GA_UINT, GA_LONG, GA_ULONG, GA_FLOAT, GA_DOUBLE, GA_CFLOAT, GA_CDOUBLE, GA_HALF, GA_SIZE, GA_SSIZE, GA_NBASE cdef extern from "gpuarray/util.h": int gpuarray_register_type(gpuarray_type *t, int *ret) size_t gpuarray_get_elsize(int typecode) gpuarray_type *gpuarray_get_type(int typecode) cdef extern from "gpuarray/error.h": cdef enum ga_error: GA_NO_ERROR, GA_MEMORY_ERROR, GA_VALUE_ERROR, GA_IMPL_ERROR, GA_INVALID_ERROR, GA_UNSUPPORTED_ERROR, GA_SYS_ERROR, GA_RUN_ERROR, GA_DEVSUP_ERROR, GA_READONLY_ERROR, GA_WRITEONLY_ERROR, GA_BLAS_ERROR, GA_UNALIGNED_ERROR, GA_COPY_ERROR, GA_COMM_ERROR cdef extern from "gpuarray/buffer.h": ctypedef struct gpucontext_props: pass ctypedef struct gpucontext: pass ctypedef struct gpudata: pass ctypedef struct gpukernel: pass int gpu_get_platform_count(const char* name, unsigned int* platcount) int gpu_get_device_count(const char* name, unsigned int platform, unsigned int* devcount) int gpucontext_props_new(gpucontext_props **res) int gpucontext_props_cuda_dev(gpucontext_props *p, int devno) int gpucontext_props_opencl_dev(gpucontext_props *p, int platno, int devno) int gpucontext_props_sched(gpucontext_props *p, int sched) int gpucontext_props_set_single_stream(gpucontext_props *p) int gpucontext_props_kernel_cache(gpucontext_props *p, const char *path) int gpucontext_props_alloc_cache(gpucontext_props *p, size_t initial, size_t max) void gpucontext_props_del(gpucontext_props *p) int gpucontext_init(gpucontext **res, const char *name, gpucontext_props *p) void gpucontext_deref(gpucontext *ctx) char *gpucontext_error(gpucontext *ctx, int err) int gpudata_property(gpudata *ctx, int prop_id, void *res) int gpucontext_property(gpucontext *ctx, int prop_id, void *res) int gpukernel_property(gpukernel *k, int prop_id, void *res) gpucontext *gpudata_context(gpudata *) gpucontext *gpukernel_context(gpukernel *) int GA_CTX_SCHED_AUTO int GA_CTX_SCHED_SINGLE int GA_CTX_SCHED_MULTI int GA_CTX_PROP_DEVNAME int GA_CTX_PROP_UNIQUE_ID int GA_CTX_PROP_LMEMSIZE int GA_CTX_PROP_NUMPROCS int GA_CTX_PROP_BIN_ID int GA_CTX_PROP_TOTAL_GMEM int GA_CTX_PROP_FREE_GMEM int GA_CTX_PROP_MAXLSIZE0 int GA_CTX_PROP_MAXLSIZE1 int GA_CTX_PROP_MAXLSIZE2 int GA_CTX_PROP_MAXGSIZE0 int GA_CTX_PROP_MAXGSIZE1 int GA_CTX_PROP_MAXGSIZE2 int GA_CTX_PROP_LARGEST_MEMBLOCK int GA_BUFFER_PROP_SIZE int GA_KERNEL_PROP_MAXLSIZE int GA_KERNEL_PROP_PREFLSIZE int GA_KERNEL_PROP_NUMARGS int GA_KERNEL_PROP_TYPES cdef enum ga_usefl: GA_USE_SMALL, GA_USE_DOUBLE, GA_USE_COMPLEX, GA_USE_HALF, GA_USE_CUDA, GA_USE_OPENCL cdef extern from "gpuarray/kernel.h": ctypedef struct _GpuKernel "GpuKernel": gpukernel *k int GpuKernel_init(_GpuKernel *k, gpucontext *ctx, unsigned 
int count, const char **strs, const size_t *lens, const char *name, unsigned int argcount, const int *types, int flags, char **err_str) void GpuKernel_clear(_GpuKernel *k) gpucontext *GpuKernel_context(_GpuKernel *k) int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *gs, size_t *ls) int GpuKernel_call(_GpuKernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args) cdef extern from "gpuarray/array.h": ctypedef struct _GpuArray "GpuArray": gpudata *data size_t offset size_t *dimensions ssize_t *strides unsigned int nd int flags int typecode cdef int GA_C_CONTIGUOUS cdef int GA_F_CONTIGUOUS cdef int GA_ALIGNED cdef int GA_WRITEABLE cdef int GA_BEHAVED cdef int GA_CARRAY cdef int GA_FARRAY bint GpuArray_CHKFLAGS(_GpuArray *a, int fl) bint GpuArray_ISONESEGMENT(_GpuArray *a) bint GpuArray_IS_C_CONTIGUOUS(_GpuArray *a) ctypedef enum ga_order: GA_ANY_ORDER, GA_C_ORDER, GA_F_ORDER void GpuArray_fix_flags(_GpuArray *a) int GpuArray_empty(_GpuArray *a, gpucontext *ctx, int typecode, int nd, const size_t *dims, ga_order ord) int GpuArray_fromdata(_GpuArray *a, gpudata *data, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, int writable) int GpuArray_view(_GpuArray *v, _GpuArray *a) int GpuArray_sync(_GpuArray *a) nogil int GpuArray_index(_GpuArray *r, _GpuArray *a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps) int GpuArray_take1(_GpuArray *r, _GpuArray *a, _GpuArray *i, int check_err) int GpuArray_setarray(_GpuArray *v, _GpuArray *a) int GpuArray_reshape(_GpuArray *res, _GpuArray *a, unsigned int nd, const size_t *newdims, ga_order ord, int nocopy) int GpuArray_reshape_inplace(_GpuArray *a, unsigned int nd, const size_t *newdims, ga_order ord) int GpuArray_transpose(_GpuArray *res, _GpuArray *a, const unsigned int *new_axes) void GpuArray_clear(_GpuArray *a) int GpuArray_share(_GpuArray *a, _GpuArray *b) gpucontext *GpuArray_context(_GpuArray *a) int GpuArray_move(_GpuArray *dst, _GpuArray *src) int GpuArray_write(_GpuArray *dst, void *src, size_t src_sz) nogil int GpuArray_read(void *dst, size_t dst_sz, _GpuArray *src) nogil int GpuArray_memset(_GpuArray *a, int data) int GpuArray_copy(_GpuArray *res, _GpuArray *a, ga_order order) int GpuArray_transfer(_GpuArray *res, const _GpuArray *a) nogil int GpuArray_split(_GpuArray **rs, const _GpuArray *a, size_t n, size_t *p, unsigned int axis) int GpuArray_concatenate(_GpuArray *r, const _GpuArray **as, size_t n, unsigned int axis, int restype) char *GpuArray_error(_GpuArray *a, int err) void GpuArray_fprintf(libc.stdio.FILE *fd, _GpuArray *a) bint GpuArray_is_c_contiguous(_GpuArray *a) bint GpuArray_is_f_contiguous(_GpuArray *a) cdef extern from "gpuarray/extension.h": void *gpuarray_get_extension(const char *) ctypedef struct GpuArrayIpcMemHandle: pass cdef int GPUARRAY_CUDA_CTX_NOFREE cdef type get_exc(int errcode) cdef np.dtype dtype_to_npdtype(dtype) # If you change the api interface, you MUST increment either the minor # (if you add a function) or the major version (if you change # arguments or remove a function) in the gpuarray.pyx file. 
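# ---------------------------------------------------------------------------
# A minimal sketch of how the layout machinery declared above (the
# GA_C_CONTIGUOUS / GA_F_CONTIGUOUS flags, GpuArray_reshape, GpuArray_copy)
# surfaces on the Python side of GpuArray.  It assumes a standard pygpu
# install with a 'cuda0' device; the order= keyword on empty() and the
# np.asarray round trip are assumptions here, not something this file
# declares.
import numpy as np
import pygpu

ctx = pygpu.init('cuda0')
A = pygpu.gpuarray.array(np.arange(12, dtype='float32').reshape(3, 4),
                         context=ctx)

print(A.shape, A.dtype)              # (3, 4) float32
print(A.flags.c_contiguous)          # True: row-major, single segment
print(A.flags['F_CONTIGUOUS'])       # False for a 3x4 C-ordered array

B = A.reshape((4, 3))                # same data, new shape when possible
F = pygpu.gpuarray.empty((3, 4), dtype='float32', order='F', context=ctx)
print(F.flags.f_contiguous)          # True: column-major layout

C = A.copy()                         # device-to-device copy
print(np.asarray(C))                 # pull the data back to the host
# ---------------------------------------------------------------------------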
cdef api np.dtype typecode_to_dtype(int typecode) cdef api int get_typecode(dtype) except -1 cpdef int dtype_to_typecode(dtype) except -1 cdef ga_order to_ga_order(ord) except -2 cdef bint py_CHKFLAGS(GpuArray a, int flags) cdef bint py_ISONESEGMENT(GpuArray a) cdef int array_empty(GpuArray a, gpucontext *ctx, int typecode, unsigned int nd, const size_t *dims, ga_order ord) except -1 cdef int array_fromdata(GpuArray a, gpudata *data, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, int writeable) except -1 cdef int array_view(GpuArray v, GpuArray a) except -1 cdef int array_sync(GpuArray a) except -1 cdef int array_index(GpuArray r, GpuArray a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps) except -1 cdef int array_take1(GpuArray r, GpuArray a, GpuArray i, int check_err) except -1 cdef int array_setarray(GpuArray v, GpuArray a) except -1 cdef int array_reshape(GpuArray res, GpuArray a, unsigned int nd, const size_t *newdims, ga_order ord, bint nocopy) except -1 cdef int array_transpose(GpuArray res, GpuArray a, const unsigned int *new_axes) except -1 cdef int array_clear(GpuArray a) except -1 cdef bint array_share(GpuArray a, GpuArray b) cdef gpucontext *array_context(GpuArray a) except NULL cdef int array_move(GpuArray a, GpuArray src) except -1 cdef int array_write(GpuArray a, void *src, size_t sz) except -1 cdef int array_read(void *dst, size_t sz, GpuArray src) except -1 cdef int array_memset(GpuArray a, int data) except -1 cdef int array_copy(GpuArray res, GpuArray a, ga_order order) except -1 cdef int array_transfer(GpuArray res, GpuArray a) except -1 cdef const char *kernel_error(GpuKernel k, int err) except NULL cdef int kernel_init(GpuKernel k, gpucontext *ctx, unsigned int count, const char **strs, const size_t *len, const char *name, unsigned int argcount, const int *types, int flags) except -1 cdef int kernel_clear(GpuKernel k) except -1 cdef gpucontext *kernel_context(GpuKernel k) except NULL cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1 cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args) except -1 cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1 cdef int ctx_property(GpuContext c, int prop_id, void *res) except -1 cdef GpuContext ensure_context(GpuContext c) cdef api GpuContext pygpu_default_context() cdef api bint pygpu_GpuArray_Check(object o) cdef api GpuContext pygpu_init(object dev, gpucontext_props *p) cdef api GpuArray pygpu_zeros(unsigned int nd, const size_t *dims, int typecode, ga_order order, GpuContext context, object cls) cdef api GpuArray pygpu_empty(unsigned int nd, const size_t *dims, int typecode, ga_order order, GpuContext context, object cls) cdef api GpuArray pygpu_fromgpudata(gpudata *buf, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, GpuContext context, bint writable, object base, object cls) cdef api GpuArray pygpu_copy(GpuArray a, ga_order ord) cdef api int pygpu_move(GpuArray a, GpuArray src) except -1 cdef api GpuArray pygpu_view(GpuArray a, object cls) cdef api int pygpu_sync(GpuArray a) except -1 cdef api GpuArray pygpu_empty_like(GpuArray a, ga_order ord, int typecode) cdef api np.ndarray pygpu_as_ndarray(GpuArray a) cdef np.ndarray _pygpu_as_ndarray(GpuArray a, np.dtype ldtype) cdef api GpuArray pygpu_index(GpuArray a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps) cdef api GpuArray pygpu_reshape(GpuArray a, 
unsigned int nd, const size_t *newdims, ga_order ord, bint nocopy, int compute_axis) cdef api GpuArray pygpu_transpose(GpuArray a, const unsigned int *newaxes) cdef api int pygpu_transfer(GpuArray res, GpuArray a) except -1 cdef api GpuArray pygpu_concatenate(const _GpuArray **a, size_t n, unsigned int axis, int restype, object cls, GpuContext context) cdef api class GpuContext [type PyGpuContextType, object PyGpuContextObject]: cdef dict __dict__ cdef gpucontext* ctx cdef readonly bytes kind cdef object __weakref__ cdef GpuArray new_GpuArray(object cls, GpuContext ctx, object base) cdef api class GpuArray [type PyGpuArrayType, object PyGpuArrayObject]: cdef _GpuArray ga cdef readonly GpuContext context cdef readonly object base cdef object __weakref__ cdef __index_helper(self, key, unsigned int i, ssize_t *start, ssize_t *stop, ssize_t *step) cdef __cgetitem__(self, idx) cdef api class GpuKernel [type PyGpuKernelType, object PyGpuKernelObject]: cdef _GpuKernel k cdef readonly GpuContext context cdef void **callbuf cdef object __weakref__ cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared) cdef _setarg(self, unsigned int index, int typecode, object o) libgpuarray-0.7.6/pygpu/gpuarray.pyx000066400000000000000000002422671326743622600176350ustar00rootroot00000000000000cimport libc.stdio from libc.stdlib cimport malloc, calloc, free from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.string cimport strncmp cimport numpy as np import numpy as np import sys from cpython cimport Py_INCREF, PyNumber_Index from cpython.object cimport Py_EQ, Py_NE def api_version(): """api_version() """ # (library version, module version) return (GPUARRAY_API_VERSION, 0) def abi_version(): """abi_version() """ major_version = GPUARRAY_ABI_VERSION / 1000 minor_version = GPUARRAY_ABI_VERSION % 1000 return (major_version, minor_version) np.import_array() # to export the numeric value SIZE = GA_SIZE SSIZE = GA_SSIZE # Numpy API steals dtype references and this breaks cython cdef object PyArray_Empty(int a, np.npy_intp *b, np.dtype c, int d): Py_INCREF(c) return _PyArray_Empty(a, b, c, d) cdef bytes _s(s): if isinstance(s, unicode): return (s).encode('ascii') if isinstance(s, bytes): return s raise TypeError("Expected a string") cdef size_t countis(l, object val): cdef size_t count cdef size_t i count = 0 for i in range(len(l)): if l[i] is val: count += 1 return count def cl_wrap_ctx(size_t ptr): """ cl_wrap_ctx(ptr) Wrap an existing OpenCL context (the cl_context struct) into a GpuContext class. """ cdef gpucontext *(*cl_make_ctx)(void *, int) cdef GpuContext res cl_make_ctx = gpuarray_get_extension("cl_make_ctx") if cl_make_ctx == NULL: raise RuntimeError, "cl_make_ctx extension is absent" res = GpuContext.__new__(GpuContext) res.ctx = cl_make_ctx(ptr, 0) if res.ctx == NULL: raise RuntimeError, "cl_make_ctx call failed" return res def cuda_wrap_ctx(size_t ptr, bint own): """ cuda_wrap_ctx(ptr) Wrap an existing CUDA driver context (CUcontext) into a GpuContext class. If `own` is true, libgpuarray is now reponsible for the context and it will be destroyed once there are no references to it. Otherwise, the context will not be destroyed and it is the calling code's reponsability. 
""" cdef gpucontext *(*cuda_make_ctx)(void *, int) cdef int flags cdef GpuContext res cuda_make_ctx = gpuarray_get_extension("cuda_make_ctx") if cuda_make_ctx == NULL: raise RuntimeError, "cuda_make_ctx extension is absent" res = GpuContext.__new__(GpuContext) flags = 0 if not own: flags |= GPUARRAY_CUDA_CTX_NOFREE res.ctx = cuda_make_ctx(ptr, flags) if res.ctx == NULL: raise RuntimeError, "cuda_make_ctx call failed" return res import numpy cdef dict NP_TO_TYPE = { np.dtype('bool'): GA_BOOL, np.dtype('int8'): GA_BYTE, np.dtype('uint8'): GA_UBYTE, np.dtype('int16'): GA_SHORT, np.dtype('uint16'): GA_USHORT, np.dtype('int32'): GA_INT, np.dtype('uint32'): GA_UINT, np.dtype('int64'): GA_LONG, np.dtype('uint64'): GA_ULONG, np.dtype('float32'): GA_FLOAT, np.dtype('float64'): GA_DOUBLE, np.dtype('complex64'): GA_CFLOAT, np.dtype('complex128'): GA_CDOUBLE, np.dtype('float16'): GA_HALF, } cdef dict TYPE_TO_NP = dict((v, k) for k, v in NP_TO_TYPE.iteritems()) def register_dtype(np.dtype dtype, cname): """ register_dtype(dtype, cname) Make a new type known to the cluda machinery. This function return the associted internal typecode for the new type. Parameters ---------- dtype: numpy.dtype new type cname: str C name for the type declarations """ cdef gpuarray_type *t cdef int typecode cdef char *tmp t = malloc(sizeof(gpuarray_type)) if t == NULL: raise MemoryError, "Can't allocate new type" tmp = malloc(len(cname)+1) if tmp == NULL: free(t) raise MemoryError memcpy(tmp, cname, len(cname)+1) t.size = dtype.itemsize t.align = dtype.alignment t.cluda_name = tmp typecode = gpuarray_register_type(t, NULL) if typecode == -1: free(tmp) free(t) raise RuntimeError, "Could not register type" NP_TO_TYPE[dtype] = typecode TYPE_TO_NP[typecode] = dtype cdef np.dtype typecode_to_dtype(int typecode): res = TYPE_TO_NP.get(typecode, None) if res is not None: return res else: raise NotImplementedError, "TODO" # This function takes a flexible dtype as accepted by the functions of # this module and ensures it becomes a numpy dtype. cdef np.dtype dtype_to_npdtype(dtype): if dtype is None: return None if isinstance(dtype, int): return typecode_to_dtype(dtype) try: return np.dtype(dtype) except TypeError: pass if isinstance(dtype, np.dtype): return dtype raise ValueError("data type not understood", dtype) # This is a stupid wrapper to avoid the extra argument introduced by having # dtype_to_typecode declared 'cpdef'. cdef int get_typecode(dtype) except -1: return dtype_to_typecode(dtype) cpdef int dtype_to_typecode(dtype) except -1: """ dtype_to_typecode(dtype) Get the internal typecode for a type. Parameters ---------- dtype: numpy.dtype type to get the code for """ if isinstance(dtype, int): return dtype try: dtype = np.dtype(dtype) except TypeError: pass if isinstance(dtype, np.dtype): res = NP_TO_TYPE.get(dtype, None) if res is not None: return res raise ValueError, "don't know how to convert to dtype: %s"%(dtype,) def dtype_to_ctype(dtype): """ dtype_to_ctype(dtype) Return the C name for a type. 
Parameters ---------- dtype: numpy.dtype type to get the name for """ cdef int typecode = dtype_to_typecode(dtype) cdef const gpuarray_type *t = gpuarray_get_type(typecode) cdef bytes res if t.cluda_name == NULL: raise ValueError, "No mapping for %s"%(dtype,) res = t.cluda_name return res.decode('ascii') cdef ga_order to_ga_order(ord) except -2: if ord == "C" or ord == "c": return GA_C_ORDER elif ord == "A" or ord == "a" or ord is None: return GA_ANY_ORDER elif ord == "F" or ord == "f": return GA_F_ORDER else: raise ValueError, "Valid orders are: 'A' (any), 'C' (C), 'F' (Fortran)" cdef int strides_ok(GpuArray a, strides): # Check that the passed in strides will not go outside of the # memory of the array. It is assumed that the strides are of the # proper length. cdef ssize_t max_axis_offset cdef size_t lower = a.ga.offset cdef size_t upper = a.ga.offset cdef size_t itemsize = gpuarray_get_elsize(a.ga.typecode) cdef size_t size cdef unsigned int i gpudata_property(a.ga.data, GA_BUFFER_PROP_SIZE, &size) for i in range(a.ga.nd): if a.ga.dimensions[i] == 0: return 1 max_axis_offset = (strides[i]) * (a.ga.dimensions[i] - 1) if max_axis_offset > 0: if upper + max_axis_offset > size: return 0 upper += max_axis_offset else: if lower < (-max_axis_offset): return 0 lower += max_axis_offset return (upper + itemsize) <= size class GpuArrayException(Exception): """ Exception used for most errors related to libgpuarray. """ class UnsupportedException(GpuArrayException): pass cdef type get_exc(int errcode): if errcode == GA_VALUE_ERROR: return ValueError if errcode == GA_DEVSUP_ERROR: return UnsupportedException else: return GpuArrayException cdef bint py_CHKFLAGS(GpuArray a, int flags): return GpuArray_CHKFLAGS(&a.ga, flags) cdef bint py_ISONESEGMENT(GpuArray a): return GpuArray_ISONESEGMENT(&a.ga) cdef void array_fix_flags(GpuArray a): GpuArray_fix_flags(&a.ga) cdef int array_empty(GpuArray a, gpucontext *ctx, int typecode, unsigned int nd, const size_t *dims, ga_order ord) except -1: cdef int err err = GpuArray_empty(&a.ga, ctx, typecode, nd, dims, ord) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(ctx, err) cdef int array_fromdata(GpuArray a, gpudata *data, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, int writeable) except -1: cdef int err err = GpuArray_fromdata(&a.ga, data, offset, typecode, nd, dims, strides, writeable) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(gpudata_context(data), err) cdef int array_view(GpuArray v, GpuArray a) except -1: cdef int err err = GpuArray_view(&v.ga, &a.ga) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_sync(GpuArray a) except -1: cdef int err with nogil: err = GpuArray_sync(&a.ga) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_index(GpuArray r, GpuArray a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps) except -1: cdef int err err = GpuArray_index(&r.ga, &a.ga, starts, stops, steps) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_take1(GpuArray r, GpuArray a, GpuArray i, int check_err) except -1: cdef int err err = GpuArray_take1(&r.ga, &a.ga, &i.ga, check_err) if err != GA_NO_ERROR: if err == GA_VALUE_ERROR: raise IndexError, GpuArray_error(&r.ga, err) raise get_exc(err), GpuArray_error(&r.ga, err) cdef int array_setarray(GpuArray v, GpuArray a) except -1: cdef int err err = GpuArray_setarray(&v.ga, &a.ga) if err != GA_NO_ERROR: raise get_exc(err), 
GpuArray_error(&v.ga, err) cdef int array_reshape(GpuArray res, GpuArray a, unsigned int nd, const size_t *newdims, ga_order ord, bint nocopy) except -1: cdef int err err = GpuArray_reshape(&res.ga, &a.ga, nd, newdims, ord, nocopy) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_transpose(GpuArray res, GpuArray a, const unsigned int *new_axes) except -1: cdef int err err = GpuArray_transpose(&res.ga, &a.ga, new_axes) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_clear(GpuArray a) except -1: GpuArray_clear(&a.ga) cdef bint array_share(GpuArray a, GpuArray b): return GpuArray_share(&a.ga, &b.ga) cdef gpucontext *array_context(GpuArray a) except NULL: cdef gpucontext *res res = GpuArray_context(&a.ga) if res is NULL: raise GpuArrayException, "Invalid array or destroyed context" return res cdef int array_move(GpuArray a, GpuArray src) except -1: cdef int err err = GpuArray_move(&a.ga, &src.ga) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_write(GpuArray a, void *src, size_t sz) except -1: cdef int err with nogil: err = GpuArray_write(&a.ga, src, sz) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_read(void *dst, size_t sz, GpuArray src) except -1: cdef int err with nogil: err = GpuArray_read(dst, sz, &src.ga) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&src.ga, err) cdef int array_memset(GpuArray a, int data) except -1: cdef int err err = GpuArray_memset(&a.ga, data) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_copy(GpuArray res, GpuArray a, ga_order order) except -1: cdef int err err = GpuArray_copy(&res.ga, &a.ga, order) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_transfer(GpuArray res, GpuArray a) except -1: cdef int err with nogil: err = GpuArray_transfer(&res.ga, &a.ga) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_split(_GpuArray **res, GpuArray a, size_t n, size_t *p, unsigned int axis) except -1: cdef int err err = GpuArray_split(res, &a.ga, n, p, axis) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&a.ga, err) cdef int array_concatenate(GpuArray r, const _GpuArray **a, size_t n, unsigned int axis, int restype) except -1: cdef int err err = GpuArray_concatenate(&r.ga, a, n, axis, restype) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(a[0], err) cdef const char *kernel_error(GpuKernel k, int err) except NULL: return gpucontext_error(gpukernel_context(k.k.k), err) cdef int kernel_init(GpuKernel k, gpucontext *ctx, unsigned int count, const char **strs, const size_t *len, const char *name, unsigned int argcount, const int *types, int flags) except -1: cdef int err cdef char *err_str = NULL err = GpuKernel_init(&k.k, ctx, count, strs, len, name, argcount, types, flags, &err_str) if err != GA_NO_ERROR: if err_str != NULL: try: py_err_str = err_str.decode('UTF-8') finally: free(err_str) raise get_exc(err), py_err_str raise get_exc(err), gpucontext_error(ctx, err) cdef int kernel_clear(GpuKernel k) except -1: GpuKernel_clear(&k.k) cdef gpucontext *kernel_context(GpuKernel k) except NULL: cdef gpucontext *res res = GpuKernel_context(&k.k) if res is NULL: raise GpuArrayException, "Invalid kernel or destroyed context" return res cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1: cdef int err err = GpuKernel_sched(&k.k, n, gs, ls) if err != GA_NO_ERROR: raise 
get_exc(err), kernel_error(k, err) cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args) except -1: cdef int err err = GpuKernel_call(&k.k, n, gs, ls, shared, args) if err != GA_NO_ERROR: raise get_exc(err), kernel_error(k, err) cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1: cdef int err err = gpukernel_property(k.k.k, prop_id, res) if err != GA_NO_ERROR: raise get_exc(err), kernel_error(k, err) cdef GpuContext pygpu_default_context(): return default_context cdef GpuContext default_context = None cdef int ctx_property(GpuContext c, int prop_id, void *res) except -1: cdef int err err = gpucontext_property(c.ctx, prop_id, res) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(c.ctx, err) def set_default_context(GpuContext ctx): """ set_default_context(ctx) Set the default context for the module. The provided context will be used as a default value for all the other functions in this module which take a context as parameter. Call with `None` to clear the default value. If you don't call this function the context of all other functions is a mandatory argument. This can be helpful to reduce clutter when working with only one context. It is strongly discouraged to use this function when working with multiple contexts at once. Parameters ---------- ctx: GpuContext default context """ global default_context default_context = ctx def get_default_context(): """ get_default_context() Return the currently defined default context (or `None`). """ return default_context cdef GpuContext ensure_context(GpuContext c): global default_context if c is None: if default_context is None: raise TypeError, "No context specified." return default_context return c cdef bint pygpu_GpuArray_Check(object o): return isinstance(o, GpuArray) def count_platforms(kind): """ count_platforms(kind) Return number of host's platforms compatible with `kind`. """ cdef unsigned int platcount cdef int err err = gpu_get_platform_count(_s(kind), &platcount) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(NULL, err) return platcount def count_devices(kind, unsigned int platform): """ count_devices(kind, platform) Returns number of devices in host's `platform` compatible with `kind`. """ cdef unsigned int devcount cdef int err err = gpu_get_device_count(_s(kind), platform, &devcount) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(NULL, err) return devcount cdef GpuContext pygpu_init(dev, gpucontext_props *p): cdef int err cdef GpuContext res if dev.startswith('cuda'): kind = b"cuda" if dev[4:] == '': devnum = -1 else: devnum = int(dev[4:]) gpucontext_props_cuda_dev(p, devnum) elif dev.startswith('opencl'): kind = b"opencl" devspec = dev[6:].split(':') if len(devspec) < 2: raise ValueError, "OpenCL name incorrect. Should be opencl: instead got: " + dev if not devspec[0].isdigit() or not devspec[1].isdigit(): raise ValueError, "OpenCL name incorrect. 
Should be opencl: instead got: " + dev else: gpucontext_props_opencl_dev(p, int(devspec[0]), int(devspec[1])) else: raise ValueError, "Unknown device format:" + dev res = GpuContext.__new__(GpuContext) res.kind = kind err = gpucontext_init(&res.ctx, res.kind, p) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(NULL, err) return res def init(dev, sched='default', single_stream=False, kernel_cache_path=None, max_cache_size=sys.maxsize, initial_cache_size=0): """ init(dev, sched='default', single_stream=False, kernel_cache_path=None, max_cache_size=sys.maxsize, initial_cache_size=0) Creates a context from a device specifier. Device specifiers are composed of the type string and the device id like so:: "cuda0" "opencl0:1" For cuda the device id is the numeric identifier. You can see what devices are available by running nvidia-smi on the machine. Be aware that the ordering in nvidia-smi might not correspond to the ordering in this library. This is due to how cuda enumerates devices. If you don't specify a number (e.g. 'cuda') the first available device will be selected according to the backend order. For opencl the device id is the platform number, a colon (:) and the device number. There are no widespread and/or easy way to list available platforms and devices. You can experiement with the values, unavaiable ones will just raise an error, and there are no gaps in the valid numbers. Parameters ---------- dev: str device specifier sched: {'default', 'single', 'multi'} optimize scheduling for which type of operation disable_alloc_cache: bool disable allocation cache (if any) single_stream: bool enable single stream mode """ cdef gpucontext_props *p = NULL cdef int err cdef bytes kernel_cache_path_b err = gpucontext_props_new(&p) if err != GA_NO_ERROR: raise MemoryError try: if sched == 'single': err = gpucontext_props_sched(p, GA_CTX_SCHED_SINGLE) elif sched == 'multi': err = gpucontext_props_sched(p, GA_CTX_SCHED_MULTI) elif sched != 'default': raise TypeError('unexpected value for parameter sched: %s' % (sched,)) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(NULL, err) if kernel_cache_path: kernel_cache_path_b = _s(kernel_cache_path) gpucontext_props_kernel_cache(p, kernel_cache_path_b) err = gpucontext_props_alloc_cache(p, initial_cache_size, max_cache_size) if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(NULL, err) if single_stream: gpucontext_props_set_single_stream(p); except: gpucontext_props_del(p) raise return pygpu_init(dev, p) def zeros(shape, dtype=GA_DOUBLE, order='C', GpuContext context=None, cls=None): """ zeros(shape, dtype='float64', order='C', context=None, cls=None) Returns an array of zero-initialized values of the requested shape, type and order. 
Parameters ---------- shape: iterable of ints number of elements in each dimension dtype: str, numpy.dtype or int type of the elements order: {'A', 'C', 'F'} layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran context: GpuContext context in which to do the allocation cls: type class of the returned array (must inherit from GpuArray) """ res = empty(shape, dtype=dtype, order=order, context=context, cls=cls) array_memset(res, 0) return res cdef GpuArray pygpu_zeros(unsigned int nd, const size_t *dims, int typecode, ga_order order, GpuContext context, object cls): cdef GpuArray res res = pygpu_empty(nd, dims, typecode, order, context, cls) array_memset(res, 0) return res cdef GpuArray pygpu_empty(unsigned int nd, const size_t *dims, int typecode, ga_order order, GpuContext context, object cls): cdef GpuArray res context = ensure_context(context) res = new_GpuArray(cls, context, None) array_empty(res, context.ctx, typecode, nd, dims, order) return res cdef GpuArray pygpu_fromgpudata(gpudata *buf, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, GpuContext context, bint writable, object base, object cls): cdef GpuArray res res = new_GpuArray(cls, context, base) array_fromdata(res, buf, offset, typecode, nd, dims, strides, writable) return res cdef GpuArray pygpu_copy(GpuArray a, ga_order ord): cdef GpuArray res res = new_GpuArray(type(a), a.context, None) array_copy(res, a, ord) return res cdef int pygpu_move(GpuArray a, GpuArray src) except -1: array_move(a, src) return 0 def empty(shape, dtype=GA_DOUBLE, order='C', GpuContext context=None, cls=None): """ empty(shape, dtype='float64', order='C', context=None, cls=None) Returns an empty (uninitialized) array of the requested shape, type and order. Parameters ---------- shape: iterable of ints number of elements in each dimension dtype: str, numpy.dtype or int type of the elements order: {'A', 'C', 'F'} layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran context: GpuContext context in which to do the allocation cls: type class of the returned array (must inherit from GpuArray) """ cdef size_t *cdims cdef unsigned int nd try: nd = len(shape) except TypeError: nd = 1 shape = [shape] cdims = calloc(nd, sizeof(size_t)) if cdims == NULL: raise MemoryError, "could not allocate cdims" try: for i, d in enumerate(shape): cdims[i] = d return pygpu_empty(nd, cdims, dtype_to_typecode(dtype), to_ga_order(order), context, cls) finally: free(cdims) def asarray(a, dtype=None, order='A', GpuContext context=None): """ asarray(a, dtype=None, order='A', context=None) Returns a GpuArray from the data in `a` If `a` is already a GpuArray and all other parameters match, then the object itself returned. If `a` is an instance of a subclass of GpuArray then a view of the base class will be returned. Otherwise a new object is create and the data is copied into it. `context` is optional if `a` is a GpuArray (but must match exactly the context of `a` if specified) and is mandatory otherwise. Parameters ---------- a: array-like data dtype: str, numpy.dtype or int type of the elements order: {'A', 'C', 'F'} layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran context: GpuContext context in which to do the allocation """ return array(a, dtype=dtype, order=order, copy=False, context=context, cls=GpuArray) def ascontiguousarray(a, dtype=None, GpuContext context=None): """ ascontiguousarray(a, dtype=None, context=None) Returns a contiguous array in device memory (C order). 
`context` is optional if `a` is a GpuArray (but must match exactly the context of `a` if specified) and is mandatory otherwise. Parameters ---------- a: array-like input dtype: str, numpy.dtype or int type of the return array context: GpuContext context to use for a new array """ return array(a, order='C', dtype=dtype, ndmin=1, copy=False, context=context) def asfortranarray(a, dtype=None, GpuArray context=None): """ asfortranarray(a, dtype=None, context=None) Returns a contiguous array in device memory (Fortran order) `context` is optional if `a` is a GpuArray (but must match exactly the context of `a` if specified) and is mandatory otherwise. Parameters ---------- a: array-like input dtype: str, numpy.dtype or int type of the elements context: GpuContext context in which to do the allocation """ return array(a, order='F', dtype=dtype, ndmin=1, copy=False, context=context) def may_share_memory(GpuArray a not None, GpuArray b not None): """ may_share_memory(a, b) Returns True if `a` and `b` may share memory, False otherwise. """ return array_share(a, b) def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None, strides=None, writable=True, base=None, cls=None): """ from_gpudata(data, offset, dtype, shape, context=None, strides=None, writable=True, base=None, cls=None) Build a GpuArray from pre-allocated gpudata Parameters ---------- data: int pointer to a gpudata structure offset: int offset to the data location inside the gpudata dtype: numpy.dtype data type of the gpudata elements shape: iterable of ints shape to use for the result context: GpuContext context of the gpudata strides: iterable of ints strides for the results (C contiguous if not specified) writable: bool is the data writable? base: object base object that keeps gpudata alive cls: type view type of the result Notes ----- This function might be deprecated in a later relase since the only way to create gpudata pointers is through libgpuarray functions that aren't exposed at the python level. It can be used with the value of the `gpudata` attribute of an existing GpuArray. .. warning:: This function is intended for advanced use and will crash the interpreter if used improperly. """ cdef size_t *cdims = NULL cdef ssize_t *cstrides = NULL cdef unsigned int nd cdef size_t size cdef int typecode context = ensure_context(context) try: nd = len(shape) except TypeError: nd = 1 shape = [shape] if strides is not None and len(strides) != nd: raise ValueError, "strides must be the same length as shape" typecode = dtype_to_typecode(dtype) try: cdims = calloc(nd, sizeof(size_t)) cstrides = calloc(nd, sizeof(ssize_t)) if cdims == NULL or cstrides == NULL: raise MemoryError for i, d in enumerate(shape): cdims[i] = d if strides: for i, s in enumerate(strides): cstrides[i] = s else: size = gpuarray_get_elsize(typecode) for i in range(nd-1, -1, -1): cstrides[i] = size size *= cdims[i] return pygpu_fromgpudata(data, offset, typecode, nd, cdims, cstrides, context, writable, base, cls) finally: free(cdims) free(cstrides) def array(proto, dtype=None, copy=True, order=None, unsigned int ndmin=0, GpuContext context=None, cls=None): """ array(obj, dtype='float64', copy=True, order=None, ndmin=0, context=None, cls=None) Create a GpuArray from existing data This function creates a new GpuArray from the data provided in `obj` except if `obj` is already a GpuArray and all the parameters match its properties and `copy` is False. The properties of the resulting array depend on the input data except if overriden by other parameters. 
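For instance (a sketch; `ctx` is assumed to be a context created with :meth:`~pygpu.gpuarray.init`)::

    g = array([[1, 2], [3, 4]], dtype='float32', context=ctx)
    numpy.asarray(g)    # copies the result back to a host ndarray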
This function is similar to :meth:`numpy.array` except that it returns GpuArrays. Parameters ---------- obj: array-like data to initialize the result dtype: string or numpy.dtype or int data type of the result elements copy: bool return a copy? order: str memory layout of the result ndmin: int minimum number of result dimensions context: GpuContext allocation context cls: type result class (must inherit from GpuArray) """ return carray(proto, dtype, copy, order, ndmin, context, cls) cdef carray(proto, dtype, copy, order, unsigned int ndmin, GpuContext context, cls): cdef GpuArray res cdef GpuArray arg cdef GpuArray tmp cdef np.ndarray a if isinstance(proto, GpuArray): arg = proto if context is not None and context.ctx != array_context(arg): raise ValueError, "cannot copy an array to a different context" if (not copy and (dtype is None or dtype_to_typecode(dtype) == arg.typecode) and (order is None or order == 'A' or (order == 'C' and py_CHKFLAGS(arg, GA_C_CONTIGUOUS)) or (order == 'F' and py_CHKFLAGS(arg, GA_F_CONTIGUOUS)))): if arg.ga.nd < ndmin: shp = arg.shape idx = (1,)*(ndmin-len(shp)) shp = idx + shp arg = arg.reshape(shp) if not (cls is None or arg.__class__ is cls): arg = arg.view(cls) return arg shp = arg.shape if len(shp) < ndmin: idx = (1,)*(ndmin-len(shp)) shp = idx + shp if order is None or order == 'A': if py_CHKFLAGS(arg, GA_C_CONTIGUOUS): order = 'C' elif py_CHKFLAGS(arg, GA_F_CONTIGUOUS): order = 'F' if cls is None: cls = type(proto) res = empty(shp, dtype=(dtype or arg.dtype), order=order, cls=cls, context=arg.context) res.base = arg.base if len(shp) < ndmin: tmp = res[idx] else: tmp = res array_move(tmp, arg) return res context = ensure_context(context) # We need a contiguous array for the copy if order != 'C' and order != 'F': order = 'C' a = numpy.array(proto, dtype=dtype_to_npdtype(dtype), order=order, ndmin=ndmin, copy=False) res = pygpu_empty(np.PyArray_NDIM(a), np.PyArray_DIMS(a), dtype_to_typecode(a.dtype), to_ga_order(order), context, cls) array_write(res, np.PyArray_DATA(a), np.PyArray_NBYTES(a)) return res cdef void (*cuda_enter)(gpucontext *) cdef void (*cuda_exit)(gpucontext *) cuda_enter = gpuarray_get_extension("cuda_enter") cuda_exit = gpuarray_get_extension("cuda_exit") cdef class GpuContext: """ Class that holds all the information pertaining to a context. The currently implemented modules (for the `kind` parameter) are "cuda" and "opencl". Which are available depends on the build options for libgpuarray. The flag values are defined in the gpuarray/buffer.h header and are in the "Context flags" group. If you want to use more than one value you must bitwise OR them together. If you want an alternative interface check :meth:`~pygpu.gpuarray.init`. 
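A small sketch of the usual way to obtain and inspect a context (assumes a CUDA build; the property values are device dependent)::

    ctx = pygpu.gpuarray.init('cuda0')
    ctx.devname        # device name string
    ctx.free_gmem      # free global memory, in bytes
    with ctx:          # only cuda contexts support the context manager protocol
        pass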
Parameters ---------- kind: str module name for the context devno: int device number flags: int context flags """ def __dealloc__(self): if self.ctx != NULL: gpucontext_deref(self.ctx) def __reduce__(self): raise RuntimeError, "Cannot pickle GpuContext object" def __init__(self): if type(self) is GpuContext: raise RuntimeError, "Called raw GpuContext.__init__" def __enter__(self): if cuda_enter == NULL: raise RuntimeError("cuda_enter not available") if cuda_exit == NULL: raise RuntimeError("cuda_exit not available") if self.kind != b"cuda": raise ValueError("Context manager only works for cuda") cuda_enter(self.ctx) return self def __exit__(self, t, v, tb): cuda_exit(self.ctx) property ptr: "Raw pointer value for the context object" def __get__(self): return self.ctx property devname: "Device name for this context" def __get__(self): cdef char tmp[256] ctx_property(self, GA_CTX_PROP_DEVNAME, tmp) return tmp.decode('ascii') property unique_id: "Device PCI Bus ID for this context" def __get__(self): cdef char tmp[16] ctx_property(self, GA_CTX_PROP_UNIQUE_ID, tmp) return tmp.decode('ascii') property lmemsize: "Size of the local (shared) memory, in bytes, for this context" def __get__(self): cdef size_t res ctx_property(self, GA_CTX_PROP_LMEMSIZE, &res) return res property numprocs: "Number of compute units for this context" def __get__(self): cdef unsigned int res ctx_property(self, GA_CTX_PROP_NUMPROCS, &res) return res property bin_id: "Binary compatibility id" def __get__(self): cdef const char *res ctx_property(self, GA_CTX_PROP_BIN_ID, &res) return res; property total_gmem: "Total size of global memory on the device" def __get__(self): cdef size_t res ctx_property(self, GA_CTX_PROP_TOTAL_GMEM, &res) return res property free_gmem: "Size of free global memory on the device" def __get__(self): cdef size_t res ctx_property(self, GA_CTX_PROP_FREE_GMEM, &res) return res property maxlsize0: "Maximum local size for dimension 0" def __get__(self): cdef size_t res ctx_property(self, GA_CTX_PROP_MAXLSIZE0, &res) return res property maxlsize1: "Maximum local size for dimension 1" def __get__(self): cdef size_t res ctx_property(self, GA_CTX_PROP_MAXLSIZE1, &res) return res property maxlsize2: "Maximum local size for dimension 2" def __get__(self): cdef size_t res ctx_property(self, GA_CTX_PROP_MAXLSIZE2, &res) return res property maxgsize0: "Maximum global size for dimension 0" def __get__(self): cdef size_t res ctx_property(self, GA_CTX_PROP_MAXGSIZE0, &res) return res property maxgsize1: "Maximum global size for dimension 1" def __get__(self): cdef size_t res ctx_property(self, GA_CTX_PROP_MAXGSIZE1, &res) return res property maxgsize2: "Maximum global size for dimension 2" def __get__(self): cdef size_t res ctx_property(self, GA_CTX_PROP_MAXGSIZE2, &res) return res property largest_memblock: "Size of the largest memory block you can allocate" def __get__(self): cdef size_t res ctx_property(self, GA_CTX_PROP_LARGEST_MEMBLOCK, &res) return res cdef class flags(object): cdef int fl def __cinit__(self, fl): self.fl = fl def __reduce__(self): return (flags, (self.fl,)) def __getitem__(self, idx): cdef const char *key cdef size_t n cdef char c if isinstance(idx, unicode): idx = idx.encode('UTF-8') if isinstance(idx, bytes): key = idx n = len(idx) else: raise KeyError, "Unknown flag" if n == 1: c = key[0] if c == 'C': return self.c_contiguous elif c == 'F': return self.f_contiguous elif c == 'W': return self.writeable elif c == 'B': return self.behaved elif c == 'O': return self.owndata elif c == 'A': 
return self.aligned elif c == 'U': return self.updateifcopy elif n == 2: if strncmp(key, "CA", n) == 0: return self.carray if strncmp(key, "FA", n) == 0: return self.farray elif n == 3: if strncmp(key, "FNC", n) == 0: return self.fnc elif n == 4: if strncmp(key, "FORC", n) == 0: return self.forc elif n == 6: if strncmp(key, "CARRAY", n) == 0: return self.carray if strncmp(key, "FARRAY", n) == 0: return self.farray elif n == 7: if strncmp(key, "FORTRAN", n) == 0: return self.fortran if strncmp(key, "BEHAVED", n) == 0: return self.behaved if strncmp(key, "OWNDATA", n) == 0: return self.owndata if strncmp(key, "ALIGNED", n) == 0: return self.aligned elif n == 9: if strncmp(key, "WRITEABLE", n) == 0: return self.writeable elif n == 10: if strncmp(key, "CONTIGUOUS", n) == 0: return self.c_contiguous elif n == 12: if strncmp(key, "UPDATEIFCOPY", n) == 0: return self.updateifcopy if strncmp(key, "C_CONTIGUOUS", n) == 0: return self.c_contiguous if strncmp(key, "F_CONTIGUOUS", n) == 0: return self.f_contiguous raise KeyError, "Unknown flag" def __repr__(self): return '\n'.join(" %s : %s" % (name.upper(), getattr(self, name)) for name in ["c_contiguous", "f_contiguous", "owndata", "writeable", "aligned", "updateifcopy"]) def __richcmp__(self, other, int op): cdef flags a cdef flags b if not isinstance(self, flags) or not isinstance(other, flags): return NotImplemented a = self b = other if op == Py_EQ: return a.fl == b.fl elif op == Py_NE: return a.fl != b.fl raise TypeError, "undefined comparison for flag object" property c_contiguous: def __get__(self): return bool(self.fl & GA_C_CONTIGUOUS) property contiguous: def __get__(self): return self.c_contiguous property f_contiguous: def __get__(self): return bool(self.fl & GA_F_CONTIGUOUS) property fortran: def __get__(self): return self.f_contiguous property updateifcopy: # Not supported. def __get__(self): return False property owndata: # There is no equivalent for GpuArrays and it is always "True". def __get__(self): return True property aligned: def __get__(self): return bool(self.fl & GA_ALIGNED) property writeable: def __get__(self): return bool(self.fl & GA_WRITEABLE) property behaved: def __get__(self): return (self.fl & GA_BEHAVED) == GA_BEHAVED property carray: def __get__(self): return (self.fl & GA_CARRAY) == GA_CARRAY # Yes these are really defined like that according to numpy sources. # I don't know why. 
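    # A small illustration of typical flag queries (sketch only; `a` is assumed
    # to be a C-contiguous GpuArray):
    #     a.flags['C_CONTIGUOUS']   -> True
    #     a.flags.forc              -> True (either C- or F-contiguous)
    #     a.flags.owndata           -> always True, the data is refcounted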
property forc: def __get__(self): return ((self.fl & GA_F_CONTIGUOUS) == GA_F_CONTIGUOUS or (self.fl & GA_C_CONTIGUOUS) == GA_C_CONTIGUOUS) property fnc: def __get__(self): return ((self.fl & GA_F_CONTIGUOUS) == GA_F_CONTIGUOUS and not (self.fl & GA_C_CONTIGUOUS) == GA_C_CONTIGUOUS) property farray: def __get__(self): return ((self.fl & GA_FARRAY) != 0 and not ((self.fl & GA_C_CONTIGUOUS) != 0)) property num: def __get__(self): return self.fl cdef GpuArray new_GpuArray(object cls, GpuContext ctx, object base): cdef GpuArray res if ctx is None: raise RuntimeError, "ctx is None in new_GpuArray" if cls is None or cls is GpuArray: res = GpuArray.__new__(GpuArray) else: res = GpuArray.__new__(cls) res.base = base res.context = ctx return res cdef GpuArray pygpu_view(GpuArray a, object cls): cdef GpuArray res = new_GpuArray(cls, a.context, a.base) array_view(res, a) return res cdef int pygpu_sync(GpuArray a) except -1: array_sync(a) return 0 cdef GpuArray pygpu_empty_like(GpuArray a, ga_order ord, int typecode): cdef GpuArray res if ord == GA_ANY_ORDER: if (py_CHKFLAGS(a, GA_F_CONTIGUOUS) and not py_CHKFLAGS(a, GA_C_CONTIGUOUS)): ord = GA_F_ORDER else: ord = GA_C_ORDER if typecode == -1: typecode = a.ga.typecode res = new_GpuArray(type(a), a.context, None) array_empty(res, a.context.ctx, typecode, a.ga.nd, a.ga.dimensions, ord) return res cdef np.ndarray pygpu_as_ndarray(GpuArray a): return _pygpu_as_ndarray(a, None) cdef np.ndarray _pygpu_as_ndarray(GpuArray a, np.dtype ldtype): cdef np.ndarray res if not py_ISONESEGMENT(a): a = pygpu_copy(a, GA_ANY_ORDER) if ldtype is None: ldtype = a.dtype res = PyArray_Empty(a.ga.nd, a.ga.dimensions, ldtype, (py_CHKFLAGS(a, GA_F_CONTIGUOUS) and not py_CHKFLAGS(a, GA_C_CONTIGUOUS))) array_read(np.PyArray_DATA(res), np.PyArray_NBYTES(res), a) return res cdef GpuArray pygpu_index(GpuArray a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps): cdef GpuArray res res = new_GpuArray(type(a), a.context, a.base) try: array_index(res, a, starts, stops, steps) except ValueError, e: raise IndexError, "index out of bounds" return res cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims, ga_order ord, bint nocopy, int compute_axis): cdef GpuArray res res = new_GpuArray(type(a), a.context, a.base) if compute_axis < 0: array_reshape(res, a, nd, newdims, ord, nocopy) return res cdef unsigned int caxis = compute_axis if caxis >= nd: raise ValueError("compute_axis is out of bounds") cdef size_t *cdims cdef size_t tot = 1 cdef unsigned int i for i in range(nd): if i != caxis: tot *= newdims[i] cdims = calloc(nd, sizeof(size_t)) if cdims == NULL: raise MemoryError, "could not allocate cdims" cdef size_t d try: for i in range(nd): d = newdims[i] if i == caxis: d = a.size // tot if d * tot != a.size: raise GpuArrayException, "..." 
cdims[i] = d array_reshape(res, a, nd, cdims, ord, nocopy) return res finally: free(cdims) cdef GpuArray pygpu_transpose(GpuArray a, const unsigned int *newaxes): cdef GpuArray res res = new_GpuArray(type(a), a.context, a.base) array_transpose(res, a, newaxes) return res cdef int pygpu_transfer(GpuArray res, GpuArray a) except -1: array_transfer(res, a) return 0 def _split(GpuArray a, ind, unsigned int axis): """ _split(a, ind, axis) """ cdef list r = [None] * (len(ind) + 1) cdef Py_ssize_t i if not axis < a.ga.nd: raise ValueError, "split on non-existant axis" cdef size_t m = a.ga.dimensions[axis] cdef size_t v cdef size_t *p = PyMem_Malloc(sizeof(size_t) * len(ind)) if p == NULL: raise MemoryError() cdef _GpuArray **rs = <_GpuArray **>PyMem_Malloc(sizeof(_GpuArray *) * len(r)) if rs == NULL: PyMem_Free(p) raise MemoryError() try: for i in range(len(r)): r[i] = new_GpuArray(type(a), a.context, a.base) rs[i] = &(r[i]).ga for i in range(len(ind)): v = ind[i] # cap the values to the end of the array p[i] = v if v < m else m array_split(rs, a, len(ind), p, axis) return r finally: PyMem_Free(p) PyMem_Free(rs) cdef GpuArray pygpu_concatenate(const _GpuArray **a, size_t n, unsigned int axis, int restype, object cls, GpuContext context): cdef res = new_GpuArray(cls, context, None) array_concatenate(res, a, n, axis, restype) return res def _concatenate(list al, unsigned int axis, int restype, object cls, GpuContext context): """ _concatenate(al, axis, restype, cls, context) """ cdef Py_ssize_t i context = ensure_context(context) cdef const _GpuArray **als = PyMem_Malloc(sizeof(_GpuArray *) * len(al)) if als == NULL: raise MemoryError() try: for i in range(len(al)): if not isinstance(al[i], GpuArray): raise TypeError, "expected GpuArrays to concatenate" als[i] = &(al[i]).ga return pygpu_concatenate(als, len(al), axis, restype, cls, context) finally: PyMem_Free(als) cdef int (*cuda_get_ipc_handle)(gpudata *, GpuArrayIpcMemHandle *) cdef gpudata *(*cuda_open_ipc_handle)(gpucontext *, GpuArrayIpcMemHandle *, size_t) cuda_get_ipc_handle = gpuarray_get_extension("cuda_get_ipc_handle") cuda_open_ipc_handle = gpuarray_get_extension("cuda_open_ipc_handle") def open_ipc_handle(GpuContext c, bytes hpy, size_t l): """ open_ipc_handle(c, hpy, l) Open an IPC handle to get a new GpuArray from it. Parameters ---------- c: GpuContext context hpy: bytes binary handle data received l: int size of the referred memory block """ cdef char *b cdef GpuArrayIpcMemHandle h cdef gpudata *d b = hpy memcpy(&h, b, sizeof(h)) d = cuda_open_ipc_handle(c.ctx, &h, l) if d is NULL: raise GpuArrayException, gpucontext_error(c.ctx, 0) return d cdef class GpuArray: """ Device array To create instances of this class use :meth:`~pygpu.gpuarray.zeros`, :meth:`~pygpu.gpuarray.empty` or :meth:`~pygpu.gpuarray.array`. It cannot be instanciated directly. You can also subclass this class and make the module create your instances by passing the `cls` argument to any method that return a new GpuArray. This way of creating the class will NOT call your :meth:`__init__` method. You can also implement your own :meth:`__init__` method, but you must take care to ensure you properly initialized the GpuArray C fields before using it or you will most likely crash the interpreter. 
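A minimal sketch of the usual round trip (assuming `ctx` was created with :meth:`~pygpu.gpuarray.init`)::

    g = pygpu.gpuarray.zeros((2, 2), dtype='float32', context=ctx)
    v = g[:, 0]          # indexing returns a new GpuArray sharing the device data
    numpy.asarray(g)     # copies the data back to a host ndarray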
""" def __dealloc__(self): array_clear(self) def __cinit__(self): memset(&self.ga, 0, sizeof(_GpuArray)) def __init__(self): if type(self) is GpuArray: raise RuntimeError, "Called raw GpuArray.__init__" def __reduce__(self): raise RuntimeError, "Cannot pickle GpuArray object" cdef __index_helper(self, key, unsigned int i, ssize_t *start, ssize_t *stop, ssize_t *step): cdef Py_ssize_t dummy cdef Py_ssize_t k try: k = PyNumber_Index(key) if k < 0: k += self.ga.dimensions[i] if k < 0 or (k) >= self.ga.dimensions[i]: raise IndexError, "index %d out of bounds" % (i,) start[0] = k step[0] = 0 return except TypeError: pass if isinstance(key, slice): PySlice_GetIndicesEx(key, self.ga.dimensions[i], start, stop, step, &dummy) if stop[0] < start[0] and step[0] > 0: stop[0] = start[0] elif key is Ellipsis: start[0] = 0 stop[0] = self.ga.dimensions[i] step[0] = 1 else: raise IndexError, "cannot index with: %s" % (key,) def write(self, np.ndarray src not None): """ write(src) Writes host's Numpy array to device's GpuArray. This method is as fast as or even faster than :ref:asarray, because it skips possible allocation of a buffer in device's memory. It uses this already allocated GpuArray buffer to contain `src` array from host's memory. It is required though that the GpuArray and the Numpy array are compatible in byte size and data type. It is also needed for the GpuArray to be well behaved and contiguous. If `src` is not aligned or compatible in contiguity it will be copied to a new Numpy array in order to be. It is allowed for this GpuArray and `src` to have different shapes. Parameters ---------- src: numpy.ndarray source array in host Raises ------ ValueError If this GpuArray is not compatible with `src` or if it is not well behaved or contiguous. """ if not self.flags.behaved: raise ValueError, "Destination GpuArray is not well behaved: aligned and writeable" if self.flags.c_contiguous: src = np.asarray(src, order='C') elif self.flags.f_contiguous: src = np.asarray(src, order='F') else: raise ValueError, "Destination GpuArray is not contiguous" if self.dtype != src.dtype: raise ValueError, "GpuArray and Numpy array do not have matching data types" cdef size_t npsz = np.PyArray_NBYTES(src) cdef size_t sz = gpuarray_get_elsize(self.ga.typecode) cdef unsigned i for i in range(self.ga.nd): sz *= self.ga.dimensions[i] if sz != npsz: raise ValueError, "GpuArray and Numpy array do not have the same size in bytes" array_write(self, np.PyArray_DATA(src), sz) def read(self, np.ndarray dst not None): """ read(dst) Reads from this GpuArray into host's Numpy array. This method is as fast as or even faster than :ref:__array__ method and thus :ref:numpy.asarray. This is because it skips allocation of a new buffer in host's memory to contain device's GpuArray. It uses an existing Numpy ndarray as a buffer to get the GpuArray. It is required though that the GpuArray and the Numpy array to be compatible in byte size, contiguity and data type. It is also needed for `dst` to be writeable and properly aligned in host's memory and for `self` to be contiguous. It is allowed for this GpuArray and `dst` to have different shapes. Parameters ---------- dst: numpy.ndarray destination array in host Raises ------ ValueError If this GpuArray is not compatible with `src` or if `dst` is not well behaved. 
""" if not np.PyArray_ISBEHAVED(dst): raise ValueError, "Destination Numpy array is not well behaved: aligned and writeable" if (not ((self.flags.c_contiguous and self.flags.aligned and dst.flags['C_CONTIGUOUS']) or (self.flags.f_contiguous and self.flags.aligned and dst.flags['F_CONTIGUOUS']))): raise ValueError, "GpuArray and Numpy array do not match in contiguity or GpuArray is not aligned" if self.dtype != dst.dtype: raise ValueError, "GpuArray and Numpy array do not have matching data types" cdef size_t npsz = np.PyArray_NBYTES(dst) cdef size_t sz = gpuarray_get_elsize(self.ga.typecode) cdef unsigned i for i in range(self.ga.nd): sz *= self.ga.dimensions[i] if sz != npsz: raise ValueError, "GpuArray and Numpy array do not have the same size in bytes" array_read(np.PyArray_DATA(dst), sz, self) def get_ipc_handle(self): """ get_ipc_handle() """ cdef GpuArrayIpcMemHandle h cdef int err if cuda_get_ipc_handle is NULL: raise SystemError, "Could not get necessary extension" if self.context.kind != b'cuda': raise ValueError, "Only works for cuda contexts" err = cuda_get_ipc_handle(self.ga.data, &h) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&self.ga, err) res = (&h)[:sizeof(h)] return res def __array__(self, ldtype=None): """ __array__(ldtype=None) Return a :class:`numpy.ndarray` with the same content. Automatically used by :meth:`numpy.asarray`. """ return _pygpu_as_ndarray(self, ldtype) def __bool__(self): """ __bool__() """ if self.size == 0: return False elif self.size == 1: return bool(numpy.asarray(self)) else: raise ValueError('The truth value of a multi-element array is ambiguous') def _empty_like_me(self, dtype=None, order='C'): """ _empty_like_me(dtype=None, order='C') Returns an empty (uninitialized) GpuArray with the same properties except if overridden by parameters. """ cdef int typecode cdef GpuArray res if dtype is None: typecode = -1 else: typecode = dtype_to_typecode(dtype) return pygpu_empty_like(self, to_ga_order(order), typecode) def copy(self, order='C'): """ copy(order='C') Return a copy if this array. Parameters ---------- order: {'C', 'A', 'F'} memory layout of the copy """ return pygpu_copy(self, to_ga_order(order)) def transfer(self, GpuContext new_ctx): """ transfer(new_ctx) """ cdef GpuArray r if not GpuArray_ISONESEGMENT(&self.ga): # For now raise an error, may make it work later raise ValueError("transfer() only works for contigous source") r = pygpu_empty(self.ga.nd, self.ga.dimensions, self.ga.typecode, GA_C_ORDER if GpuArray_IS_C_CONTIGUOUS(&self.ga) else GA_F_ORDER, new_ctx, None) pygpu_transfer(r, self) # Will raise an error if needed return r def __copy__(self): return pygpu_copy(self, GA_C_ORDER) def __deepcopy__(self, memo): if id(self) in memo: return memo[id(self)] else: return pygpu_copy(self, GA_C_ORDER) def sync(self): """ sync() Wait for all pending operations on this array. This is done automatically when reading or writing from it, but can be useful as a separate operation for timings. """ pygpu_sync(self) def view(self, object cls=GpuArray): """ view(cls=GpuArray) Return a view of this array. The returned array shares device data with this one and both will reflect changes made to the other. Parameters ---------- cls: type class of the view (must inherit from GpuArray) """ return pygpu_view(self, cls) def astype(self, dtype, order='A', copy=True): """ astype(dtype, order='A', copy=True) Cast the elements of this array to a new type. 
This function returns a new array will all elements cast to the supplied `dtype`, but otherwise unchanged. If `copy` is False and the type and order match `self` is returned. Parameters ---------- dtype: str or numpy.dtype or int type of the elements of the result order: {'A', 'C', 'F'} memory layout of the result copy: bool Always return a copy? """ cdef GpuArray res cdef int typecode = dtype_to_typecode(dtype) cdef ga_order ord = to_ga_order(order) if (not copy and typecode == self.ga.typecode and ((py_CHKFLAGS(self, GA_F_CONTIGUOUS) and ord == GA_F_ORDER) or (py_CHKFLAGS(self, GA_C_CONTIGUOUS) and ord == GA_C_ORDER))): return self res = self._empty_like_me(dtype=typecode, order=order) array_move(res, self) return res def reshape(self, shape, order='C'): """ reshape(shape, order='C') Returns a new array with the given shape and order. The new shape must have the same size (total number of elements) as the current one. """ cdef size_t *newdims cdef unsigned int nd cdef unsigned int i cdef int compute_axis try: nd = len(shape) except TypeError: nd = 1 shape = [shape] newdims = calloc(nd, sizeof(size_t)) if newdims == NULL: raise MemoryError, "calloc" compute_axis = -1 try: for i in range(nd): if shape[i] == -1: assert compute_axis == -1 compute_axis = i newdims[i] = 1 else: newdims[i] = shape[i] return pygpu_reshape(self, nd, newdims, to_ga_order(order), 0, compute_axis) finally: free(newdims) def transpose(self, *params): """ transpose(*params) """ cdef unsigned int *new_axes cdef unsigned int i if len(params) is 1 and isinstance(params[0], (tuple, list)): params = params[0] if params is () or params == (None,): return pygpu_transpose(self, NULL) else: if len(params) != self.ga.nd: raise ValueError("axes don't match: " + str(params)) new_axes = calloc(self.ga.nd, sizeof(unsigned int)) try: for i in range(self.ga.nd): new_axes[i] = params[i] return pygpu_transpose(self, new_axes) finally: free(new_axes) def __len__(self): if self.ga.nd > 0: return self.ga.dimensions[0] else: raise TypeError, "len() of unsized object" def __getitem__(self, key): cdef unsigned int i if key is Ellipsis: return self.__cgetitem__(key) # A list or a sequence of list should trigger "fancy" indexing. # This is not implemented yet. # Conversely, if a list contains slice or Ellipsis objects, it behaves # the same as a tuple. if isinstance(key, list): if any(isinstance(k, slice) or k is Ellipsis for k in key): return self.__getitem__(tuple(key)) else: raise NotImplementedError, "fancy indexing not supported" try: iter(key) except TypeError: key = (key,) else: if all(isinstance(k, list) for k in key): raise NotImplementedError, "fancy indexing not supported" key = tuple(key) # Need to massage Ellipsis here, to avoid packing it into a tuple. if countis(key, Ellipsis) > 1: raise IndexError, "cannot use more than one Ellipsis" # The following code replaces an Ellipsis found in the key by # the corresponding number of slice(None) objects, depending on the # number of dimensions. As example, this allows indexing on the last # dimension with a[..., 1:] on any array (including 1-dim). This # is also required for numpy compat. 
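        # For example (numpy-compatible behaviour implemented by the code below;
        # shapes shown only as an illustration):
        #     a[..., 1:]   slices the last axis of an array of any rank
        #     a[None]      adds a leading axis of length 1 via the reshape at the end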
try: ell_idx = key.index(Ellipsis) except ValueError: pass else: # Need number of axes minus missing dimensions extra slice(None) # objects, not counting None entries and the Ellipsis itself num_slcs = self.ga.nd - (len(key) - countis(key, None) - 1) fill_slices = (slice(None),) * num_slcs key = key[:ell_idx] + fill_slices + key[ell_idx + 1:] # Remove the None entries for indexing getitem_idcs = tuple(k for k in key if k is not None) # For less than 1 index, fill up with slice(None) to the right. # This allows indexing a[1:] in multi-dimensional arrays, where the # slice is applied along the first axis only. It also allows # a[()], which simply is a view in Numpy. if len(getitem_idcs) <= 1: getitem_idcs = (getitem_idcs + (slice(None),) * (self.ga.nd - len(getitem_idcs))) # Slice into array, then reshape, accommodating for None entries in key sliced = self.__cgetitem__(getitem_idcs) if countis(key, None) == 0: # Avoid unnecessary reshaping if there was no None return sliced else: new_shape = [] i = 0 if sliced.shape: for k in key: if isinstance(k, int): continue elif k is None: new_shape.append(1) else: new_shape.append(sliced.shape[i]) i += 1 # Add remaining entries from sliced.shape if existing (happens # for 1 index or less if ndim >= 2). new_shape.extend(sliced.shape[i:]) return sliced.reshape(new_shape) cdef __cgetitem__(self, key): cdef ssize_t *starts cdef ssize_t *stops cdef ssize_t *steps cdef unsigned int i cdef unsigned int d cdef unsigned int el if key is Ellipsis: return pygpu_view(self, None) elif self.ga.nd == 0: if isinstance(key, tuple) and len(key) == 0: return self else: raise IndexError, "0-d arrays can't be indexed" starts = calloc(self.ga.nd, sizeof(ssize_t)) stops = calloc(self.ga.nd, sizeof(ssize_t)) steps = calloc(self.ga.nd, sizeof(ssize_t)) try: if starts == NULL or stops == NULL or steps == NULL: raise MemoryError d = 0 if isinstance(key, (tuple, list)): if Ellipsis in key: # The following code replaces the first Ellipsis # found in the key by a bunch of them depending on # the number of dimensions. As example, this # allows indexing on the last dimension with # a[..., 1:] on any array (including 1-dim). This # is also required for numpy compat. 
el = key.index(Ellipsis) if isinstance(key, tuple): key = (key[:el] + (Ellipsis,)*(self.ga.nd - (len(key) - 1)) + key[el+1:]) else: key = (key[:el] + [Ellipsis,]*(self.ga.nd - (len(key) - 1)) + key[el+1:]) if len(key) > self.ga.nd: raise IndexError, "too many indices" for i in range(0, len(key)): self.__index_helper(key[i], i, &starts[i], &stops[i], &steps[i]) d += len(key) else: self.__index_helper(key, 0, starts, stops, steps) d += 1 for i in range(d, self.ga.nd): starts[i] = 0 stops[i] = self.ga.dimensions[i] steps[i] = 1 return pygpu_index(self, starts, stops, steps) finally: free(starts) free(stops) free(steps) def __setitem__(self, idx, v): cdef GpuArray tmp, gv if isinstance(idx, list): if any(isinstance(i, slice) or i is Ellipsis for i in idx): self.__setitem__(tuple(idx), v) else: raise NotImplementedError, "fancy indexing not supported" try: iter(idx) except TypeError: idx = (idx,) else: if all(isinstance(i, list) for i in idx): raise NotImplementedError, "fancy indexing not supported" idx = tuple(idx) if countis(idx, Ellipsis) > 1: raise IndexError, "cannot use more than one Ellipsis" # Remove None entries, they should be ignored (as in Numpy) idx = tuple(i for i in idx if i is not None) tmp = self.__cgetitem__(idx) gv = carray(v, self.ga.typecode, False, 'A', 0, self.context, GpuArray) array_setarray(tmp, gv) def take1(self, GpuArray idx): """ take1(idx) """ cdef GpuArray res cdef size_t odim if idx.ga.nd != 1: raise ValueError, "Expected index with nd=1" odim = self.ga.dimensions[0] try: self.ga.dimensions[0] = idx.ga.dimensions[0] res = pygpu_empty_like(self, GA_C_ORDER, -1) finally: self.ga.dimensions[0] = odim array_take1(res, self, idx, 1) return res def __hash__(self): raise TypeError, "unhashable type '%s'" % (self.__class__,) def __nonzero__(self): cdef int sz = self.size if sz == 0: return False if sz == 1: return bool(numpy.asarray(self)) else: raise ValueError, "Truth value of array with more than one element is ambiguous" property shape: "shape of this ndarray (tuple)" def __get__(self): cdef unsigned int i res = [None] * self.ga.nd for i in range(self.ga.nd): res[i] = self.ga.dimensions[i] return tuple(res) def __set__(self, newshape): # We support -1 only in a call to reshape cdef size_t *newdims cdef unsigned int nd cdef unsigned int i cdef int err nd = len(newshape) newdims = calloc(nd, sizeof(size_t)) if newdims == NULL: raise MemoryError, "calloc" try: for i in range(nd): newdims[i] = newshape[i] err = GpuArray_reshape_inplace(&self.ga, nd, newdims, GA_C_ORDER) if err != GA_NO_ERROR: raise get_exc(err), GpuArray_error(&self.ga, err) finally: free(newdims) property T: def __get__(self): return pygpu_transpose(self, NULL) property size: "The number of elements in this object." 
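        # Quick illustration (sketch only): for a C-contiguous (2, 3) float32 array,
        # size == 6, ndim == 2, itemsize == 4 and strides == (12, 4).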
def __get__(self): cdef size_t res = 1 cdef unsigned int i for i in range(self.ga.nd): res *= self.ga.dimensions[i] return res property strides: "data pointer strides (in bytes)" def __get__(self): cdef unsigned int i res = [None] * self.ga.nd for i in range(self.ga.nd): res[i] = self.ga.strides[i] return tuple(res) def __set__(self, newstrides): cdef unsigned int i if len(newstrides) != self.ga.nd: raise ValueError("new strides are the wrong length") if not strides_ok(self, newstrides): raise ValueError("new strides go outside of allocated memory") for i in range(self.ga.nd): self.ga.strides[i] = newstrides[i] array_fix_flags(self) property ndim: "The number of dimensions in this object" def __get__(self): return self.ga.nd property dtype: "The dtype of the element" def __get__(self): return typecode_to_dtype(self.ga.typecode) property typecode: "The gpuarray typecode for the data type of the array" def __get__(self): return self.ga.typecode property itemsize: "The size of the base element." def __get__(self): return gpuarray_get_elsize(self.ga.typecode) property flags: """Return a flags object describing the properties of this array. This is mostly numpy-compatible with some exceptions: * Flags are always constant (numpy allows modification of certain flags in certain cicumstances). * OWNDATA is always True, since the data is refcounted in libgpuarray. * UPDATEIFCOPY is not supported, therefore always False. """ def __get__(self): return flags(self.ga.flags) property offset: "Return the offset into the gpudata pointer for this array." def __get__(self): return self.ga.offset property data: """Return a pointer to the raw OpenCL buffer object. This will fail for arrays that have an offset. """ def __get__(self): if self.context.kind != b"opencl": raise TypeError("This is for OpenCL arrays.") if self.offset != 0: raise ValueError("This array has an offset.") # This wizadry grabs the actual backend pointer since it's # guarenteed to be the first element of the gpudata # structure. return ((self.ga.data)[0]) property base_data: "Return a pointer to the backing OpenCL object." def __get__(self): if self.context.kind != b"opencl": raise TypeError("This is for OpenCL arrays.") # This wizadry grabs the actual backend pointer since it's # guarenteed to be the first element of the gpudata # structure. return ((self.ga.data)[0]) property gpudata: "Return a pointer to the raw backend object." def __get__(self): if self.context.kind != b"cuda": raise TypeError("This is for CUDA arrays.") # This wizadry grabs the actual backend pointer since it's # guarenteed to be the first element of the gpudata # structure. return ((self.ga.data)[0]) + self.offset def __str__(self): return str(numpy.asarray(self)) def __repr__(self): try: return 'gpuarray.' + repr(numpy.asarray(self)) except Exception: return 'gpuarray.array()' cdef class GpuKernel: """ GpuKernel(source, name, types, context=None, have_double=False, have_small=False, have_complex=False, have_half=False, cuda=False, opencl=False) Compile a kernel on the device The kernel function is retrieved using the provided `name` which must match what you named your kernel in `source`. You can safely reuse the same name multiple times. The `have_*` parameter are there to tell libgpuarray that we need the particular type or feature to work for this kernel. If the request can't be satified a :class:`.UnsupportedException` will be raised in the constructor. Once you have the kernel object you can simply call it like so:: k = GpuKernel(...) 
k(param1, param2, n=n) where `n` is the minimum number of threads to run. libgpuarray will try to stay close to this number but may run a few more threads to match the hardware preferred multiple and stay efficient. You should watch out for this in your code and make sure to test against the size of your data. If you want more control over thread allocation you can use the `gs` and `ls` parameters like so:: k = GpuKernel(...) k(param1, param2, gs=gs, ls=ls) If you choose to use this interface, make sure to stay within the limits of `k.maxlsize` or the call will fail. Parameters ---------- source: str complete kernel source code name: str function name of the kernel types: list or tuple list of argument types context: GpuContext device on which the kernel is compiled have_double: bool ensure working doubles? have_small: bool ensure types smaller than float will work? have_complex: bool ensure complex types will work? have_half: bool ensure half-floats will work? cuda: bool kernel is cuda code? opencl: bool kernel is opencl code? Notes ----- With the cuda backend, unless you use the cluda include, you must either pass the mangled name of your kernel or declare the function 'extern "C"', because cuda uses a C++ compiler unconditionally. .. warning:: If you do not set the `have_` flags properly, you will either get a device-specific error (the good case) or silent completly bogus data (the bad case). """ def __dealloc__(self): cdef unsigned int numargs cdef int *types cdef unsigned int i cdef int res # We need to do all of this at the C level to avoid touching # python stuff that could be gone and to avoid exceptions if self.k.k is not NULL: res = gpukernel_property(self.k.k, GA_KERNEL_PROP_NUMARGS, &numargs) if res != GA_NO_ERROR: return res = gpukernel_property(self.k.k, GA_KERNEL_PROP_TYPES, &types) if res != GA_NO_ERROR: return for i in range(numargs): if types[i] != GA_BUFFER: free(self.callbuf[i]) kernel_clear(self) free(self.callbuf) def __reduce__(self): raise RuntimeError, "Cannot pickle GpuKernel object" def __cinit__(self, source, name, types, GpuContext context=None, have_double=False, have_small=False, have_complex=False, have_half=False, cuda=False, opencl=False, *a, **kwa): cdef const char *s[1] cdef size_t l cdef unsigned int numargs cdef unsigned int i cdef int *_types cdef int flags = 0 source = _s(source) name = _s(name) self.context = ensure_context(context) if have_double: flags |= GA_USE_DOUBLE if have_small: flags |= GA_USE_SMALL if have_complex: flags |= GA_USE_COMPLEX if have_half: flags |= GA_USE_HALF if cuda: flags |= GA_USE_CUDA if opencl: flags |= GA_USE_OPENCL s[0] = source l = len(source) numargs = len(types) self.callbuf = calloc(len(types), sizeof(void *)) if self.callbuf == NULL: raise MemoryError _types = calloc(numargs, sizeof(int)) if _types == NULL: raise MemoryError try: for i in range(numargs): if (types[i] == GpuArray): _types[i] = GA_BUFFER else: _types[i] = dtype_to_typecode(types[i]) self.callbuf[i] = malloc(gpuarray_get_elsize(_types[i])) if self.callbuf[i] == NULL: raise MemoryError kernel_init(self, self.context.ctx, 1, s, &l, name, numargs, _types, flags) finally: free(_types) def __call__(self, *args, n=None, gs=None, ls=None, shared=0): """ __call__(*args, n=None, gs=None, ls=None, shared=0) """ if n is None and (ls is None or gs is None): raise ValueError, "Must specify size (n) or both gs and ls" self.do_call(n, gs, ls, args, shared) cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared): cdef size_t n cdef size_t gs[3] cdef size_t 
ls[3] cdef size_t tmp cdef unsigned int nd cdef const int *types cdef unsigned int numargs cdef unsigned int i nd = 0 if py_ls is None: ls[0] = 0 nd = 1 else: if isinstance(py_ls, int): ls[0] = py_ls nd = 1 elif isinstance(py_ls, (list, tuple)): if len(py_ls) > 3: raise ValueError, "ls is not of length 3 or less" nd = len(py_ls) if nd >= 3: ls[2] = py_ls[2] if nd >= 2: ls[1] = py_ls[1] if nd >= 1: ls[0] = py_ls[0] else: raise TypeError, "ls is not int or list" if py_gs is None: if nd != 1: raise ValueError, "nd mismatch for gs (None)" gs[0] = 0 else: if isinstance(py_gs, int): if nd != 1: raise ValueError, "nd mismatch for gs (int)" gs[0] = py_gs elif isinstance(py_gs, (list, tuple)): if len(py_gs) > 3: raise ValueError, "gs is not of length 3 or less" if len(py_ls) != nd: raise ValueError, "nd mismatch for gs (tuple)" if nd >= 3: gs[2] = py_gs[2] if nd >= 2: gs[1] = py_gs[1] if nd >= 1: gs[0] = py_gs[0] else: raise TypeError, "gs is not int or list" numargs = self.numargs if len(py_args) != numargs: raise TypeError, "Expected %d arguments, got %d," % (numargs, len(py_args)) kernel_property(self, GA_KERNEL_PROP_TYPES, &types) for i in range(numargs): self._setarg(i, types[i], py_args[i]) if py_n is not None: if nd != 1: raise ValueError, "n is specified and nd != 1" n = py_n kernel_sched(self, n, &gs[0], &ls[0]) kernel_call(self, nd, gs, ls, shared, self.callbuf) cdef _setarg(self, unsigned int index, int typecode, object o): if typecode == GA_BUFFER: if not isinstance(o, GpuArray): raise TypeError, "expected a GpuArray" self.callbuf[index] = ((o).ga.data) elif typecode == GA_SIZE: (self.callbuf[index])[0] = o elif typecode == GA_SSIZE: (self.callbuf[index])[0] = o elif typecode == GA_FLOAT: (self.callbuf[index])[0] = o elif typecode == GA_DOUBLE: (self.callbuf[index])[0] = o elif typecode == GA_BYTE: (self.callbuf[index])[0] = o elif typecode == GA_UBYTE: (self.callbuf[index])[0] = o elif typecode == GA_SHORT: (self.callbuf[index])[0] = o elif typecode == GA_USHORT: (self.callbuf[index])[0] = o elif typecode == GA_INT: (self.callbuf[index])[0] = o elif typecode == GA_UINT: (self.callbuf[index])[0] = o elif typecode == GA_LONG: (self.callbuf[index])[0] = o elif typecode == GA_ULONG: (self.callbuf[index])[0] = o else: raise ValueError("Bad typecode in _setarg: %d " "(please report this, it is a bug)" % (typecode,)) property maxlsize: "Maximum local size for this kernel" def __get__(self): cdef size_t res kernel_property(self, GA_KERNEL_PROP_MAXLSIZE, &res) return res property preflsize: "Preferred multiple for local size for this kernel" def __get__(self): cdef size_t res kernel_property(self, GA_KERNEL_PROP_PREFLSIZE, &res) return res property numargs: "Number of arguments to kernel" def __get__(self): cdef unsigned int res kernel_property(self, GA_KERNEL_PROP_NUMARGS, &res) return res libgpuarray-0.7.6/pygpu/numpy_compat.h000066400000000000000000000014501326743622600201100ustar00rootroot00000000000000/* * But it allow faster conversion to this new library of existing code */ #ifndef GPUARRAY_NUMPY_COMPAT #define GPUARRAY_NUMPY_COMPAT static int PyGpuArray_NDIM(const PyGpuArrayObject *arr) { return arr->ga.nd; } static const size_t *PyGpuArray_DIMS(const PyGpuArrayObject *arr) { return arr->ga.dimensions; } static const ssize_t *PyGpuArray_STRIDES(const PyGpuArrayObject* arr) { return arr->ga.strides; } static size_t PyGpuArray_DIM(const PyGpuArrayObject* arr, int n) { return arr->ga.dimensions[n]; } static ssize_t PyGpuArray_STRIDE(const PyGpuArrayObject* arr, int n) { return 
arr->ga.strides[n]; } static size_t PyGpuArray_SIZE(const PyGpuArrayObject* arr) { size_t size = 1; for(int i=0; i< arr->ga.nd; i++) { size *= arr->ga.dimensions[i]; } return size; } #endif libgpuarray-0.7.6/pygpu/operations.py000066400000000000000000000077461326743622600177770ustar00rootroot00000000000000from six.moves import range from .gpuarray import _split, _concatenate, dtype_to_typecode from .dtypes import upcast from . import asarray def atleast_1d(*arys): res = [] for ary in arys: ary = asarray(ary) if len(ary.shape) == 0: result = ary.reshape((1,)) else: result = ary res.append(result) if len(res) == 1: return res[0] else: return res def atleast_2d(*arys): res = [] for ary in arys: ary = asarray(ary) if len(ary.shape) == 0: result = ary.reshape((1, 1)) elif len(ary.shape) == 1: result = ary.reshape((1, ary.shape[0])) else: result = ary res.append(result) if len(res) == 1: return res[0] else: return res def atleast_3d(*arys): res = [] for ary in arys: ary = asarray(ary) if len(ary.shape) == 0: result = ary.reshape((1, 1, 1)) elif len(ary.shape) == 1: result = ary.reshape((1, ary.shape[0], 1)) elif len(ary.shape) == 2: result = ary.reshape(ary.shape + (1,)) else: result = ary res.append(result) if len(res) == 1: return res[0] else: return res def split(ary, indices_or_sections, axis=0): try: len(indices_or_sections) except TypeError: if ary.shape[axis] % indices_or_sections != 0: raise ValueError("array split does not result in an " "equal division") return array_split(ary, indices_or_sections, axis) def array_split(ary, indices_or_sections, axis=0): try: indices = list(indices_or_sections) res = _split(ary, indices, axis) except TypeError: if axis < 0: axis += ary.ndim if axis < 0: raise ValueError('axis out of bounds') nsec = int(indices_or_sections) if nsec <= 0: raise ValueError('number of sections must be larger than 0.') neach, extra = divmod(ary.shape[axis], nsec) # this madness is to support the numpy interface # it is supported by tests, but little else divs = (list(range(neach + 1, (neach + 1) * extra + 1, neach + 1)) + list(range((neach + 1) * extra + neach, ary.shape[axis], neach))) res = _split(ary, divs, axis) return res def hsplit(ary, indices_or_sections): if len(ary.shape) == 0: raise ValueError('hsplit only works on arrays of 1 or more dimensions') if len(ary.shape) > 1: axis = 1 else: axis = 0 return split(ary, indices_or_sections, axis=axis) def vsplit(ary, indices_or_sections): if len(ary.shape) < 2: raise ValueError('vsplit only works on arrays of 2 or more dimensions') return split(ary, indices_or_sections, axis=0) def dsplit(ary, indices_or_sections): if len(ary.shape) < 3: raise ValueError('vsplit only works on arrays of 3 or more dimensions') return split(ary, indices_or_sections, axis=2) def concatenate(arys, axis=0, context=None): if len(arys) == 0: raise ValueError("concatenation of zero-length sequences is " "impossible") if axis < 0: axis += arys[0].ndim if axis < 0: raise ValueError('axis out of bounds') al = [asarray(a, context=context) for a in arys] if context is None: context = al[0].context outtype = upcast(*[a.dtype for a in arys]) return _concatenate(al, axis, dtype_to_typecode(outtype), type(al[0]), context) def vstack(tup, context=None): return concatenate([atleast_2d(a) for a in tup], 0, context) def hstack(tup, context=None): tup = [atleast_1d(a) for a in tup] if tup[0].ndim == 1: return concatenate(tup, 0, context) else: return concatenate(tup, 1, context) def dstack(tup, context=None): return concatenate([atleast_3d(a) for a in tup], 2, 
context) libgpuarray-0.7.6/pygpu/reduction.py000066400000000000000000000240541326743622600175770ustar00rootroot00000000000000import math import re from mako.template import Template import numpy from . import gpuarray from .tools import ScalarArg, ArrayArg, check_args, prod, lru_cache from .dtypes import parse_c_arg_backend def parse_c_args(arguments): return tuple(parse_c_arg_backend(arg, ScalarArg, ArrayArg) for arg in arguments.split(',')) INDEX_RE = re.compile('([a-zA-Z_][a-zA-Z0-9_]*)\[i\]') def massage_op(operation): return INDEX_RE.sub('\g<1>[0]', operation) def _ceil_log2(x): # nearest power of 2 (going up) if x != 0: return int(math.ceil(math.log(x, 2))) else: return 0 basic_kernel = Template(""" #include "cluda.h" ${preamble} #define REDUCE(a, b) (${reduce_expr}) KERNEL void ${name}(const unsigned int n, ${out_arg.decltype()} out, const unsigned int out_off % for d in range(nd): , const unsigned int dim${d} % endfor % for arg in arguments: % if arg.isarray(): , ${arg.decltype()} ${arg.name}_data , const unsigned int ${arg.name}_offset % for d in range(nd): , const int ${arg.name}_str_${d} % endfor % else: , ${arg.decltype()} ${arg.name} % endif % endfor ) { LOCAL_MEM ${out_arg.ctype()} ldata[${local_size}]; const unsigned int lid = LID_0; unsigned int i; GLOBAL_MEM char *tmp; % for arg in arguments: % if arg.isarray(): tmp = (GLOBAL_MEM char *)${arg.name}_data; tmp += ${arg.name}_offset; ${arg.name}_data = (${arg.decltype()})tmp; % endif % endfor tmp = (GLOBAL_MEM char *)out; tmp += out_off; out = (${out_arg.decltype()})tmp; i = GID_0; % for i in range(nd-1, -1, -1): % if not redux[i]: % if i > 0: const unsigned int pos${i} = i % dim${i}; i = i / dim${i}; % else: const unsigned int pos${i} = i; % endif % endif % endfor ${out_arg.ctype()} acc = ${neutral}; for (i = lid; i < n; i += LDIM_0) { int ii = i; int pos; % for arg in arguments: % if arg.isarray(): GLOBAL_MEM char *${arg.name}_p = (GLOBAL_MEM char *)${arg.name}_data; % endif % endfor % for i in range(nd-1, -1, -1): % if redux[i]: % if i > 0: pos = ii % dim${i}; ii = ii / dim${i}; % else: pos = ii; % endif % for arg in arguments: % if arg.isarray(): ${arg.name}_p += pos * ${arg.name}_str_${i}; % endif % endfor % else: % for arg in arguments: % if arg.isarray(): ${arg.name}_p += pos${i} * ${arg.name}_str_${i}; % endif % endfor % endif % endfor % for arg in arguments: % if arg.isarray(): ${arg.decltype()} ${arg.name} = (${arg.decltype()})${arg.name}_p; % endif % endfor acc = REDUCE((acc), (${map_expr})); } ldata[lid] = acc; <% cur_size = local_size %> % while cur_size > 1: <% cur_size = cur_size // 2 %> local_barrier(); if (lid < ${cur_size}) { ldata[lid] = REDUCE(ldata[lid], ldata[lid+${cur_size}]); } % endwhile local_barrier(); if (lid == 0) out[GID_0] = ldata[0]; } """) class ReductionKernel(object): def __init__(self, context, dtype_out, neutral, reduce_expr, redux, map_expr=None, arguments=None, preamble="", init_nd=None): self.context = context self.neutral = neutral self.redux = tuple(redux) if not any(self.redux): raise ValueError("Reduction is along no axes") self.dtype_out = dtype_out self.out_arg = ArrayArg(numpy.dtype(self.dtype_out), 'out') if isinstance(arguments, str): self.arguments = parse_c_args(arguments) elif arguments is None: self.arguments = [ArrayArg(numpy.dtype(self.dtype_out), '_reduce_input')] else: self.arguments = arguments if (self.dtype_out == numpy.dtype('float16') or any(ar.dtype == numpy.dtype('float16') for ar in self.arguments)): raise NotImplementedError('float16 not supported for the ' 
'reduction interface') self.reduce_expr = reduce_expr if map_expr is None: if len(self.arguments) != 1: raise ValueError("Don't know what to do with more than one " "argument. Specify map_expr to explicitly " "state what you want.") self.operation = "%s[i]" % (self.arguments[0].name,) self.expression = "%s[0]" % (self.arguments[0].name,) else: self.operation = map_expr self.expression = massage_op(map_expr) if not any(isinstance(arg, ArrayArg) for arg in self.arguments): raise ValueError("ReductionKernel can only be used with " "functions that have at least one vector " "argument.") have_small = False have_double = False have_complex = False for arg in self.arguments: if arg.dtype.itemsize < 4 and type(arg) == ArrayArg: have_small = True if arg.dtype in [numpy.float64, numpy.complex128]: have_double = True if arg.dtype in [numpy.complex64, numpy.complex128]: have_complex = True self.flags = dict(have_small=have_small, have_double=have_double, have_complex=have_complex) self.preamble = preamble self.init_local_size = min(context.lmemsize // self.out_arg.dtype.itemsize, context.maxlsize0) # this is to prep the cache if init_nd is not None: self._get_basic_kernel(self.init_local_size, init_nd) def _find_kernel_ls(self, tmpl, max_ls, *tmpl_args): local_size = min(self.init_local_size, max_ls) count_lim = _ceil_log2(local_size) local_size = int(2**count_lim) loop_count = 0 while loop_count <= count_lim: k, src, spec = tmpl(local_size, *tmpl_args) if local_size <= k.maxlsize: return k, src, spec, local_size else: local_size //= 2 loop_count += 1 raise RuntimeError("Can't stabilize the local_size for kernel." " Please report this along with your " "reduction code.") def _gen_basic(self, ls, nd): src = basic_kernel.render(preamble=self.preamble, reduce_expr=self.reduce_expr, name="reduk", out_arg=self.out_arg, nd=nd, arguments=self.arguments, local_size=ls, redux=self.redux, neutral=self.neutral, map_expr=self.expression) spec = ['uint32', gpuarray.GpuArray, 'uint32'] spec.extend('uint32' for _ in range(nd)) for i, arg in enumerate(self.arguments): spec.append(arg.spec()) if arg.isarray(): spec.append('uint32') spec.extend('int32' for _ in range(nd)) k = gpuarray.GpuKernel(src, "reduk", spec, context=self.context, **self.flags) return k, src, spec @lru_cache() def _get_basic_kernel(self, maxls, nd): return self._find_kernel_ls(self._gen_basic, maxls, nd) def __call__(self, *args, **kwargs): broadcast = kwargs.pop('broadcast', None) out = kwargs.pop('out', None) if len(kwargs) != 0: raise TypeError('Unexpected keyword argument: %s' % kwargs.keys()[0]) _, nd, dims, strs, offsets = check_args(args, collapse=False, broadcast=broadcast) n = prod(dims) out_shape = tuple(d for i, d in enumerate(dims) if not self.redux[i]) gs = prod(out_shape) if gs == 0: gs = 1 n /= gs if gs > self.context.maxgsize0: raise ValueError("Array too big to be reduced along the " "selected axes") if out is None: out = gpuarray.empty(out_shape, context=self.context, dtype=self.dtype_out) else: if out.shape != out_shape or out.dtype != self.dtype_out: raise TypeError( "Out array is not of expected type (expected %s %s, " "got %s %s)" % (out_shape, self.dtype_out, out.shape, out.dtype)) # Don't compile and cache for nothing for big size if self.init_local_size < n: k, _, _, ls = self._get_basic_kernel(self.init_local_size, nd) else: k, _, _, ls = self._get_basic_kernel(2**_ceil_log2(n), nd) kargs = [n, out, out.offset] kargs.extend(dims) for i, arg in enumerate(args): kargs.append(arg) if isinstance(arg, gpuarray.GpuArray): 
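# (reading aid, added comment) For GpuArray arguments the branch below also
# passes the byte offset and the per-dimension strides to the kernel, in the
# same order as the argument spec assembled in _gen_basic() above.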
kargs.append(offsets[i]) kargs.extend(strs[i]) k(*kargs, gs=gs, ls=ls) return out def reduce1(ary, op, neutral, out_type, axis=None, out=None, oper=None): nd = ary.ndim if axis is None: redux = [True] * nd else: redux = [False] * nd if not isinstance(axis, (list, tuple)): axis = (axis,) for ax in axis: if ax < 0: ax += nd if ax < 0 or ax >= nd: raise ValueError('axis out of bounds') redux[ax] = True if oper is None: reduce_expr = "a %s b" % (op,) else: reduce_expr = oper r = ReductionKernel(ary.context, dtype_out=out_type, neutral=neutral, reduce_expr=reduce_expr, redux=redux, arguments=[ArrayArg(ary.dtype, 'a')]) return r(ary, out=out) libgpuarray-0.7.6/pygpu/tests/000077500000000000000000000000001326743622600163665ustar00rootroot00000000000000libgpuarray-0.7.6/pygpu/tests/__init__.py000066400000000000000000000000001326743622600204650ustar00rootroot00000000000000libgpuarray-0.7.6/pygpu/tests/main.py000066400000000000000000000106671326743622600176760ustar00rootroot00000000000000import os import nose.plugins.builtin from nose.config import Config from nose.plugins.manager import PluginManager from numpy.testing.nosetester import NoseTester from numpy.testing.noseclasses import KnownFailure, NumpyTestProgram class NoseTester(NoseTester): """ Nose test runner. This class enables running nose tests from inside libgpuarray, by calling pygpu.test(). This version is more adapted to what we want than Numpy's one. """ def _test_argv(self, verbose, extra_argv): """ Generate argv for nosetest command Parameters ---------- verbose: int Verbosity value for test outputs, in the range 1-10. Default is 1. extra_argv: list List with any extra arguments to pass to nosetests. """ # self.package_path = os.path.abspath(self.package_path) argv = [__file__, self.package_path] argv += ['--verbosity', str(verbose)] if extra_argv: argv += extra_argv return argv def _show_system_info(self): import pygpu # print ("pygpu version %s" % pygpu.__version__) pygpu_dir = os.path.dirname(pygpu.__file__) print("pygpu is installed in %s" % pygpu_dir) super(NoseTester, self)._show_system_info() def prepare_test_args(self, verbose=1, extra_argv=None, coverage=False, capture=True, knownfailure=True): """ Prepare arguments for the `test` method. Takes the same arguments as `test`. """ # compile argv argv = self._test_argv(verbose, extra_argv) # numpy way of doing coverage if coverage: argv += ['--cover-package=%s' % self.package_name, '--with-coverage', '--cover-tests', '--cover-inclusive', '--cover-erase'] # Capture output only if needed if not capture: argv += ['-s'] # construct list of plugins plugins = [] if knownfailure: plugins.append(KnownFailure()) plugins += [p() for p in nose.plugins.builtin.plugins] return argv, plugins def test(self, verbose=1, extra_argv=None, coverage=False, capture=True, knownfailure=True): """ Run tests for module using nose. Parameters ---------- verbose: int Verbosity value for test outputs, in the range 1-10. Default is 1. extra_argv: list List with any extra arguments to pass to nosetests. coverage: bool If True, report coverage of pygpu code. Default is False. capture: bool If True, capture the standard output of the tests, like nosetests does in command-line. The output of failing tests will be displayed at the end. Default is True. knownfailure: bool If True, tests raising KnownFailureTest will not be considered Errors nor Failure, but reported as "known failures" and treated quite like skipped tests. Default is True. 
Returns ------- nose.result.TextTestResult The result of running the tests """ # cap verbosity at 3 because nose becomes *very* verbose beyond that verbose = min(verbose, 3) self._show_system_info() cwd = os.getcwd() if self.package_path in os.listdir(cwd): # The tests give weird errors if the package to test is # in current directory. raise RuntimeError(( "This function does not run correctly when, at the time " "pygpu was imported, the working directory was pygpu's " "parent directory. You should exit your Python prompt, change " "directory, then launch Python again, import pygpu, then " "launch pygpu.test().")) argv, plugins = self.prepare_test_args(verbose, extra_argv, coverage, capture, knownfailure) # The "plugins" keyword of NumpyTestProgram gets ignored if config is # specified. Moreover, using "addplugins" instead can lead to strange # errors. So, we specify the plugins in the Config as well. cfg = Config(includeExe=True, plugins=PluginManager(plugins=plugins)) t = NumpyTestProgram(argv=argv, exit=False, config=cfg) return t.result libgpuarray-0.7.6/pygpu/tests/support.py000066400000000000000000000120331326743622600204530ustar00rootroot00000000000000from __future__ import print_function import os import sys import numpy from nose.plugins.skip import SkipTest from pygpu import gpuarray if numpy.__version__ < '1.6.0': skip_single_f = True else: skip_single_f = False dtypes_all = ["float32", "float64", "int8", "int16", "uint8", "uint16", "int32", "int64", "uint32", "uint64"] dtypes_no_complex = dtypes_all # Sometimes int8 is just a source of trouble (like with overflows) dtypes_no_complex_big = ["float32", "float64", "int16", "uint16", "int32", "int64", "uint32", "uint64"] def get_env_dev(): for name in ['GPUARRAY_TEST_DEVICE', 'DEVICE']: if name in os.environ: return os.environ[name] raise RuntimeError( "No test device specified. Specify one using the DEVICE " "or GPUARRAY_TEST_DEVICE environment variables.") context = gpuarray.init(get_env_dev()) print("*** Testing for", context.devname, file=sys.stderr) def guard_devsup(func): def f(*args, **kwargs): try: func(*args, **kwargs) except gpuarray.UnsupportedException as e: raise SkipTest("operation not supported") return f def rand(shape, dtype): r = numpy.random.randn(*shape) * 10 if r.dtype.startswith('u'): r = numpy.absolute(r) return r.astype(dtype) def check_flags(x, y): assert isinstance(x, gpuarray.GpuArray) if y.size == 0 and y.flags["C_CONTIGUOUS"] and y.flags["F_CONTIGUOUS"]: # Different numpy version have different value for # C_CONTIGUOUS in that case. pass elif x.flags["C_CONTIGUOUS"] != y.flags["C_CONTIGUOUS"]: # Numpy 1.10 can set c/f contiguous more frequently by # ignoring strides on dimensions of size 1. assert x.flags["C_CONTIGUOUS"] is True, (x.flags, y.flags) assert x.flags["F_CONTIGUOUS"] is False, (x.flags, y.flags) assert y.flags["C_CONTIGUOUS"] is False, (x.flags, y.flags) # That depend of numpy version. # assert y.flags["F_CONTIGUOUS"] is True, (x.flags, y.flags) else: if not (skip_single_f and x.shape == ()): # Numpy below 1.6.0 does not have a consistent handling of # f-contiguous for 0-d arrays if not any([s == 1 for s in x.shape]): # Numpy 1.10 can set f contiguous more frequently by # ignoring strides on dimensions of size 1. 
assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"], ( x.flags, y.flags) else: assert x.flags["F_CONTIGUOUS"] assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"], (x.flags, y.flags) # Don't check for OWNDATA since it is always true for a GpuArray assert x.flags["ALIGNED"] == y.flags["ALIGNED"], (x.flags, y.flags) assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"], (x.flags, y.flags) def check_meta_only(x, y): assert isinstance(x, gpuarray.GpuArray) assert x.shape == y.shape assert x.dtype == y.dtype if y.size != 0: assert x.strides == y.strides def check_content(x, y): assert isinstance(x, gpuarray.GpuArray) assert numpy.allclose(numpy.asarray(x), numpy.asarray(y)) def check_meta(x, y): check_meta_only(x, y) check_flags(x, y) def check_all(x, y): check_meta(x, y) check_content(x, y) def check_meta_content(x, y): check_meta_only(x, y) check_content(x, y) def gen_gpuarray(shape_orig, dtype='float32', offseted_outer=False, offseted_inner=False, sliced=1, order='c', nozeros=False, incr=0, ctx=None, cls=None): if sliced is True: sliced = 2 elif sliced is False: sliced = 1 shape = numpy.asarray(shape_orig).copy() if sliced != 1 and len(shape) > 0: shape[0] *= numpy.absolute(sliced) if offseted_outer and len(shape) > 0: shape[0] += 1 if offseted_inner and len(shape) > 0: shape[-1] += 1 low = 0.0 if nozeros: low = 1.0 a = numpy.random.uniform(low, 10.0, shape) a += incr a = numpy.asarray(a, dtype=dtype) b = gpuarray.array(a, context=ctx, cls=cls) assert order in ['c', 'f'] if order == 'f' and len(shape) > 0: a = numpy.asfortranarray(a) b = gpuarray.asfortranarray(b) if order == 'f' and len(shape) > 0 and b.size > 1: assert b.flags['F_CONTIGUOUS'] if offseted_outer and len(shape) > 0: b = b[1:] a = a[1:] if offseted_inner and len(shape) > 0: # The b[..., 1:] act as the test for this subtensor case. 
b = b[..., 1:] a = a[..., 1:] if sliced != 1 and len(shape) > 0: a = a[::sliced] b = b[::sliced] if False and shape_orig == (): assert a.shape == (1,) assert b.shape == (1,) else: assert a.shape == shape_orig, (a.shape, shape_orig) assert b.shape == shape_orig, (b.shape, shape_orig) assert numpy.allclose(a, numpy.asarray(b)), (a, numpy.asarray(b)) return a, b libgpuarray-0.7.6/pygpu/tests/test_basic.py000066400000000000000000000046771326743622600210760ustar00rootroot00000000000000import pygpu from pygpu.basic import (tril, triu) from unittest import TestCase from .support import (gen_gpuarray, context) import numpy def test_tril(): for shape in [(10, 5), (5, 10), (10, 10)]: for order in ['c', 'f']: for inplace in [True, False]: ac, ag = gen_gpuarray(shape, 'float32', order=order, ctx=context) result = tril(ag, inplace=inplace) assert numpy.all(numpy.tril(ac) == result) if inplace: assert numpy.all(numpy.tril(ac) == ag) else: assert numpy.all(ac == ag) def test_triu(): for shape in [(10, 5), (5, 10), (10, 10)]: for order in ['c', 'f']: for inplace in [True, False]: ac, ag = gen_gpuarray(shape, 'float32', order=order, ctx=context) result = triu(ag, inplace=inplace) assert numpy.all(numpy.triu(ac) == result) if inplace: assert numpy.all(numpy.triu(ac) == ag) else: assert numpy.all(ac == ag) class test_errors(TestCase): def runTest(self): self.assertRaises(ValueError, self.run_1d_triu) self.assertRaises(ValueError, self.run_3d_triu) self.assertRaises(ValueError, self.run_1d_tril) self.assertRaises(ValueError, self.run_3d_tril) self.assertRaises(ValueError, self.run_noncontiguous_tril) self.assertRaises(ValueError, self.run_noncontiguous_triu) def run_1d_triu(self): ac, ag = gen_gpuarray((10, ), 'float32', ctx=context) triu(ag) def run_3d_triu(self): ac, ag = gen_gpuarray((10, 10, 10), 'float32', ctx=context) triu(ag) def run_1d_tril(self): ac, ag = gen_gpuarray((10, ), 'float32', ctx=context) tril(ag) def run_3d_tril(self): ac, ag = gen_gpuarray((10, 10, 10), 'float32', ctx=context) tril(ag) def run_noncontiguous_tril(self): a = numpy.random.rand(5, 5) b = pygpu.array(a, context=context) b = b[::-1] assert b.flags.c_contiguous is b.flags.f_contiguous is False tril(b) def run_noncontiguous_triu(self): a = numpy.random.rand(5, 5) b = pygpu.array(a, context=context) b = b[::-1] assert b.flags.c_contiguous is b.flags.f_contiguous is False triu(b) libgpuarray-0.7.6/pygpu/tests/test_blas.py000066400000000000000000000205711326743622600207250ustar00rootroot00000000000000from itertools import product import numpy from nose.plugins.skip import SkipTest from .support import (guard_devsup, gen_gpuarray, context) try: import scipy.linalg.blas try: fblas = scipy.linalg.blas.fblas except AttributeError: fblas = scipy.linalg.blas except ImportError as e: raise SkipTest("no scipy blas to compare against") import pygpu.blas as gblas def test_dot(): bools = [True, False] for N, dtype, offseted_i, sliced in product( [1, 256, 1337], ['float32', 'float64'], bools, bools): yield dot, N, dtype, offseted_i, sliced, True, False for overwrite, init_z in product(bools, bools): yield dot, 666, 'float32', False, False, overwrite, init_z @guard_devsup def dot(N, dtype, offseted_i, sliced, overwrite, init_z): cX, gX = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, sliced=sliced, ctx=context) cY, gY = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, sliced=sliced, ctx=context) if init_z: gZ = gen_gpuarray((), dtype, offseted_inner=offseted_i, sliced=sliced, ctx=context)[1] else: gZ = None if dtype == 'float32': cr = 
fblas.sdot(cX, cY) else: cr = fblas.ddot(cX, cY) gr = gblas.dot(gX, gY, gZ, overwrite_z=overwrite) numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6) def test_gemv(): bools = [False, True] for shape, order, trans, offseted_i, sliced in product( [(100, 128), (128, 50)], 'fc', bools, bools, [1, 2, -1, -2]): yield (gemv, shape, 'float32', order, trans, offseted_i, sliced, True, False) for overwrite, init_y in product(bools, bools): yield (gemv, (4, 3), 'float32', 'f', False, False, 1, overwrite, init_y) yield gemv, (32, 32), 'float64', 'f', False, False, 1, True, False for alpha, beta, overwrite in product( [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): yield (gemv, (32, 32), 'float32', 'f', False, False, 1, overwrite, True, alpha, beta) @guard_devsup def gemv(shp, dtype, order, trans, offseted_i, sliced, overwrite, init_y, alpha=1.0, beta=0.0): cA, gA = gen_gpuarray(shp, dtype, order=order, offseted_inner=offseted_i, sliced=sliced, ctx=context) if trans: shpX = (shp[0],) shpY = (shp[1],) else: shpX = (shp[1],) shpY = (shp[0],) cX, gX = gen_gpuarray(shpX, dtype, offseted_inner=offseted_i, sliced=sliced, ctx=context) if init_y: cY, gY = gen_gpuarray(shpY, dtype, ctx=context) else: cY, gY = None, None if dtype == 'float32': cr = fblas.sgemv(alpha, cA, cX, beta, cY, trans=trans, overwrite_y=overwrite) else: cr = fblas.dgemv(alpha, cA, cX, beta, cY, trans=trans, overwrite_y=overwrite) gr = gblas.gemv(alpha, gA, gX, beta, gY, trans_a=trans, overwrite_y=overwrite) numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6) def test_gemm(): bools = [False, True] for (m, n, k), order, trans, offseted_o in product( [(48, 15, 32), (15, 32, 48)], list(product(*['fc']*3)), list(product(bools, bools)), bools): yield (gemm, m, n, k, 'float32', order, trans, offseted_o, 1, False, False) for sliced, overwrite, init_res in product([1, 2, -1, -2], bools, bools): yield (gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), (False, False), False, sliced, overwrite, init_res) yield (gemm, 32, 32, 32, 'float64', ('f', 'f', 'f'), (False, False), False, 1, False, False) for alpha, beta, overwrite in product( [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): yield (gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), (False, False), False, 1, overwrite, True, alpha, beta) @guard_devsup def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, init_res, alpha=1.0, beta=0.0): if trans[0]: shpA = (k, m) else: shpA = (m, k) if trans[1]: shpB = (n, k) else: shpB = (k, n) cA, gA = gen_gpuarray(shpA, dtype, order=order[0], offseted_outer=offseted_o, sliced=sliced, ctx=context) cB, gB = gen_gpuarray(shpB, dtype, order=order[1], offseted_outer=offseted_o, sliced=sliced, ctx=context) if init_res: cC, gC = gen_gpuarray((m, n), dtype, order=order[2], ctx=context) else: cC, gC = None, None if dtype == 'float32': cr = fblas.sgemm(alpha, cA, cB, beta, cC, trans_a=trans[0], trans_b=trans[1], overwrite_c=overwrite) else: cr = fblas.dgemm(alpha, cA, cB, beta, cC, trans_a=trans[0], trans_b=trans[1], overwrite_c=overwrite) gr = gblas.gemm(alpha, gA, gB, beta, gC, trans_a=trans[0], trans_b=trans[1], overwrite_c=overwrite) numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6) def test_ger(): bools = [False, True] for (m, n), order, sliced_x, sliced_y in product( [(4, 5)], 'fc', [1, 2, -2, -1], [1, 2, -2, -1]): yield ger, m, n, 'float32', order, sliced_x, sliced_y, False yield ger, 4, 5, 'float64', 'f', 1, 1, False for init_res, overwrite in product(bools, bools): yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite def 
ger(m, n, dtype, order, sliced_x, sliced_y, init_res, overwrite=False): cX, gX = gen_gpuarray((m,), dtype, order, sliced=sliced_x, ctx=context) cY, gY = gen_gpuarray((n,), dtype, order, sliced=sliced_y, ctx=context) if init_res: cA, gA = gen_gpuarray((m, n), dtype, order, ctx=context) else: cA, gA = None, None if dtype == 'float32': cr = fblas.sger(1.0, cX, cY, a=cA, overwrite_a=overwrite) else: cr = fblas.dger(1.0, cX, cY, a=cA, overwrite_a=overwrite) gr = gblas.ger(1.0, gX, gY, gA, overwrite_a=overwrite) numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6) def test_rgemmBatch_3d(): bools = [False, True] for b, (m, n, k), order, trans, offseted_o in product( [1, 17, 31], [(24, 7, 16), (7, 16, 24)], list(product('fc', 'fc', 'c')), list(product(bools, bools)), bools): yield (rgemmBatch_3d, b, m, n, k, 'float32', order, trans, offseted_o, 1, False, False) for sliced, overwrite, init_res in product([1, 2, -1, -2], bools, bools): yield (rgemmBatch_3d, 5, 4, 3, 2, 'float32', ('f', 'f', 'c'), (False, False), False, sliced, overwrite, init_res) yield (rgemmBatch_3d, 16, 16, 16, 16, 'float64', ('f', 'f', 'c'), (False, False), False, 1, False, False) for alpha, beta, overwrite in product( [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): yield (rgemmBatch_3d, 16, 16, 9, 16, 'float32', ('f', 'f', 'c'), (False, False), False, 1, overwrite, True, alpha, beta) @guard_devsup def rgemmBatch_3d(b, m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, init_res, alpha=1.0, beta=0.0): if trans[0]: shpA = (b, k, m) else: shpA = (b, m, k) if trans[1]: shpB = (b, n, k) else: shpB = (b, k, n) cA, gA = gen_gpuarray(shpA, dtype, order=order[0], offseted_outer=offseted_o, sliced=sliced, ctx=context) cB, gB = gen_gpuarray(shpB, dtype, order=order[1], offseted_outer=offseted_o, sliced=sliced, ctx=context) if init_res: cC, gC = gen_gpuarray((b, m, n), dtype, order=order[2], ctx=context) else: cC, gC = None, None cr = numpy.empty((b, m, n), dtype=dtype) if dtype == 'float32': fn_gemm_c = fblas.sgemm else: fn_gemm_c = fblas.dgemm for i in range(b): cCi = cC if cC is None else cC[i] cr[i] = fn_gemm_c(alpha, cA[i], cB[i], beta, cCi, trans_a=trans[0], trans_b=trans[1], overwrite_c=overwrite) gr = gblas.gemmBatch_3d(alpha, gA, gB, beta, gC, trans_a=trans[0], trans_b=trans[1], overwrite_c=overwrite) numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-5) libgpuarray-0.7.6/pygpu/tests/test_collectives.py000066400000000000000000000275021326743622600223210ustar00rootroot00000000000000from __future__ import print_function import os import sys import unittest from six.moves import range from six import PY3 import pickle import numpy as np from pygpu import gpuarray from pygpu.collectives import COMM_ID_BYTES, GpuCommCliqueId, GpuComm from pygpu.tests.support import (check_all, gen_gpuarray, context as ctx) def get_user_gpu_rank(): for name in ['GPUARRAY_TEST_DEVICE', 'DEVICE']: if name in os.environ: devname = os.environ[name] if devname.startswith("opencl"): return -1 if devname[-1] == 'a': return 0 return int(devname[-1]) return -1 try: from mpi4py import MPI MPI_IMPORTED = True except: MPI_IMPORTED = False print("mpi4py found: " + str(MPI_IMPORTED), file=sys.stderr) @unittest.skipIf(get_user_gpu_rank() == -1, "Collective operations supported on CUDA devices only.") class TestGpuCommCliqueId(unittest.TestCase): def setUp(self): self.cid = GpuCommCliqueId(context=ctx) def _create_in_scope_from_string(self): comm_id = bytearray(b'pipes' * (COMM_ID_BYTES // 5 + 1)) return GpuCommCliqueId(context=ctx, 
comm_id=comm_id) def test_create_from_string_id(self): cid2 = self._create_in_scope_from_string() a = bytearray(b'pipes' * (COMM_ID_BYTES // 5 + 1)) assert cid2.comm_id == a[:COMM_ID_BYTES], (cid2.comm_id, a[:COMM_ID_BYTES]) b = bytearray(b'mlkies' * (COMM_ID_BYTES // 6 + 1)) cid2.comm_id = b assert cid2.comm_id == b[:COMM_ID_BYTES], (cid2.comm_id, b[:COMM_ID_BYTES]) with self.assertRaises(ValueError): cid2.comm_id = bytearray(b'testestestest') def test_pickle(self): with self.assertRaises(RuntimeError): pickle.dumps(self.cid) with self.assertRaises(RuntimeError): pickle.dumps(self.cid, protocol=0) with self.assertRaises(RuntimeError): pickle.dumps(self.cid, protocol=1) with self.assertRaises(RuntimeError): pickle.dumps(self.cid, protocol=2) if PY3: with self.assertRaises(RuntimeError): pickle.dumps(self.cid, protocol=3) with self.assertRaises(RuntimeError): pickle.dumps(self.cid, protocol=-1) def test_create_from_previous(self): cid2 = GpuCommCliqueId(context=ctx, comm_id=bytearray(b'y' * COMM_ID_BYTES)) cid3 = GpuCommCliqueId(context=ctx, comm_id=cid2.comm_id) assert cid2.comm_id == cid3.comm_id def test_richcmp(self): cid1 = GpuCommCliqueId(context=ctx, comm_id=bytearray(b'y' * COMM_ID_BYTES)) cid2 = GpuCommCliqueId(context=ctx, comm_id=cid1.comm_id) cid3 = GpuCommCliqueId(context=ctx, comm_id=bytearray(b'z' * COMM_ID_BYTES)) assert cid1 == cid2 assert cid1 != cid3 assert cid3 > cid2 assert cid3 >= cid2 assert cid1 >= cid2 assert cid2 < cid3 assert cid2 <= cid3 assert cid2 <= cid1 with self.assertRaises(TypeError): a = cid2 > "asdfasfa" @unittest.skipUnless(MPI_IMPORTED, "Needs mpi4py module") @unittest.skipIf(get_user_gpu_rank() == -1, "Collective operations supported on CUDA devices only") class TestGpuComm(unittest.TestCase): @classmethod def setUpClass(cls): if get_user_gpu_rank() == -1 or not MPI_IMPORTED: return cls.mpicomm = MPI.COMM_WORLD cls.size = cls.mpicomm.Get_size() cls.rank = cls.mpicomm.Get_rank() cls.ctx = gpuarray.init("cuda" + str(cls.rank)) print("*** Collectives testing for", cls.ctx.devname, file=sys.stderr) cls.cid = GpuCommCliqueId(context=cls.ctx) cls.mpicomm.Bcast(cls.cid.comm_id, root=0) cls.gpucomm = GpuComm(cls.cid, cls.size, cls.rank) def test_count(self): assert self.gpucomm.count == self.size, (self.gpucomm.count, self.size) def test_rank(self): assert self.gpucomm.rank == self.rank, (self.gpucomm.rank, self.rank) def test_reduce(self): cpu, gpu = gen_gpuarray((3, 4, 5), order='c', incr=self.rank, ctx=self.ctx) rescpu = np.empty_like(cpu) resgpu = gpu._empty_like_me() if self.rank != 0: self.gpucomm.reduce(gpu, 'sum', resgpu, root=0) self.mpicomm.Reduce([cpu, MPI.FLOAT], None, op=MPI.SUM, root=0) else: self.gpucomm.reduce(gpu, 'sum', resgpu) self.mpicomm.Reduce([cpu, MPI.FLOAT], [rescpu, MPI.FLOAT], op=MPI.SUM, root=0) if self.rank == 0: assert np.allclose(resgpu, rescpu) resgpu = self.gpucomm.reduce(gpu, 'sum', root=0) if self.rank == 0: assert resgpu.shape == gpu.shape, (resgpu.shape, gpu.shape) assert resgpu.dtype == gpu.dtype, (resgpu.dtype, gpu.dtype) assert resgpu.flags['C'] == gpu.flags['C'] assert resgpu.flags['F'] == gpu.flags['F'] assert np.allclose(resgpu, rescpu) else: assert resgpu is None if self.rank == 0: resgpu = self.gpucomm.reduce(gpu, 'sum') assert resgpu.shape == gpu.shape, (resgpu.shape, gpu.shape) assert resgpu.dtype == gpu.dtype, (resgpu.dtype, gpu.dtype) assert resgpu.flags['C'] == gpu.flags['C'] assert resgpu.flags['F'] == gpu.flags['F'] assert np.allclose(resgpu, rescpu) else: resgpu = self.gpucomm.reduce(gpu, 'sum', root=0) 
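# (reading aid, added comment) When no destination array is given, reduce()
# only returns the reduced GpuArray on the root rank; every other rank gets
# None back, which is what the assertion below checks.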
assert resgpu is None def test_all_reduce(self): cpu, gpu = gen_gpuarray((3, 4, 5), order='c', incr=self.rank, ctx=self.ctx) rescpu = np.empty_like(cpu) resgpu = gpu._empty_like_me() self.gpucomm.all_reduce(gpu, 'sum', resgpu) self.mpicomm.Allreduce([cpu, MPI.FLOAT], [rescpu, MPI.FLOAT], op=MPI.SUM) assert np.allclose(resgpu, rescpu) resgpu = self.gpucomm.all_reduce(gpu, 'sum') assert resgpu.shape == gpu.shape, (resgpu.shape, gpu.shape) assert resgpu.dtype == gpu.dtype, (resgpu.dtype, gpu.dtype) assert resgpu.flags['C'] == gpu.flags['C'] assert resgpu.flags['F'] == gpu.flags['F'] assert np.allclose(resgpu, rescpu) def test_reduce_scatter(self): texp = self.size * np.arange(5 * self.size) + sum(range(self.size)) exp = texp[self.rank * 5:self.rank * 5 + 5] # order c cpu = np.arange(5 * self.size) + self.rank np.reshape(cpu, (self.size, 5), order='C') gpu = gpuarray.asarray(cpu, context=self.ctx) resgpu = gpuarray.empty((5,), dtype='int64', order='C', context=self.ctx) self.gpucomm.reduce_scatter(gpu, 'sum', resgpu) assert np.allclose(resgpu, exp) # order f cpu = np.arange(5 * self.size) + self.rank np.reshape(cpu, (5, self.size), order='F') gpu = gpuarray.asarray(cpu, context=self.ctx) resgpu = gpuarray.empty((5,), dtype='int64', order='F', context=self.ctx) self.gpucomm.reduce_scatter(gpu, 'sum', resgpu) assert np.allclose(resgpu, exp) # make result order c (one less dim) cpu = np.arange(5 * self.size) + self.rank np.reshape(cpu, (self.size, 5), order='C') gpu = gpuarray.asarray(cpu, context=self.ctx) resgpu = self.gpucomm.reduce_scatter(gpu, 'sum') check_all(resgpu, exp) assert resgpu.flags['C_CONTIGUOUS'] is True # c-contiguous split problem (for size == 1, it can always be split) if self.size != 1: cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank np.reshape(cpu, (self.size + 1, 5), order='C') gpu = gpuarray.asarray(cpu, context=self.ctx) with self.assertRaises(TypeError): resgpu = self.gpucomm.reduce_scatter(gpu, 'sum') # make result order f (one less dim) cpu = np.arange(5 * self.size) + self.rank np.reshape(cpu, (5, self.size), order='F') gpu = gpuarray.asarray(cpu, context=self.ctx) resgpu = self.gpucomm.reduce_scatter(gpu, 'sum') check_all(resgpu, exp) assert resgpu.flags['F_CONTIGUOUS'] is True # f-contiguous split problem (for size == 1, it can always be split) if self.size != 1: cpu = np.arange(5 * (self.size + 1), dtype='int32') + self.rank np.reshape(cpu, (5, self.size + 1), order='F') gpu = gpuarray.asarray(cpu, context=self.ctx) with self.assertRaises(TypeError): resgpu = self.gpucomm.reduce_scatter(gpu, 'sum') # make result order c (same dim - less size) texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size)) exp = texp[self.rank * 15:self.rank * 15 + 15] np.reshape(exp, (3, 5), order='C') cpu = np.arange(5 * self.size * 3) + self.rank np.reshape(cpu, (self.size * 3, 5), order='C') gpu = gpuarray.asarray(cpu, context=self.ctx) resgpu = self.gpucomm.reduce_scatter(gpu, 'sum') check_all(resgpu, exp) assert resgpu.flags['C_CONTIGUOUS'] is True # make result order f (same dim - less size) texp = self.size * np.arange(5 * self.size * 3) + sum(range(self.size)) exp = texp[self.rank * 15:self.rank * 15 + 15] np.reshape(exp, (5, 3), order='F') cpu = np.arange(5 * self.size * 3) + self.rank np.reshape(cpu, (5, self.size * 3), order='F') gpu = gpuarray.asarray(cpu, context=self.ctx) resgpu = self.gpucomm.reduce_scatter(gpu, 'sum') check_all(resgpu, exp) assert resgpu.flags['F_CONTIGUOUS'] is True def test_broadcast(self): if self.rank == 0: cpu, gpu = 
gen_gpuarray((3, 4, 5), order='c', incr=self.rank, ctx=self.ctx) else: cpu = np.zeros((3, 4, 5), dtype='float32') gpu = gpuarray.asarray(cpu, context=self.ctx) if self.rank == 0: self.gpucomm.broadcast(gpu) else: self.gpucomm.broadcast(gpu, root=0) self.mpicomm.Bcast(cpu, root=0) assert np.allclose(gpu, cpu) def test_all_gather(self): texp = np.arange(self.size * 10, dtype='int32') cpu = np.arange(self.rank * 10, self.rank * 10 + 10, dtype='int32') a = cpu gpu = gpuarray.asarray(a, context=self.ctx) resgpu = self.gpucomm.all_gather(gpu, nd_up=0) check_all(resgpu, texp) a = cpu.reshape((2, 5), order='C') exp = texp.reshape((2 * self.size, 5), order='C') gpu = gpuarray.asarray(a, context=self.ctx) resgpu = self.gpucomm.all_gather(gpu, nd_up=0) check_all(resgpu, exp) a = cpu.reshape((2, 5), order='C') exp = texp.reshape((self.size, 2, 5), order='C') gpu = gpuarray.asarray(a, context=self.ctx) resgpu = self.gpucomm.all_gather(gpu, nd_up=1) check_all(resgpu, exp) a = cpu.reshape((2, 5), order='C') exp = texp.reshape((self.size, 1, 1, 2, 5), order='C') gpu = gpuarray.asarray(a, context=self.ctx) resgpu = self.gpucomm.all_gather(gpu, nd_up=3) check_all(resgpu, exp) a = cpu.reshape((5, 2), order='F') exp = texp.reshape((5, 2 * self.size), order='F') gpu = gpuarray.asarray(a, context=self.ctx, order='F') resgpu = self.gpucomm.all_gather(gpu, nd_up=0) check_all(resgpu, exp) a = cpu.reshape((5, 2), order='F') exp = texp.reshape((5, 2, self.size), order='F') gpu = gpuarray.asarray(a, context=self.ctx, order='F') resgpu = self.gpucomm.all_gather(gpu, nd_up=1) check_all(resgpu, exp) a = cpu.reshape((5, 2), order='F') exp = texp.reshape((5, 2, 1, 1, self.size), order='F') gpu = gpuarray.asarray(a, context=self.ctx, order='F') resgpu = self.gpucomm.all_gather(gpu, nd_up=3) check_all(resgpu, exp) with self.assertRaises(Exception): resgpu = self.gpucomm.all_gather(gpu, nd_up=-2) libgpuarray-0.7.6/pygpu/tests/test_elemwise.py000066400000000000000000000265051326743622600216210ustar00rootroot00000000000000import operator import numpy from mako.template import Template from unittest import TestCase from pygpu import gpuarray, ndgpuarray as elemary from pygpu.dtypes import dtype_to_ctype, get_common_dtype from pygpu.elemwise import as_argument, ielemwise2 from pygpu._elemwise import GpuElemwise, arg from six import PY2 from .support import (guard_devsup, context, gen_gpuarray, check_meta_content) dtypes_test = ['float32', 'int8', 'uint64'] operators1 = [operator.neg, operator.pos, operator.abs] operators2 = [operator.add, operator.sub, operator.floordiv, operator.mod, operator.mul, operator.truediv, operator.eq, operator.ne, operator.lt, operator.le, operator.gt, operator.ge] if PY2: operators2.append(operator.div) ioperators2 = [operator.iadd, operator.isub, operator.ifloordiv, operator.imod, operator.imul, operator.itruediv] if PY2: ioperators2.append(operator.idiv) elems = [2, 0.3, numpy.asarray(3, dtype='int8'), numpy.asarray(7, dtype='uint32'), numpy.asarray(2.45, dtype='float32')] def test_elemwise1_ops_array(): for op in operators1: for dtype in dtypes_test: yield elemwise1_ops_array, op, dtype @guard_devsup def elemwise1_ops_array(op, dtype): c, g = gen_gpuarray((50,), dtype, ctx=context, cls=elemary) out_c = op(c) out_g = op(g) assert out_c.shape == out_g.shape assert out_c.dtype == out_g.dtype assert numpy.allclose(out_c, numpy.asarray(out_g)) def test_elemwise2_ops_array(): for op in operators2: for dtype1 in dtypes_test: for dtype2 in dtypes_test: yield elemwise2_ops_array, op, dtype1, dtype2, (50,) 
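# ---------------------------------------------------------------------------
# Illustrative sketch only, not collected by nose (the name does not match
# "test"): it restates in one place the pattern that the elemwise2 generator
# checks above and the broadcast checks further down both follow.  It relies
# on the numpy / gen_gpuarray / context / elemary names already imported at
# the top of this module; the helper name itself is hypothetical and exists
# purely as a reading aid.
def _example_elemwise_broadcast_sketch():
    # Two float32 operands whose shapes broadcast against each other.
    ac, ag = gen_gpuarray((3, 5), 'float32', ctx=context, cls=elemary)
    bc, bg = gen_gpuarray((1, 5), 'float32', ctx=context, cls=elemary)
    # Operators on the ndgpuarray (elemary) class go through elemwise GPU
    # kernels and follow numpy-style broadcasting, so the device result
    # should match the host computation.
    out_c = ac + bc
    out_g = ag + bg
    assert out_c.shape == out_g.shape
    assert numpy.allclose(out_c, numpy.asarray(out_g))
# ---------------------------------------------------------------------------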
def test_ielemwise2_ops_array(): for op in ioperators2: for dtype1 in dtypes_test: for dtype2 in dtypes_test: yield ielemwise2_ops_array, op, dtype1, dtype2, (50,) class test_elemwise_output_not_broadcasted(TestCase): def test_all(self): test_values = [((1, 4), (6, 4)), ((2, 1, 8, 7), (2, 2, 8, 7))] for shapea, shapeb in test_values: # Sould fail: dimensions are not all equal. self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb, False) # Should fail: broascast should not be done on output. self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb, True) # Should fail: dimensions are not all equal. self.assertRaises(ValueError, self.check_elemwise2, shapeb, shapeb, shapea, False) # Should fail: broadcast should not be done on output. self.assertRaises(ValueError, self.check_elemwise2, shapeb, shapeb, shapea, True) # Should pass: output would be done on read-only input. self.run_ielemwise2(shapeb, shapea, broadcast=True) # Should pass: output would be done on read-only inputs. self.check_elemwise2(shapea, shapea, shapeb, broadcast=True) self.check_elemwise2(shapea, shapeb, shapeb, broadcast=True) self.check_elemwise2(shapeb, shapea, shapeb, broadcast=True) @guard_devsup def run_ielemwise2(self, shapea, shapeb, broadcast=True): na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary) nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary) ielemwise2(ga, '+', gb, broadcast=broadcast) na += nb assert numpy.allclose(na, numpy.asarray(ga), atol=1e-6) @guard_devsup def check_elemwise2(self, shapea, shapeb, output_shape, broadcast=True): # We rewrite this version of elemwise2 to skip the scaling of output # that is done in the official elemwise2 function. na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary) nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary) odtype = get_common_dtype(ga, gb, True) res = gpuarray.empty(output_shape, dtype=odtype, context=ga.context, cls=ga.__class__) a_arg = as_argument(ga, 'a', read=True) b_arg = as_argument(gb, 'b', read=True) res_arg = as_argument(res, 'res', write=True) args = [res_arg, a_arg, b_arg] oper = "res = (%(out_t)s)a %(op)s (%(out_t)s)b" % { 'op': '+', 'out_t': dtype_to_ctype(odtype)} k = GpuElemwise(ga.context, oper, args, convert_f16=True) k(res, ga, gb, broadcast=broadcast) nres = na + nb assert numpy.allclose(nres, numpy.asarray(res), atol=1e-6) @guard_devsup def elemwise2_ops_array(op, dtype1, dtype2, shape): ac, ag = gen_gpuarray(shape, dtype1, ctx=context, cls=elemary) bc, bg = gen_gpuarray(shape, dtype2, nozeros=True, ctx=context, cls=elemary) out_c = op(ac, bc) out_g = op(ag, bg) assert out_c.shape == out_g.shape assert out_c.dtype == out_g.dtype assert numpy.allclose(out_c, numpy.asarray(out_g)) @guard_devsup def ielemwise2_ops_array(op, dtype1, dtype2, shape): incr = 0 if op == operator.isub and dtype1[0] == 'u': # array elements are smaller than 10 by default, so we avoid underflow incr = 10 ac, ag = gen_gpuarray(shape, dtype1, incr=incr, ctx=context, cls=elemary) bc, bg = gen_gpuarray(shape, dtype2, nozeros=True, ctx=context, cls=elemary) try: out_c = op(ac, bc) except TypeError: # TODO: currently, we use old Numpy semantic and tolerate more case. 
# So we can't test that we raise the same error return out_g = op(ag, bg) assert out_g is ag assert numpy.allclose(out_c, numpy.asarray(out_g), atol=1e-6) def test_elemwise_f16(): yield elemwise1_ops_array, operator.neg, 'float16' yield elemwise2_ops_array, operator.add, 'float16', 'float16', (50,) yield ielemwise2_ops_array, operator.iadd, 'float16', 'float16', (50,) def test_elemwise2_ops_mixed(): for op in operators2: for dtype in dtypes_test: for elem in elems: yield elemwise2_ops_mixed, op, dtype, (50,), elem def test_ielemwise2_ops_mixed(): for op in ioperators2: for dtype in dtypes_test: for elem in elems: yield ielemwise2_ops_mixed, op, dtype, (50,), elem @guard_devsup def elemwise2_ops_mixed(op, dtype, shape, elem): c, g = gen_gpuarray(shape, dtype, ctx=context, cls=elemary) out_c = op(c, elem) out_g = op(g, elem) assert out_c.shape == out_g.shape assert out_c.dtype == out_g.dtype assert numpy.allclose(out_c, numpy.asarray(out_g)) c, g = gen_gpuarray(shape, dtype, nozeros=True, ctx=context, cls=elemary) out_c = op(elem, c) out_g = op(elem, g) assert out_c.shape == out_g.shape assert out_c.dtype == out_g.dtype assert numpy.allclose(out_c, numpy.asarray(out_g)) @guard_devsup def ielemwise2_ops_mixed(op, dtype, shape, elem): incr = 0 if op == operator.isub and dtype[0] == 'u': # array elements are smaller than 10 by default, so we avoid underflow incr = 10 c, g = gen_gpuarray(shape, dtype, incr=incr, ctx=context, cls=elemary) try: out_c = op(c, elem) except TypeError: # TODO: currently, we use old Numpy semantic and tolerate more case. # So we can't test that we raise the same error return out_g = op(g, elem) assert out_g is g assert out_c.shape == out_g.shape assert out_c.dtype == out_g.dtype assert numpy.allclose(out_c, numpy.asarray(out_g)) def test_divmod(): for dtype1 in dtypes_test: for dtype2 in dtypes_test: yield divmod_array, dtype1, dtype2, (50,) for dtype in dtypes_test: for elem in elems: yield divmod_mixed, dtype, (50,), elem @guard_devsup def divmod_array(dtype1, dtype2, shape): ac, ag = gen_gpuarray(shape, dtype1, ctx=context, cls=elemary) bc, bg = gen_gpuarray(shape, dtype2, nozeros=True, ctx=context, cls=elemary) out_c = divmod(ac, bc) out_g = divmod(ag, bg) assert out_c[0].shape == out_g[0].shape assert out_c[1].shape == out_g[1].shape assert out_c[0].dtype == out_g[0].dtype assert out_c[1].dtype == out_g[1].dtype assert numpy.allclose(out_c[0], numpy.asarray(out_g[0])) assert numpy.allclose(out_c[1], numpy.asarray(out_g[1])) @guard_devsup def divmod_mixed(dtype, shape, elem): c, g = gen_gpuarray(shape, dtype, nozeros=True, ctx=context, cls=elemary) out_c = divmod(c, elem) out_g = divmod(g, elem) assert out_c[0].shape == out_g[0].shape assert out_c[1].shape == out_g[1].shape assert out_c[0].dtype == out_g[0].dtype assert out_c[1].dtype == out_g[1].dtype assert numpy.allclose(out_c[0], numpy.asarray(out_g[0])) assert numpy.allclose(out_c[1], numpy.asarray(out_g[1])) out_c = divmod(elem, c) out_g = divmod(elem, g) assert out_c[0].shape == out_g[0].shape assert out_c[1].shape == out_g[1].shape assert out_c[0].dtype == out_g[0].dtype assert out_c[1].dtype == out_g[1].dtype assert numpy.allclose(out_c[0], numpy.asarray(out_g[0])) assert numpy.allclose(out_c[1], numpy.asarray(out_g[1])) def test_elemwise_bool(): a = gpuarray.empty((2,), context=context) exc = None try: bool(a) except ValueError as e: exc = e assert exc is not None a = gpuarray.zeros((1,), context=context) assert not bool(a) a = gpuarray.zeros((), context=context) assert not bool(a) def test_broadcast(): 
for shapea, shapeb in [((3, 5), (3, 5)), ((1, 5), (3, 5)), ((3, 5), (3, 1)), ((1, 5), (3, 1)), ((3, 1), (3, 5)), ((3, 5), (3, 1)), ((1, 1), (1, 1)), ((3, 4, 5), (4, 5)), ((4, 5), (3, 4, 5)), ((), ())]: yield broadcast, shapea, shapeb def broadcast(shapea, shapeb): ac, ag = gen_gpuarray(shapea, 'float32', ctx=context, cls=elemary) bc, bg = gen_gpuarray(shapeb, 'float32', ctx=context, cls=elemary) rc = ac + bc rg = ag + bg check_meta_content(rg, rc) _inf_preamb_tpl = Template(''' WITHIN_KERNEL ${flt} infinity() {return INFINITY;} WITHIN_KERNEL ${flt} neg_infinity() {return -INFINITY;} ''') def test_infinity(): for dtype in ['float32', 'float64']: ac, ag = gen_gpuarray((2,), dtype, ctx=context, cls=elemary) out_g = ag._empty_like_me() flt = 'ga_float' if dtype == 'float32' else 'ga_double' out_arg = arg('out', out_g.dtype, scalar=False, read=False, write=True) preamble = _inf_preamb_tpl.render(flt=flt) # +infinity ac[:] = numpy.inf expr_inf = 'out = infinity()' kernel = GpuElemwise(context, expr_inf, [out_arg], preamble=preamble) kernel(out_g) assert numpy.array_equal(ac, numpy.asarray(out_g)) # -infinity ac[:] = -numpy.inf expr_neginf = 'out = neg_infinity()' kernel = GpuElemwise(context, expr_neginf, [out_arg], preamble=preamble) kernel(out_g) assert numpy.array_equal(ac, numpy.asarray(out_g)) libgpuarray-0.7.6/pygpu/tests/test_gpu_ndarray.py000066400000000000000000000642461326743622600223260ustar00rootroot00000000000000from __future__ import print_function import unittest import copy from six.moves import range from six import PY3 import pickle import numpy from nose.tools import assert_raises import pygpu from pygpu.gpuarray import GpuArray, GpuKernel from .support import (guard_devsup, check_meta, check_flags, check_all, check_content, gen_gpuarray, context as ctx, dtypes_all, dtypes_no_complex, skip_single_f) def product(*args, **kwds): # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111 pools = map(tuple, args) * kwds.get('repeat', 1) result = [[]] for pool in pools: result = [x + [y] for x in result for y in pool] for prod in result: yield tuple(prod) def permutations(elements): if len(elements) <= 1: yield elements else: for perm in permutations(elements[1:]): for i in range(len(elements)): yield perm[:i] + elements[:1] + perm[i:] def test_hash(): g = pygpu.empty((2, 3), context=ctx) exc = None try: hash(g) except TypeError as e: exc = e assert exc is not None def test_bool(): for data in [numpy.empty((0, 33)), [[1]], [[0]], [], [0], [1], 0, 1]: assert (bool(pygpu.asarray(data, context=ctx)) == bool(numpy.asarray(data))) def test_transfer(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: yield transfer, shp, dtype, offseted def transfer(shp, dtype, offseted): a, b = gen_gpuarray(shp, dtype, offseted, ctx=ctx) # Test that passing dtype doesn't break. 
c = numpy.asarray(b, dtype=dtype) c = numpy.asarray(b) assert numpy.allclose(c, a) assert a.shape == b.shape == c.shape assert a.strides == b.strides == c.strides assert a.dtype == b.dtype == c.dtype == dtype assert c.flags.c_contiguous def test_cast(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype1 in dtypes_no_complex: for dtype2 in dtypes_no_complex: yield cast, shp, dtype1, dtype2 @guard_devsup def cast(shp, dtype1, dtype2): a, b = gen_gpuarray(shp, dtype1, False, ctx=ctx) ac = a.astype(dtype2) bc = b.astype(dtype2) assert ac.dtype == bc.dtype assert ac.shape == bc.shape assert numpy.allclose(a, numpy.asarray(b)) def test_transfer_not_contiguous(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: yield transfer_not_contiguous, shp, dtype @guard_devsup def transfer_not_contiguous(shp, dtype): a = numpy.random.rand(*shp) * 10 b = pygpu.array(a, context=ctx) a = a[::-1] b = b[::-1] c = numpy.asarray(b) assert numpy.allclose(c, a) assert a.shape == b.shape == c.shape # the result array (c) is C contiguous assert a.strides == b.strides == (-c.strides[0],) + c.strides[1:] assert a.dtype == b.dtype == c.dtype assert c.flags.c_contiguous def test_transfer_fortran(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: yield transfer_fortran, shp, dtype @guard_devsup def transfer_fortran(shp, dtype): a = numpy.random.rand(*shp) * 10 b = pygpu.array(a, context=ctx) a_ = numpy.asfortranarray(a) if len(shp) > 1: assert a_.strides != a.strides a = a_ b = pygpu.asfortranarray(b) c = numpy.asarray(b) assert a.shape == b.shape == c.shape assert a.dtype == b.dtype == c.dtype assert a.flags.f_contiguous assert c.flags.f_contiguous assert a.strides == b.strides == c.strides assert numpy.allclose(c, a) def test_ascontiguousarray(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted_o in [True, False]: for offseted_i in [True, True]: for sliced in [1, 2, -1, -2]: for order in ['f', 'c']: yield (ascontiguousarray, shp, dtype, offseted_o, offseted_i, sliced, order) @guard_devsup def ascontiguousarray(shp, dtype, offseted_o, offseted_i, sliced, order): cpu, gpu = gen_gpuarray(shp, dtype, offseted_o, offseted_i, sliced, order, ctx=ctx) a = numpy.ascontiguousarray(cpu) b = pygpu.ascontiguousarray(gpu) # numpy upcast with a view to 1d scalar. if (sliced != 1 or shp == () or (offseted_i and len(shp) > 1)): assert b is not gpu if sliced == 1 and not offseted_i: assert (a.data is cpu.data) == (b.bytes is gpu.bytes) else: assert b is gpu assert a.shape == b.shape assert a.dtype == b.dtype assert a.flags.c_contiguous assert b.flags['C_CONTIGUOUS'] assert a.strides == b.strides assert numpy.allclose(cpu, a) assert numpy.allclose(cpu, b) def test_asfortranarray(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted_outer in [True, False]: for offseted_inner in [True, False]: for sliced in [1, 2, -1, -2]: for order in ['f', 'c']: yield (asfortranarray, shp, dtype, offseted_outer, offseted_inner, sliced, order) @guard_devsup def asfortranarray(shp, dtype, offseted_outer, offseted_inner, sliced, order): cpu, gpu = gen_gpuarray(shp, dtype, offseted_outer, offseted_inner, sliced, order, ctx=ctx) a = numpy.asfortranarray(cpu) b = pygpu.asfortranarray(gpu) # numpy upcast with a view to 1d scalar. 
if gpu.flags['F_CONTIGUOUS']: assert ctx.kind != b'cuda' or b.gpudata == gpu.gpudata elif (sliced != 1 or shp == () or (offseted_outer and len(shp) > 1) or (order != 'f' and len(shp) > 1)): assert b is not gpu else: assert b is gpu assert a.shape == b.shape assert a.dtype == b.dtype assert a.flags.f_contiguous assert b.flags['F_CONTIGUOUS'] if not any([s == 1 for s in cpu.shape]): # Older version then Numpy 1.10 do not set c/f contiguous more # frequently as we do. This cause extra copy. assert a.strides == b.strides assert numpy.allclose(cpu, a) assert numpy.allclose(cpu, b) def test_zeros(): for shp in [(), (0,), (5,), (0, 0), (1, 0), (0, 1), (6, 7), (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), (4, 8, 9), (1, 8, 9)]: for order in ["C", "F"]: for dtype in dtypes_all: yield zeros, shp, order, dtype @guard_devsup def zeros(shp, order, dtype): x = pygpu.zeros(shp, dtype, order, context=ctx) y = numpy.zeros(shp, dtype, order) check_all(x, y) def test_zeros_no_dtype(): # no dtype and order param x = pygpu.zeros((), context=ctx) y = numpy.zeros(()) check_meta(x, y) def test_zero_noparam(): try: pygpu.zeros() assert False except TypeError: pass def test_empty(): for shp in [(), (0,), (5,), (0, 0), (1, 0), (0, 1), (6, 7), (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), (4, 8, 9), (1, 8, 9)]: for order in ["C", "F"]: for dtype in dtypes_all: yield empty, shp, order, dtype def empty(shp, order, dtype): x = pygpu.empty(shp, dtype, order, context=ctx) y = numpy.empty(shp, dtype, order) check_meta(x, y) def test_empty_no_dtype(): x = pygpu.empty((), context=ctx) # no dtype and order param y = numpy.empty(()) check_meta(x, y) def test_empty_no_params(): try: pygpu.empty() assert False except TypeError: pass def test_mapping_getitem_ellipsis(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: yield mapping_getitem_ellipsis, shp, dtype, offseted def mapping_getitem_ellipsis(shp, dtype, offseted): a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx) b = a_gpu[...] if ctx.kind == b'cuda': assert b.gpudata == a_gpu.gpudata assert b.strides == a.strides assert b.shape == a.shape b_cpu = numpy.asarray(b) assert numpy.allclose(a, b_cpu) def test_getitem_none(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: yield getitem_none, shp def getitem_none(shp): a, a_gpu = gen_gpuarray(shp, ctx=ctx) assert numpy.allclose(a_gpu[..., None], a[..., None]) for _ in range(5): # Choose something to slice with, always works indcs = tuple(numpy.random.choice([0, slice(None), slice(1, None)], size=len(shp))) indcs = indcs[:1] + (None,) + indcs[1:] assert numpy.allclose(a_gpu[indcs], a[indcs]) if shp: assert numpy.allclose(a_gpu[1:, None], a[1:, None]) def test_mapping_setitem(): for shp in [(9,), (8, 9), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: yield mapping_setitem_ellipsis, shp, dtype, offseted yield mapping_setitem_ellipsis2, shp, dtype, offseted yield mapping_setitem_firstaxis, shp, dtype, offseted @guard_devsup def mapping_setitem_ellipsis(shp, dtype, offseted): a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx) a[...] = 2 a_gpu[...] 
= 2 assert numpy.allclose(a, numpy.asarray(a_gpu)) @guard_devsup def mapping_setitem_ellipsis2(shp, dtype, offseted): a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx) b, b_gpu = gen_gpuarray(shp[1:], dtype, False, ctx=ctx) a[:] = b a_gpu[:] = b_gpu assert numpy.allclose(a, numpy.asarray(a_gpu)) @guard_devsup def mapping_setitem_firstaxis(shp, dtype, offseted): a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx) b, b_gpu = gen_gpuarray(shp[1:], dtype, False, ctx=ctx) a[0] = b a_gpu[0] = b_gpu assert numpy.allclose(a, numpy.asarray(a_gpu)) class WriteReadTest(unittest.TestCase): def setUp(self): self.cpu, self.gpu = gen_gpuarray((3, 4, 5), ctx=ctx) self.cpu[0, 0, 0] = 80 def test_write(self): self.gpu.write(self.cpu) res = numpy.asarray(self.gpu) assert numpy.allclose(self.cpu, res) self.cpu[0, 0, 0] = 160 self.cpu.setflags(write=False) self.gpu.write(self.cpu) res = numpy.asarray(self.gpu) assert numpy.allclose(self.cpu, res) self.cpu = numpy.ndarray((2, 4, 5), dtype="float32", order='C') self.assertRaises(ValueError, self.gpu.write, self.cpu) self.cpu = numpy.ndarray((3, 4, 5), dtype="float64", order='C') self.assertRaises(ValueError, self.gpu.write, self.cpu) cpu2 = numpy.random.random((3, 4, 5)) cpu2 = numpy.asarray(cpu2, dtype='float32', order='F') self.gpu.write(cpu2) res = numpy.asarray(self.gpu) assert numpy.allclose(cpu2, res) cpu2 = numpy.random.random((3, 4, 2, 5)) cpu2 = numpy.asarray(cpu2, dtype='float32', order='C') self.gpu.write(cpu2[:, :, 0, :]) res = numpy.asarray(self.gpu) assert numpy.allclose(cpu2[:, :, 0, :], res) cpu2 = numpy.random.random((3, 4, 2, 5)) cpu2 = numpy.asarray(cpu2, dtype='float32', order='F') self.gpu.write(cpu2[:, :, 0, :]) res = numpy.asarray(self.gpu) assert numpy.allclose(cpu2[:, :, 0, :], res) def test_read(self): self.gpu.read(self.cpu) res = numpy.asarray(self.gpu) assert numpy.allclose(self.cpu, res) self.cpu = numpy.ndarray((3, 4, 5), dtype="float32", order='C') self.cpu.setflags(write=False) self.assertRaises(ValueError, self.gpu.read, self.cpu) self.cpu = numpy.ndarray((2, 4, 5), dtype="float32", order='C') self.assertRaises(ValueError, self.gpu.read, self.cpu) self.cpu = numpy.ndarray((3, 4, 5), dtype="float64", order='C') self.assertRaises(ValueError, self.gpu.read, self.cpu) self.cpu = numpy.ndarray((3, 4, 5), dtype="float32", order='F') self.assertRaises(ValueError, self.gpu.read, self.cpu) self.cpu = numpy.ndarray((3, 4, 2, 5), dtype="float32", order='C') self.assertRaises(ValueError, self.gpu.read, self.cpu[:, :, 0, :]) def test_copy_view(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [False, True]: # order1 is the order of the original data for order1 in ['c', 'f']: # order2 is the order wanted after copy for order2 in ['c', 'f']: yield copy_view, shp, dtype, offseted, order1, order2 def check_memory_region(a, a_op, b, b_op): assert (numpy.may_share_memory(a, a_op) == pygpu.gpuarray.may_share_memory(b, b_op)) @guard_devsup def copy_view(shp, dtype, offseted, order1, order2): # TODO test copy unbroadcast! 
a, b = gen_gpuarray(shp, dtype, offseted, order=order1, ctx=ctx) assert numpy.allclose(a, numpy.asarray(b)) check_flags(b, a) c = b.copy(order2) assert numpy.allclose(a, numpy.asarray(c)) check_flags(c, a.copy(order2)) check_memory_region(a, a.copy(order2), b, c) d = copy.copy(b) assert numpy.allclose(a, numpy.asarray(d)) check_flags(d, copy.copy(a)) check_memory_region(a, copy.copy(a), b, d) e = b.view() assert numpy.allclose(a, numpy.asarray(e)) check_flags(e, a.view()) check_memory_region(a, a.view(), b, e) f = copy.deepcopy(b) assert numpy.allclose(a, numpy.asarray(f)) check_flags(f, copy.deepcopy(a)) check_memory_region(a, copy.deepcopy(a), b, f) g = copy.copy(b.view()) assert numpy.allclose(a, numpy.asarray(g)) check_memory_region(a, copy.copy(a.view()), b, g) check_flags(g, copy.copy(a.view())) def test_shape(): for shps in [((), (1,)), ((5,), (1, 5)), ((5,), (5, 1)), ((2, 3), (6,)), ((6,), (2, 3)), ((1,), ()), ((4,), (-1,)), ((4, 3), (-1,)), ((4, 3), (-1, 3)), ((4, 3), (4, -1)), ((4, 3), (3, -1)), ((4, 3), (12, -1)), ((4, 3), (-1, 12)), ((5, 4, 3, 2), (2, -1, 12)), ((4, 2), (2, 2, -1)), # ((4, 3), (13, -1)), ]: for offseted in [True, False]: for order1 in ['c', 'f']: if -1 not in shps[1]: yield shape_, shps, offseted, order1 for order2 in ['a', 'c', 'f']: yield reshape, shps, offseted, order1, order2 def shape_(shps, offseted, order): ac, ag = gen_gpuarray(shps[0], 'float32', offseted, order=order, ctx=ctx) try: ac.shape = shps[1] except AttributeError: # If numpy says it can't be done, we don't try to test it return ag.shape = shps[1] assert ac.strides == ag.strides, (ac.strides, ag.strides) # np.allclose don't test shapes assert ac.shape == ag.shape, (ac.shape, ag.shape) assert numpy.allclose(ac, numpy.asarray(ag)) def reshape(shps, offseted, order1, order2): ac, ag = gen_gpuarray(shps[0], 'float32', offseted, order=order1, ctx=ctx) outc = ac.reshape(shps[1], order=order2) outg = ag.reshape(shps[1], order=order2) assert outc.shape == outg.shape assert outc.strides == outg.strides assert numpy.allclose(outc, numpy.asarray(outg)) def test_strides(): yield strides_, (4, 4), 'c', 1, (4, 4) yield strides_, (4, 4), 'c', 1, (4, 16) yield strides_, (4, 4), 'c', 1, (16, 4) yield strides_, (4, 4), 'c', 1, (16, 8) yield strides_, (4, 4), 'c', 1, (16, 0) yield strides_, (4, 4), 'c', -1, (-20, 4) yield strides_, (4, 4), 'c', -1, (-12, 4) def set_strides(a, newstr): a.strides = newstr def strides_(shp, order, sliced, newstr): ac, ag = gen_gpuarray(shp, 'float32', sliced=sliced, order=order, ctx=ctx) try: ac.strides = newstr except ValueError: assert_raises(ValueError, set_strides, ag, newstr) return ag.strides = newstr check_flags(ag, ac) assert numpy.allclose(ac, numpy.asarray(ag)) def test_transpose(): for shp in [(2, 3), (4, 8, 9), (1, 2, 3, 4)]: for offseted in [True, False]: for order in ['c', 'f']: for sliced in [1, 2, -2, -1]: yield transpose, shp, offseted, sliced, order for perm in permutations(list(range(len(shp)))): yield (transpose_perm, shp, perm, offseted, sliced, order) def transpose(shp, offseted, sliced, order): ac, ag = gen_gpuarray(shp, 'float32', offseted, sliced=sliced, order=order, ctx=ctx) rc = ac.transpose() rg = ag.transpose() check_all(rg, rc) # also check that we are exactly equal since this only a copy op assert numpy.all(rc == numpy.asarray(rg)) # Test NumPy interface rg = numpy.transpose(ag) check_all(rg, rc) # also check that we are exactly equal since this only a copy op assert numpy.all(rc == numpy.asarray(rg)) def transpose_perm(shp, perm, offseted, sliced, 
order): ac, ag = gen_gpuarray(shp, 'float32', offseted, sliced=sliced, order=order, ctx=ctx) rc = ac.transpose(perm) rg = ag.transpose(perm) check_all(rg, rc) # also check that we are exactly equal since this only a copy op assert numpy.all(rc == numpy.asarray(rg)) # Test NumPy interface rg = numpy.transpose(ag, perm) check_all(rg, rc) # also check that we are exactly equal since this only a copy op assert numpy.all(rc == numpy.asarray(rg)) def test_transpose_args(): ac, ag = gen_gpuarray((4, 3, 2), 'float32', ctx=ctx) rc = ac.transpose(0, 2, 1) rg = ag.transpose(0, 2, 1) check_all(rg, rc) # also check that we are exactly equal since this only a copy op assert numpy.all(rc == numpy.asarray(rg)) def test_len(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: yield len_, shp, dtype, offseted def len_(shp, dtype, offseted): a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx) assert len(a_gpu) == shp[0] def test_mapping_getitem_w_int(): for dtype in dtypes_all: for offseted in [True, False]: yield mapping_getitem_w_int, dtype, offseted @guard_devsup def mapping_getitem_w_int(dtype, offseted): # test vector dim = (2,) a, _a = gen_gpuarray(dim, dtype, offseted, ctx=ctx) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[-1], a[-1]) _cmp(_a[1], a[1]) _cmp(_a[0], a[0]) _cmp(_a[::1], a[::1]) _cmpNs(_a[::-1], a[::-1]) _cmp(_a[...], a[...]) _cmpf(_a, 2) # test scalar dim = () a, _a = gen_gpuarray(dim, dtype, offseted, ctx=ctx) _cmp(_a[...], a[...]) _cmpf(_a, 0) _cmpf(_a, slice(1)) # test 4d-tensor dim = (5, 4, 3, 2) a, _a = gen_gpuarray(dim, dtype, offseted, ctx=ctx) _cmpf(_a, slice(-1), slice(-1), 10, -10) _cmpf(_a, slice(-1), slice(-1), -10, slice(-1)) _cmpf(_a, 0, slice(0, -1, -20), -10) _cmpf(_a, 10) _cmpf(_a, (10, 0, 0, 0)) _cmpf(_a, -10) # test with integer _cmp(_a[1], a[1]) _cmp(_a[-1], a[-1]) _cmp(_a[numpy.int64(1)], a[numpy.int64(1)]) _cmp(_a[numpy.int64(-1)], a[numpy.int64(-1)]) # test with slice _cmp(_a[1:], a[1:]) _cmp(_a[1:2], a[1:2]) _cmp(_a[-1:1], a[-1:1]) _cmp(_a[6:7:], a[6:7:]) # test with tuple (mix slice, integer, numpy.int64) _cmpNs(_a[0, 0, ::numpy.int64(-1), ::-1], a[0, 0, ::-1, ::-1]) _cmpNs(_a[:, :, ::numpy.int64(-1), ::-1], a[:, :, ::-1, ::-1]) _cmpNs(_a[:, :, numpy.int64(1), -1], a[:, :, 1, -1]) _cmpNs(_a[:, :, ::-1, ::-1], a[:, :, ::-1, ::-1]) _cmpNs(_a[:, :, ::-10, ::-10], a[:, :, ::-10, ::-10]) _cmpNs(_a[:, :, 1, -1], a[:, :, 1, -1]) _cmpNs(_a[:, :, -1, :], a[:, :, -1, :]) _cmpNs(_a[:, ::-2, -1, :], a[:, ::-2, -1, :]) _cmpNs(_a[:, ::-20, -1, :], a[:, ::-20, -1, :]) _cmpNs(_a[:, ::-2, -1], a[:, ::-2, -1]) _cmpNs(_a[0, ::-2, -1], a[0, ::-2, -1]) _cmp(_a[-1, -1, -1, -2], a[-1, -1, -1, -2]) # test ellipse _cmp(_a[...], a[...]) def _cmp(x, y): assert isinstance(x, GpuArray) assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"], (x.flags, y.flags) if y.size == 0: # F_CONTIGUOUS flags change definition with different numpy version # TODO: ideally, we should be F_CONTIGUOUS in that case. pass elif not (skip_single_f and y.shape == ()): assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"], (x.flags, y.flags) else: assert x.flags["F_CONTIGUOUS"] # GpuArrays always own their data so don't check that flag. 
if x.flags["WRITEABLE"] != y.flags["WRITEABLE"]: assert x.ndim == 0 assert x.flags["ALIGNED"] == y.flags["ALIGNED"], (x.flags, y.flags) assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"], (x.flags, y.flags) x = numpy.asarray(x) assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides if not numpy.all(x == y): print(x) print(y) assert numpy.all(x == y), (x, y) def _cmpNs(x, y): """ Don't compare the stride after the transfer There is a copy that have been made on the gpu before the transfer """ assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] assert x.flags["ALIGNED"] == y.flags["ALIGNED"] # we don't check owndata since it is always true for GpuArrays assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] x_ = numpy.asarray(x) assert x_.shape == y.shape assert x_.dtype == y.dtype assert numpy.all(x_ == y), (x_, y) def _cmpf(x, *y): try: x.__getitem__(y) except IndexError: pass else: raise Exception("Did not generate out or bound error") def _cmpfV(x, *y): try: if len(y) == 1: x.__getitem__(*y) else: x.__getitem__(y) except ValueError: pass else: raise Exception("Did not generate value error") def test_take1(): yield do_take1, (4, 3), [2, 0], False yield do_take1, (4, 3), [2, 0], True yield do_take1, (12, 4, 3), [1, 1, 1, 1, 1, 2, 2, 3, 3, 0, 0, 9], False def do_take1(shp, idx, offseted): c, g = gen_gpuarray(shp, dtype='float32', ctx=ctx, order='c') ci = numpy.asarray(idx) gi = pygpu.asarray(ci, context=ctx) rc = c.take(ci, axis=0) rg = g.take1(gi) check_content(rg, rc) def test_flags(): for fl in ['C', 'F', 'W', 'B', 'O', 'A', 'U', 'CA', 'FA', 'FNC', 'FORC', 'CARRAY', 'FARRAY', 'FORTRAN', 'BEHAVED', 'OWNDATA', 'ALIGNED', 'WRITEABLE', 'CONTIGUOUS', 'UPDATEIFCOPY', 'C_CONTIGUOUS', 'F_CONTIGUOUS']: yield flag_dict, fl for p in ['c_contiguous', 'f_contiguous', 'contiguous', 'fortran', 'updateifcopy', 'owndata', 'aligned', 'writeable', 'behaved', 'carray', 'forc', 'fnc', 'farray']: yield flag_prop, p def flag_dict(fl): c2, g2 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='c') c3, g3 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='f') assert c2.flags[fl] == g2.flags[fl] assert c3.flags[fl] == g3.flags[fl] def flag_prop(p): c2, g2 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='c') c3, g3 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='f') assert getattr(c2.flags, p) == getattr(g2.flags, p) assert getattr(c3.flags, p) == getattr(g3.flags, p) class TestPickle(unittest.TestCase): def test_GpuArray(self): with self.assertRaises(RuntimeError): pickle.dumps(pygpu.zeros((32,), context=ctx)) with self.assertRaises(RuntimeError): pickle.dumps(pygpu.zeros((32,), context=ctx), protocol=0) with self.assertRaises(RuntimeError): pickle.dumps(pygpu.zeros((32,), context=ctx), protocol=1) with self.assertRaises(RuntimeError): pickle.dumps(pygpu.zeros((32,), context=ctx), protocol=2) if PY3: with self.assertRaises(RuntimeError): pickle.dumps(pygpu.zeros((32,), context=ctx), protocol=3) with self.assertRaises(RuntimeError): pickle.dumps(pygpu.zeros((32,), context=ctx), protocol=-1) def test_GpuContext(self): with self.assertRaises(RuntimeError): pickle.dumps(ctx) with self.assertRaises(RuntimeError): pickle.dumps(ctx, protocol=0) with self.assertRaises(RuntimeError): pickle.dumps(ctx, protocol=1) with self.assertRaises(RuntimeError): pickle.dumps(ctx, 
protocol=2) if PY3: with self.assertRaises(RuntimeError): pickle.dumps(ctx, protocol=3) with self.assertRaises(RuntimeError): pickle.dumps(ctx, protocol=-1) def test_GpuKernel(self): k = GpuKernel("#include \"cluda.h\"\nKERNEL void " "k(GLOBAL_MEM ga_float *in)" "{in[0] = 0;}", "k", [], context=ctx) with self.assertRaises(RuntimeError): pickle.dumps(k) with self.assertRaises(RuntimeError): pickle.dumps(k, protocol=0) with self.assertRaises(RuntimeError): pickle.dumps(k, protocol=1) with self.assertRaises(RuntimeError): pickle.dumps(k, protocol=2) if PY3: with self.assertRaises(RuntimeError): pickle.dumps(k, protocol=3) with self.assertRaises(RuntimeError): pickle.dumps(k, protocol=-1) libgpuarray-0.7.6/pygpu/tests/test_operations.py000066400000000000000000000051141326743622600221630ustar00rootroot00000000000000import numpy import pygpu from .support import (gen_gpuarray, context, SkipTest) def test_array_split(): xc, xg = gen_gpuarray((8,), 'float32', ctx=context) rc = numpy.array_split(xc, 3) rg = pygpu.array_split(xg, 3) assert len(rc) == len(rg) for pc, pg in zip(rc, rg): numpy.testing.assert_allclose(pc, numpy.asarray(pg)) xc, xg = gen_gpuarray((8,), 'float32', ctx=context) rc = numpy.array_split(xc, 3, axis=-1) rg = pygpu.array_split(xg, 3, axis=-1) assert len(rc) == len(rg) for pc, pg in zip(rc, rg): numpy.testing.assert_allclose(pc, numpy.asarray(pg)) def test_split(): for spl in (3, [3, 5, 6, 10]): yield xsplit, '', (9,), spl def test_xsplit(): if tuple(int(v) for v in numpy.version.version.split('.')[:2]) < (1, 11): raise SkipTest("Numpy version too old") for l in ('h', 'v'): for spl in (2, [3, 6]): yield xsplit, l, (4, 4), spl yield xsplit, l, (2, 2, 2), 2 for spl in (2, [3, 6]): yield xsplit, 'd', (2, 2, 4), spl def xsplit(l, shp, spl): xc, xg = gen_gpuarray(shp, 'float32', ctx=context) n = l + 'split' rc = getattr(numpy, n)(xc, spl) rg = getattr(pygpu, n)(xg, spl) assert len(rc) == len(rg) for pc, pg in zip(rc, rg): numpy.testing.assert_allclose(pc, numpy.asarray(pg)) def test_concatenate(): ac, ag = gen_gpuarray((2, 2), 'float32', ctx=context) bc, bg = gen_gpuarray((1, 2), 'float32', ctx=context) rc = numpy.concatenate((ac, bc), axis=0) rg = pygpu.concatenate((ag, bg), axis=0) numpy.testing.assert_allclose(rc, numpy.asarray(rg)) rc = numpy.concatenate((ac, bc.T), axis=1) rg = pygpu.concatenate((ag, bg.T), axis=1) numpy.testing.assert_allclose(rc, numpy.asarray(rg)) rc = numpy.concatenate((ac, bc.T), axis=-1) rg = pygpu.concatenate((ag, bg.T), axis=-1) numpy.testing.assert_allclose(rc, numpy.asarray(rg)) def test_hstack(): for shp in [(3,), (3, 1)]: yield xstack, 'h', (shp, shp), (), context def test_vstack(): for shp in [(3,), (3, 1)]: yield xstack, 'v', (shp, shp), (), context def test_dstack(): for shp in [(3,), (3, 1)]: yield xstack, 'd', (shp, shp), (), context def xstack(l, shps, tup, ctx): tupc = list(tup) tupg = list(tup) for shp in shps: tc, tg = gen_gpuarray(shp, 'float32', ctx=context) tupc.append(tc) tupg.append(tg) n = l + 'stack' rc = getattr(numpy, n)(tuple(tupc)) rg = getattr(pygpu, n)(tuple(tupg), ctx) numpy.testing.assert_allclose(rc, numpy.asarray(rg)) libgpuarray-0.7.6/pygpu/tests/test_reduction.py000066400000000000000000000105221326743622600217730ustar00rootroot00000000000000import numpy from nose.tools import assert_raises from pygpu import gpuarray, ndgpuarray as elemary from pygpu.reduction import ReductionKernel from .support import (guard_devsup, check_meta_content, context, gen_gpuarray, dtypes_no_complex_big, dtypes_no_complex) def 
test_red_array_basic(): for dtype in dtypes_no_complex_big: for shape, redux in [((10,), [True]), ((20, 30), [True, True]), ((20, 30), [True, False]), ((20, 30), [False, True]), ((8, 5, 10), [True, True, True]), ((8, 5, 10), [True, True, False]), ((8, 5, 10), [True, False, True]), ((8, 5, 10), [False, True, True]), ((8, 5, 10), [True, False, False]), ((8, 5, 10), [False, True, False]), ((8, 5, 10), [False, False, True]), ]: yield red_array_sum, dtype, shape, redux @guard_devsup def red_array_sum(dtype, shape, redux): c, g = gen_gpuarray(shape, dtype, ctx=context) axes = [i for i in range(len(redux)) if redux[i]] axes.reverse() out_c = c # numpy.sum doesn't support multiple axis before 1.7.0 for ax in axes: out_c = numpy.apply_along_axis(sum, ax, out_c).astype(dtype) out_g = ReductionKernel(context, dtype, "0", "a + b", redux)(g) assert out_c.shape == out_g.shape assert out_g.dtype == numpy.dtype(dtype) # since we do not use the same summing algorithm, # there will be differences assert numpy.allclose(out_c, numpy.asarray(out_g), rtol=2e-5) def test_red_big_array(): for redux in [[True, False, False], [True, False, True], [False, True, True], [False, True, False]]: yield red_array_sum, 'float32', (2000, 30, 100), redux def test_red_broadcast(): from pygpu.tools import as_argument dtype = float xshape = (5, 10, 15) yshape = (1, 10, 15) redux = [False, True, False] nx, gx = gen_gpuarray(xshape, dtype, ctx=context) ny, gy = gen_gpuarray(yshape, dtype, ctx=context) nz = nx*ny axes = [i for i in range(len(redux)) if redux[i]] axes.reverse() # numpy.sum doesn't support multiple axis before 1.7.0 for ax in axes: nz = numpy.apply_along_axis(sum, ax, nz).astype(dtype) args = [as_argument(gx, 'a'), as_argument(gy, 'b')] gz = ReductionKernel(context, dtype, "0", "a+b", redux, map_expr="a[i]*b[i]", arguments=args)( gx, gy, broadcast=True) assert numpy.allclose(nz, numpy.asarray(gz)) def test_reduction_ops(): for axis in [None, 0, 1]: for op in ['all', 'any']: yield reduction_op, op, 'bool', axis for op in ['prod', 'sum']: # 'min', 'max']: for dtype in dtypes_no_complex: yield reduction_op, op, dtype, axis def reduction_op(op, dtype, axis): c, g = gen_gpuarray((2, 3), dtype=dtype, ctx=context, cls=elemary) rc = getattr(c, op)(axis=axis) rg = getattr(g, op)(axis=axis) check_meta_content(rg, rc) outc = numpy.empty(rc.shape, dtype=rc.dtype) outg = gpuarray.empty(rg.shape, dtype=rg.dtype, context=context) rc = getattr(c, op)(axis=axis, out=outc) rg = getattr(g, op)(axis=axis, out=outg) check_meta_content(outg, outc) def test_reduction_wrong_type(): c, g = gen_gpuarray((2, 3), dtype='float32', ctx=context, cls=elemary) out1 = gpuarray.empty((2, 3), dtype='int32', context=context) out2 = gpuarray.empty((3, 2), dtype='float32', context=context) try: g.sum(out=out1) assert False, "Expected a TypeError out of the sum" except TypeError: pass try: g.sum(out=out2) assert False, "Expected a TypeError out of the sum" except TypeError: pass def test_reduction_0d(): c, g = gen_gpuarray((), dtype='bool', ctx=context, cls=elemary) rc = c.any() rg = g.any() assert numpy.all(rc == numpy.asarray(rg)) rc = c.all() rg = g.all() assert numpy.all(rc == numpy.asarray(rg)) def test_reduction_f16(): c, g = gen_gpuarray((3,), dtype='float16', ctx=context, cls=elemary) assert_raises(NotImplementedError, g.sum) libgpuarray-0.7.6/pygpu/tests/test_tools.py000066400000000000000000000074461326743622600211520ustar00rootroot00000000000000from pygpu.tools import check_args from .support import context, gen_gpuarray def 
test_check_args_simple(): ac, ag = gen_gpuarray((50,), 'float32', ctx=context) bc, bg = gen_gpuarray((50,), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag, bg)) assert n == 50 assert nd == 1 assert dims == (50,) assert strs == ((4,), (4,)) assert offsets == (0, 0) ac, ag = gen_gpuarray((50, 1, 20), 'float32', ctx=context) bc, bg = gen_gpuarray((50, 1, 20), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag, bg)) assert n == 1000 assert nd == 3 assert dims == (50, 1, 20) assert strs == ((80, 80, 4), (80, 80, 4)) assert offsets == (0, 0) def test_check_args_collapse_1(): ac, ag = gen_gpuarray((50, 1, 20), 'float32', ctx=context) bc, bg = gen_gpuarray((50, 1, 20), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag, bg), collapse=False) assert n == 1000 assert nd == 3 assert dims == (50, 1, 20) assert strs == ((80, 80, 4), (80, 80, 4)) assert offsets == (0, 0) n, nd, dims, strs, offsets = check_args((ag, bg), collapse=True) assert n == 1000 assert nd == 1 assert dims == (1000,) assert strs == ((4,), (4,)) assert offsets == (0, 0) def test_check_args_collapse_2(): ac, ag = gen_gpuarray((50, 1, 20), 'float32', ctx=context, sliced=2, offseted_inner=True) bc, bg = gen_gpuarray((50, 1, 20), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag, bg), collapse=True) assert n == 1000 assert nd == 2 assert dims == (50, 20) assert strs == ((168, 4), (80, 4)) assert offsets == (4, 0) def test_check_args_collapse_3(): ac, ag = gen_gpuarray((50, 2, 10), 'float32', ctx=context, sliced=2, offseted_outer=True) bc, bg = gen_gpuarray((50, 2, 10), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag, bg), collapse=True) assert n == 1000 assert nd == 2 assert dims == (50, 20) assert strs == ((160, 4), (80, 4)) assert offsets == (80, 0) def test_check_args_collapse_4(): ac, ag = gen_gpuarray((1,), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag,), collapse=False) assert n == 1 assert nd == 1 assert dims == (1,) assert strs == ((4,),) assert offsets == (0,) ac, ag = gen_gpuarray((1, 1), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag,), collapse=True) assert n == 1 assert nd == 1 assert dims == (1,) assert strs == ((4,),) assert offsets == (0,) def test_check_args_broadcast_1(): ac, ag = gen_gpuarray((1,), 'float32', ctx=context) bc, bg = gen_gpuarray((50,), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag, bg), broadcast=True) assert n == 50 assert nd == 1 assert dims == (50,) assert strs == ((0,), (4,)) assert offsets == (0, 0) def test_check_args_broadcast_2(): ac, ag = gen_gpuarray((50, 1, 20), 'float32', ctx=context, sliced=2, offseted_inner=True) bc, bg = gen_gpuarray((50, 1, 20), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag, bg), collapse=True, broadcast=True) assert n == 1000 assert nd == 2 assert dims == (50, 20) assert strs == ((168, 4), (80, 4)) assert offsets == (4, 0) def test_check_args_broadcast_3(): ac, ag = gen_gpuarray((10, 20, 30), 'float32', ctx=context) bc, bg = gen_gpuarray((1, 1, 1), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag, bg), broadcast=True) assert n == 6000 assert nd == 3 assert dims == (10, 20, 30) assert strs == ((2400, 120, 4), (0, 0, 0)) assert offsets == (0, 0) libgpuarray-0.7.6/pygpu/tools.py000066400000000000000000000143701326743622600167430ustar00rootroot00000000000000import functools import six from six.moves import reduce from heapq import nsmallest from operator import itemgetter, mul 
import numpy from .dtypes import dtype_to_ctype, _fill_dtype_registry from .gpuarray import GpuArray _fill_dtype_registry() def as_argument(obj, name): if isinstance(obj, GpuArray): return ArrayArg(obj.dtype, name) else: return ScalarArg(numpy.asarray(obj).dtype, name) class Argument(object): def __init__(self, dtype, name): self.dtype = dtype self.name = name def ctype(self): return dtype_to_ctype(self.dtype) def __hash__(self): return hash(type(self)) ^ hash(self.dtype) ^ hash(self.name) def __eq__(self, other): return (type(self) == type(other) and self.dtype == other.dtype and self.name == other.name) class ArrayArg(Argument): def decltype(self): return "GLOBAL_MEM {} *".format(self.ctype()) def expr(self): return "{}[i]".format(self.name) def isarray(self): return True def spec(self): return GpuArray class ScalarArg(Argument): def decltype(self): return self.ctype() def expr(self): return self.name def isarray(self): return False def spec(self): return self.dtype def check_args(args, collapse=False, broadcast=False): """ Returns the properties of arguments and checks if they all match (are all the same shape) If `collapse` is True dimension collapsing will be performed. If `collapse` is False dimension collapsing will not be performed. If `broadcast` is True array broadcasting will be performed which means that dimensions which are of size 1 in some arrays but not others will be repeated to match the size of the other arrays. If `broadcast` is False no broadcasting takes place. """ # For compatibility with old collapse=None option if collapse is None: collapse = True strs = [] offsets = [] dims = None for arg in args: if isinstance(arg, GpuArray): strs.append(arg.strides) offsets.append(arg.offset) if dims is None: n, nd, dims = arg.size, arg.ndim, arg.shape else: if arg.ndim != nd: raise ValueError("Array order differs") if not broadcast and arg.shape != dims: raise ValueError("Array shape differs") else: strs.append(None) offsets.append(None) if dims is None: raise TypeError("No arrays in kernel arguments, " "something is wrong") tdims = dims if broadcast or collapse: # make the strides and dims editable dims = list(dims) strs = [list(str) if str is not None else str for str in strs] if broadcast: # Set strides to 0s when needed. # Get the full shape in dims (no ones unless all arrays have it). 
if 1 in dims: for i, ary in enumerate(args): if strs[i] is None: continue shp = ary.shape for i, d in enumerate(shp): if dims[i] != d and dims[i] == 1: dims[i] = d n *= d tdims = tuple(dims) for i, ary in enumerate(args): if strs[i] is None: continue shp = ary.shape if tdims != shp: for j, d in enumerate(shp): if dims[j] != d: # Might want to add a per-dimension enable mechanism if d == 1: strs[i][j] = 0 else: raise ValueError("Array shape differs") if collapse and nd > 1: # remove dimensions that are of size 1 for i in range(nd - 1, -1, -1): if nd > 1 and dims[i] == 1: del dims[i] for str in strs: if str is not None: del str[i] nd -= 1 # collapse contiguous dimensions for i in range(nd - 1, 0, -1): if all(str is None or str[i] * dims[i] == str[i - 1] for str in strs): dims[i - 1] *= dims[i] del dims[i] for str in strs: if str is not None: str[i - 1] = str[i] del str[i] nd -= 1 if broadcast or collapse: # re-wrap dims and tuples dims = tuple(dims) strs = [tuple(str) if str is not None else None for str in strs] return n, nd, dims, tuple(strs), tuple(offsets) def lru_cache(maxsize=20): def decorating_function(user_function): cache = {} last_use = {} time = [0] # workaround for Python 2, which doesn't have nonlocal @functools.wraps(user_function) def wrapper(*key): time[0] += 1 last_use[key] = time[0] try: result = cache[key] wrapper.hits += 1 except KeyError: result = user_function(*key) cache[key] = result wrapper.misses += 1 # purge least recently used cache entries if len(cache) > wrapper.maxsize: for key, _ in nsmallest(wrapper.maxsize // 10, six.iteritems(last_use), key=itemgetter(1)): del cache[key], last_use[key] return result def clear(): cache.clear() last_use.clear() wrapper.hits = wrapper.misses = 0 time[0] = 0 @functools.wraps(user_function) def get(*key): result = cache[key] time[0] += 1 last_use[key] = time[0] wrapper.hits += 1 return result wrapper.hits = wrapper.misses = 0 wrapper.maxsize = maxsize wrapper.clear = clear wrapper.get = get return wrapper return decorating_function def prod(iterable): return reduce(mul, iterable, 1) libgpuarray-0.7.6/release.txt000066400000000000000000000007201326743622600162400ustar00rootroot00000000000000Release process: - Make sure you are on the proper release branch - Make a git tag git tag vX.Y.Z - Push to master the commit and the tag git push --tags central master This push will trigger package builds for windows and linux that will be uploaded to the mila-udem conda channel. 
- Add a release on github with a tag in the form of 'vX.Y.Z' https://github.com/Theano/libgpuarray/releases/new - Make note of the major changes since the last release libgpuarray-0.7.6/setup.cfg000066400000000000000000000002251326743622600157000ustar00rootroot00000000000000[versioneer] VCS=git style=pep440 versionfile_source=pygpu/_version.py versionfile_build=pygpu/_version.py tag_prefix=v parentdir_prefix=libgpuarray-libgpuarray-0.7.6/setup.py000077500000000000000000000132031326743622600155740ustar00rootroot00000000000000import sys import os import versioneer import distutils.command.clean import shutil have_cython = False try: import Cython if Cython.__version__ < '0.25': raise Exception('cython is too old or not installed ' '(at least 0.25 required)') from Cython.Build import cythonize have_cython = True except Exception: # for devel version raise def cythonize(args): for arg in args: arg.sources = [(s[:-3] + 'c' if s.endswith('.pyx') else s) for s in arg.sources] # clang gives an error if passed -mno-fused-madd # (and I don't even understand why it's passed in the first place) if sys.platform == 'darwin': from distutils import sysconfig, ccompiler sysconfig_customize_compiler = sysconfig.customize_compiler def customize_compiler(compiler): sysconfig_customize_compiler(compiler) if sys.platform == 'darwin': while '-mno-fused-madd' in compiler.compiler: compiler.compiler.remove('-mno-fused-madd') while '-mno-fused-madd' in compiler.compiler_so: compiler.compiler_so.remove('-mno-fused-madd') while '-mno-fused-madd' in compiler.linker_so: compiler.linker_so.remove('-mno-fused-madd') sysconfig.customize_compiler = customize_compiler ccompiler.customize_compiler = customize_compiler try: from setuptools import setup, Extension as _Extension # setuptools is stupid and rewrites "sources" to change '.pyx' to '.c' # if it can't find Pyrex (and in recent versions, Cython). # # This is a really stupid thing to do behind the users's back (since # it breaks development builds) especially with no way of disabling it # short of the hack below. class Extension(_Extension): def __init__(self, *args, **kwargs): save_sources = kwargs.get('sources', None) _Extension.__init__(self, *args, **kwargs) self.sources = save_sources except ImportError: from distutils.core import setup, Extension import numpy as np to_del = [] for i, a in enumerate(sys.argv): if a == '--disable-cython': to_del.append(i) have_cython = False for i in reversed(to_del): del sys.argv[i] del to_del include_dirs = [np.get_include()] library_dirs = [] if sys.platform == 'win32' and not os.getenv('CONDA_BUILD'): # This is a hack so users don't need to do many steps for windows install # Just use the default location. current_dir = os.path.abspath(os.path.dirname(__file__)) include_dirs += [os.path.join(current_dir, 'src')] default_bin_dir = os.path.join(current_dir, 'lib') if not os.path.isdir(default_bin_dir): raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode'.format(default_bin_dir)) library_dirs += [default_bin_dir] class cmd_clean(distutils.command.clean.clean): def run(self): import glob with open('.clean', 'r') as f: ignores = f.read() for wildcard in filter(bool, ignores.split('\n')): for filename in glob.glob(wildcard): try: os.remove(filename) except OSError: shutil.rmtree(filename, ignore_errors=True) # It's an old-style class in Python 2.7... 
distutils.command.clean.clean.run(self) ea = [] if sys.platform in ('darwin', 'linux'): # Silence unused stuff warnings ea = ["-Wno-unused-variable", "-Wno-unused-function"] exts = [Extension('pygpu.gpuarray', sources=['pygpu/gpuarray.pyx'], include_dirs=include_dirs, libraries=['gpuarray'], library_dirs=library_dirs, extra_compile_args=ea, define_macros=[('GPUARRAY_SHARED', None)] ), Extension('pygpu.blas', sources=['pygpu/blas.pyx'], include_dirs=include_dirs, libraries=['gpuarray'], library_dirs=library_dirs, extra_compile_args=ea, define_macros=[('GPUARRAY_SHARED', None)] ), Extension('pygpu._elemwise', sources=['pygpu/_elemwise.pyx'], include_dirs=include_dirs, libraries=['gpuarray'], library_dirs=library_dirs, extra_compile_args=ea, define_macros=[('GPUARRAY_SHARED', None)] ), Extension('pygpu.collectives', sources=['pygpu/collectives.pyx'], include_dirs=include_dirs, libraries=['gpuarray'], library_dirs=library_dirs, extra_compile_args=ea, define_macros=[('GPUARRAY_SHARED', None)] )] cmds = versioneer.get_cmdclass() cmds["clean"] = cmd_clean version_data = versioneer.get_versions() if version_data['error'] is not None: raise ValueError("Can't determine version for build: %s\n Please make sure that your git checkout includes tags." % (version_data['error'],)) setup(name='pygpu', version=version_data['version'], cmdclass=cmds, description='numpy-like wrapper on libgpuarray for GPU computations', packages=['pygpu', 'pygpu/tests'], include_package_data=True, package_data={'pygpu': ['gpuarray.h', 'gpuarray_api.h', 'blas_api.h', 'numpy_compat.h', 'collectives.h', 'collectives_api.h']}, ext_modules=cythonize(exts), install_requires=['mako>=0.7', 'six'], ) libgpuarray-0.7.6/src/000077500000000000000000000000001326743622600146475ustar00rootroot00000000000000libgpuarray-0.7.6/src/CMakeLists.txt000066400000000000000000000106601326743622600174120ustar00rootroot00000000000000include(CheckFunctionExists) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DDEBUG") if(CMAKE_COMPILER_IS_GNUCC) add_definitions(-Wdeclaration-after-statement) endif() include_directories("${CMAKE_CURRENT_SOURCE_DIR}") add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/gpuarray_types.c ${CMAKE_CURRENT_SOURCE_DIR}/gpuarray/types.h COMMAND python gen_types.py WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/gen_types.py) add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/cluda_cuda.h.c COMMAND python head.py cluda_cuda.h WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/head.py ${CMAKE_CURRENT_SOURCE_DIR}/cluda_cuda.h ) add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/cluda_opencl.h.c COMMAND python head.py cluda_opencl.h WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/head.py ${CMAKE_CURRENT_SOURCE_DIR}/cluda_opencl.h ) macro (set_rel var) file (RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}") # clear previous list (if any) set(SET_REL_TMP) foreach (_src ${ARGN}) if (_relPath) list(APPEND SET_REL_TMP "${_relPath}/${_src}") else() list(APPEND SET_REL_TMP "${_src}") endif() endforeach() if (_relPath) set(${var} ${SET_REL_TMP} PARENT_SCOPE) else() set(${var} ${SET_REL_TMP}) endif() endmacro() set(_GPUARRAY_SRC cache/lru.c cache/twoq.c cache/disk.c gpuarray_types.c gpuarray_error.c gpuarray_util.c gpuarray_buffer.c gpuarray_buffer_blas.c gpuarray_buffer_collectives.c gpuarray_array.c gpuarray_array_blas.c gpuarray_array_collectives.c gpuarray_kernel.c gpuarray_extension.c 
gpuarray_elemwise.c gpuarray_reduction.c gpuarray_buffer_cuda.c gpuarray_blas_cuda_cublas.c gpuarray_collectives_cuda_nccl.c gpuarray_buffer_opencl.c gpuarray_blas_opencl_clblas.c gpuarray_blas_opencl_clblast.c ) set_property(SOURCE gpuarray_buffer_cuda.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cluda_cuda.h.c) set_property(SOURCE gpuarray_buffer_opencl.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cluda_opencl.h.c) check_function_exists(strlcat HAVE_STRL) check_function_exists(mkstemp HAVE_MKSTEMP) if(UNIX) add_definitions(-D_GNU_SOURCE) endif() if(NOT HAVE_STRL) list(APPEND _GPUARRAY_SRC gpuarray_strl.c) endif() if(NOT HAVE_MKSTEMP) list(APPEND _GPUARRAY_SRC gpuarray_mkstemp.c) endif() configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/private_config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/private_config.h ) add_subdirectory(util) add_subdirectory(loaders) set_rel(GPUARRAY_SRC ${_GPUARRAY_SRC}) list(APPEND GPUARRAY_SRC ${UTIL_SRC} ${LOADERS_SRC}) add_library(gpuarray SHARED ${GPUARRAY_SRC}) set_target_properties(gpuarray PROPERTIES COMPILE_FLAGS "-DGPUARRAY_BUILDING_DLL -DGPUARRAY_SHARED" INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib MACOSX_RPATH OFF # This is the shared library version VERSION 3.0 ) add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) target_link_libraries(gpuarray ${CMAKE_DL_LIBS}) target_link_libraries(gpuarray-static ${CMAKE_DL_LIBS}) # Generate gpuarray/abi_version.h that contains the ABI version number. get_target_property(GPUARRAY_ABI_VERSION gpuarray VERSION) string(REPLACE "." ";" GPUARRAY_ABI_VERSION_NUMBERS ${GPUARRAY_ABI_VERSION}) list(GET GPUARRAY_ABI_VERSION_NUMBERS 0 GPUARRAY_ABI_VERSION_MAJOR) list(GET GPUARRAY_ABI_VERSION_NUMBERS 1 GPUARRAY_ABI_VERSION_MINOR) math(EXPR GPUARRAY_ABI_NUMBER "1000*${GPUARRAY_ABI_VERSION_MAJOR} + ${GPUARRAY_ABI_VERSION_MINOR}") FILE(WRITE gpuarray/abi_version.h "\#ifndef GPUARRAY_ABI_VERSION\n\#define GPUARRAY_ABI_VERSION ${GPUARRAY_ABI_NUMBER}\n\#endif\n" ) # set SOVERSION and ensure it is the first part of VERSION. set_property(TARGET gpuarray PROPERTY SOVERSION ${GPUARRAY_ABI_VERSION_MAJOR}) set(headers gpuarray/array.h gpuarray/blas.h gpuarray/collectives.h gpuarray/buffer.h gpuarray/buffer_blas.h gpuarray/buffer_collectives.h gpuarray/abi_version.h gpuarray/config.h gpuarray/elemwise.h gpuarray/error.h gpuarray/extension.h gpuarray/ext_cuda.h gpuarray/kernel.h gpuarray/types.h gpuarray/util.h ) install(FILES ${headers} DESTINATION include/gpuarray) if(NOT UNIX) install(FILES gpuarray/wincompat/stdint.h DESTINATION include/gpuarray/wincompat) endif() install(TARGETS gpuarray gpuarray-static RUNTIME DESTINATION bin LIBRARY DESTINATION lib ARCHIVE DESTINATION lib ) libgpuarray-0.7.6/src/cache.h000066400000000000000000000064161326743622600160720ustar00rootroot00000000000000#ifndef CACHE_H #define CACHE_H #include #include #include "private_config.h" #include "util/strb.h" #include "util/error.h" typedef void *cache_key_t; typedef void *cache_value_t; typedef int (*cache_eq_fn)(cache_key_t, cache_key_t); typedef uint32_t (*cache_hash_fn)(cache_key_t); typedef void (*cache_freek_fn)(cache_key_t); typedef void (*cache_freev_fn)(cache_value_t); typedef int (*kwrite_fn)(strb *res, cache_key_t key); typedef int (*vwrite_fn)(strb *res, cache_value_t val); typedef cache_key_t (*kread_fn)(const strb *b); typedef cache_value_t (*vread_fn)(const strb *b); typedef struct _cache cache; struct _cache { /** * Add the specified value to the cache under the key k, replacing * any previous value. 
* * The value and key belong to the cache and will be freed with the * supplied free functions whether the add is successful or not. * * The key and value data must stay valid until they are explicitely * released by the cache when it calls the supplied free functions. * * NULL is not a valid value or key. * * Returns 0 if value was added sucessfully and some other value otherwise. */ int (*add)(cache *c, cache_key_t k, cache_value_t v); /** * Remove the data associated with k from the cache. The value and * the key will be free with the supplied free functions. * * The passed in key is not claimed by the cache and need only be * valid until the call returns. It will not be freed through the * key free function. * * Returns 1 if the key was in the cache and 0 if not. */ int (*del)(cache *c, const cache_key_t k); /** * Get the data entry associated with k. * * The passed in key is not claimed by the cache and need only be * valid until the call returns. It will not be freed through the * key free function. * * Returns NULL if the key is not found, a value otherwise. */ cache_value_t (*get)(cache *c, const cache_key_t k); /** * Releases all entries in the cache as well as all of the support * structures. * * This must NOT free the passed in pointer. */ void (*destroy)(cache *c); cache_eq_fn keq; cache_hash_fn khash; cache_freek_fn kfree; cache_freev_fn vfree; /* Extra data goes here depending on cache type */ }; cache *cache_lru(size_t max_size, size_t elasticity, cache_eq_fn keq, cache_hash_fn khash, cache_freek_fn kfree, cache_freev_fn vfree, error *e); cache *cache_twoq(size_t hot_size, size_t warm_size, size_t cold_size, size_t elasticity, cache_eq_fn keq, cache_hash_fn khash, cache_freek_fn kfree, cache_freev_fn vfree, error *e); cache *cache_disk(const char *dirpath, cache *mem, kwrite_fn kwrite, vwrite_fn vwrite, kread_fn kread, vread_fn vread, error *e); /* API functions */ static inline int cache_add(cache *c, cache_key_t k, cache_value_t v) { return c->add(c, k, v); } static inline int cache_del(cache *c, cache_key_t k) { return c->del(c, k); } static inline cache_value_t cache_get(cache *c, cache_key_t k) { return c->get(c, k); } static inline void cache_destroy(cache *c) { c->destroy(c); free(c); } #endif libgpuarray-0.7.6/src/cache/000077500000000000000000000000001326743622600157125ustar00rootroot00000000000000libgpuarray-0.7.6/src/cache/disk.c000066400000000000000000000240361326743622600170150ustar00rootroot00000000000000#define _CRT_SECURE_NO_WARNINGS #include #include #include #include "private_config.h" #ifdef _WIN32 #define PATH_MAX 255 #define WIN32_LEAN_AND_MEAN #include #include #include #include #include #include struct timezone; struct timeval { long tv_sec; long tv_usec; } timeval; static int gettimeofday(struct timeval *tp, struct timezone *tzp) { /* * Note: some broken versions only have 8 trailing zero's, the * correct epoch has 9 trailing zero's This magic number is the * number of 100 nanosecond intervals since January 1, 1601 (UTC) * until 00:00:00 January 1, 1970 */ static const uint64_t EPOCH = ((uint64_t)116444736000000000ULL); SYSTEMTIME system_time; FILETIME file_time; uint64_t time; GetSystemTime(&system_time); SystemTimeToFileTime(&system_time, &file_time); time = ((uint64_t)file_time.dwLowDateTime); time += ((uint64_t)file_time.dwHighDateTime) << 32; tp->tv_sec = (long)((time - EPOCH) / 10000000L); tp->tv_usec = (long)(system_time.wMilliseconds * 1000); return 0; } #define open _open #define unlink _unlink #define mkdir(p, f) _mkdir(p) #define close 
_close #define strdup _strdup #define lstat _stat64 #define fstat _fstat64 #define stat __stat64 #else #define PATH_MAX 1024 #include #include #include #define O_BINARY 0 #define _setmode(a, b) #endif #include "cache.h" #include "util/skein.h" #define HEXP_LEN (128 + 2) typedef struct _disk_cache { cache c; cache * mem; kwrite_fn kwrite; vwrite_fn vwrite; kread_fn kread; vread_fn vread; const char *dirp; } disk_cache; /* Convert unsigned long long from network to host order */ static unsigned long long ntohull(const char *_in) { const unsigned char *in = (const unsigned char *)_in; return ((unsigned long long)in[0] << 56 | (unsigned long long)in[1] << 48 | (unsigned long long)in[2] << 40 | (unsigned long long)in[3] << 32 | (unsigned long long)in[4] << 24 | (unsigned long long)in[5] << 16 | (unsigned long long)in[6] << 8 | (unsigned long long)in[7]); } /* Convert unsigned long long from host to network order */ static void htonull(unsigned long long in, char *out) { out[0] = (unsigned char)(in >> 56); out[1] = (unsigned char)(in >> 48); out[2] = (unsigned char)(in >> 40); out[3] = (unsigned char)(in >> 32); out[4] = (unsigned char)(in >> 24); out[5] = (unsigned char)(in >> 16); out[6] = (unsigned char)(in >> 8); out[7] = (unsigned char)(in); } /* Concatenate prefix and suffix into a single path string while checking for overflow */ static int catp(char *path, const char *dirp, const char *rpath) { if (strlcpy(path, dirp, PATH_MAX) >= PATH_MAX) { errno = ENAMETOOLONG; return -1; } if (strlcat(path, rpath, PATH_MAX) >= PATH_MAX) { errno = ENAMETOOLONG; return -1; } return 0; } /* open() for a path specifed by the concatenation of dirp and rpath */ static int openp(const char *dirp, const char *rpath, int flags, int mode) { char path[PATH_MAX]; if (catp(path, dirp, rpath)) return -1; return open(path, flags, mode); } static int mkstempp(const char *dirp, char *template) { char path[PATH_MAX]; int res; if (catp(path, dirp, template)) return -1; res = mkstemp(path); /* We need to copy the result path back and set binary mode (for windows) */ if (res != -1) { _setmode(res, O_BINARY); memcpy(template, &path[strlen(dirp)], strlen(template)); } return res; } static int unlinkp(const char *dirp, const char *rpath) { char path[PATH_MAX]; if (catp(path, dirp, rpath)) return -1; return unlink(path); } static int renamep(const char *dirp, const char *ropath, const char *rnpath) { char opath[PATH_MAX]; char npath[PATH_MAX]; if (catp(opath, dirp, ropath)) return -1; if (catp(npath, dirp, rnpath)) return -1; return rename(opath, npath); } /* Ensure that a path exists by creating all intermediate directories */ int ensurep(const char *dirp, const char *rpath) { char path[PATH_MAX]; char *pp; char sep; if (dirp == NULL) { if (strlcpy(path, rpath, PATH_MAX) >= PATH_MAX) { errno = ENAMETOOLONG; return -1; } #ifdef _WIN32 /* Skip root dir (windows) */ pp = strchr(path, '\\'); if (pp) while (*pp == '\\') pp++; else pp = path; #else pp = path; /* Skip root dir (unix) */ while (*pp == '/') pp++; #endif } else { if (catp(path, dirp, rpath)) return -1; pp = path + strlen(dirp); } while ((pp = strpbrk(pp + 1, "\\/")) != NULL) { sep = *pp; *pp = '\0'; if (mkdir(path, 0777)) { if (errno != EEXIST) return -1; /* For now we suppose that EEXIST means that the directory is * already there. 
*/ } *pp = sep; } return 0; } static int key_path(disk_cache *c, const cache_key_t key, char *out) { strb kb = STRB_STATIC_INIT; unsigned char hash[64]; int i; if (c->kwrite(&kb, key)) { strb_clear(&kb); return -1; } if (Skein_512((unsigned char *)kb.s, kb.l, hash)) { strb_clear(&kb); return -1; } strb_clear(&kb); if (snprintf(out, 10, "%02x%02x/%02x%02x", hash[0], hash[1], hash[2], hash[3]) != 9) return -1; for (i = 4; i < 64; i += 4) { if (snprintf(out+(i * 2 + 1), 9, "%02x%02x%02x%02x", hash[i], hash[i+1], hash[i+2], hash[i+3]) != 8) return -1; } return 0; } static int write_entry(disk_cache *c, const cache_key_t k, const cache_value_t v) { char hexp[HEXP_LEN]; char tmp_path[] = "tmp.XXXXXXXX"; strb b = STRB_STATIC_INIT; size_t kl, vl; int fd, err; if (key_path(c, k, hexp)) return -1; if (ensurep(c->dirp, hexp)) return -1; if (strb_ensure(&b, 16)) return -1; b.l = 16; c->kwrite(&b, k); kl = b.l - 16; c->vwrite(&b, v); vl = b.l - kl - 16; htonull(kl, b.s); htonull(vl, b.s + 8); if (strb_error(&b)) { strb_clear(&b); return -1; } fd = mkstempp(c->dirp, tmp_path); if (fd == -1) { strb_clear(&b); return -1; } err = strb_write(fd, &b); strb_clear(&b); close(fd); if (err) { unlinkp(c->dirp, tmp_path); return -1; } if (renamep(c->dirp, tmp_path, hexp)) { unlinkp(c->dirp, tmp_path); #ifdef _WIN32 /* On windows we can't rename over an existing file */ return (errno != EACCES) ? -1 : 0; #else return -1; #endif } return 0; } static int find_entry(disk_cache *c, const cache_key_t key, cache_key_t *_k, cache_value_t *_v) { struct stat st; strb b = STRB_STATIC_INIT; char *ts; size_t kl, vl; cache_key_t k; char hexp[HEXP_LEN]; int fd; if (key_path(c, key, hexp)) return 0; fd = openp(c->dirp, hexp, O_RDONLY|O_BINARY, 0); if (fd == -1) return 0; if (fstat(fd, &st)) { close(fd); return 0; } if (!(st.st_mode & S_IFREG)) { close(fd); return 0; } strb_read(&b, fd, st.st_size); close(fd); if (strb_error(&b) || b.l < 16) { strb_clear(&b); return 0; } kl = ntohull(b.s); vl = ntohull(b.s + 8); if (b.l < 16 + kl + vl) { strb_clear(&b); return 0; } ts = b.s; b.s += 16; b.l = kl; k = c->kread(&b); if (k && c->c.keq(key, k)) { if (_v) { b.s += kl; b.l = vl; *_v = c->vread(&b); if (*_v == NULL) goto error_find_entry; } if (_k) *_k = k; else c->c.kfree(k); b.s = ts; strb_clear(&b); return 1; } error_find_entry: if (k) c->c.kfree(k); b.s = ts; strb_clear(&b); return 0; } static int disk_add(cache *_c, cache_key_t k, cache_value_t v) { disk_cache *c = (disk_cache *)_c; /* Ignore write errors */ write_entry(c, k, v); return cache_add(c->mem, k, v); } static int disk_del(cache *_c, const cache_key_t key) { disk_cache *c = (disk_cache *)_c; char hexp[HEXP_LEN] = {0}; cache_del(c->mem, key); key_path(c, key, hexp); return (unlinkp(c->dirp, hexp) == 0); } static cache_value_t disk_get(cache *_c, const cache_key_t key) { disk_cache *c = (disk_cache *)_c; cache_key_t k; cache_value_t v; v = cache_get(c->mem, key); if (v != NULL) return v; if (find_entry(c, key, &k, &v)) { if (cache_add(c->mem, k, v)) return NULL; return v; } return NULL; } static void disk_destroy(cache *_c) { disk_cache *c = (disk_cache *)_c; cache_destroy(c->mem); free((void *)c->dirp); } cache *cache_disk(const char *dirpath, cache *mem, kwrite_fn kwrite, vwrite_fn vwrite, kread_fn kread, vread_fn vread, error *e) { struct stat st; disk_cache *res; char *dirp; size_t dirl = strlen(dirpath); char sep = '/'; /* This trickery is to make sure the path ends with a separator */ #ifdef _WIN32 if (dirpath[dirl - 1] == '\\') sep = '\\'; #endif if (dirpath[dirl - 1] 
!= sep) dirl++; dirp = malloc(dirl + 1); /* With the NUL */ if (dirp == NULL) { error_sys(e, "malloc"); return NULL; } strlcpy(dirp, dirpath, dirl + 1); if (dirp[dirl - 1] != sep) { dirp[dirl - 1] = sep; dirp[dirl] = '\0'; } if (ensurep(NULL, dirp) != 0) { free(dirp); error_sys(e, "ensurep"); return NULL; } /* For Windows mkdir and lstat which can't handle trailing separator */ dirp[dirl - 1] = '\0'; mkdir(dirp, 0777); /* This may fail, but it's ok */ if (lstat(dirp, &st) != 0) { error_sys(e, "lstat"); return NULL; } /* Restore the good path at the end */ dirp[dirl - 1] = sep; if (!(st.st_mode & S_IFDIR)) { error_set(e, GA_SYS_ERROR, "Cache path exists but is not a directory"); return NULL; } res = calloc(sizeof(*res), 1); if (res == NULL) { error_sys(e, "calloc"); return NULL; } res->dirp = dirp; res->mem = mem; res->kwrite = kwrite; res->vwrite = vwrite; res->kread = kread; res->vread = vread; res->c.add = disk_add; res->c.del = disk_del; res->c.get = disk_get; res->c.destroy = disk_destroy; res->c.keq = mem->keq; res->c.khash = mem->khash; res->c.kfree = mem->kfree; res->c.vfree = mem->vfree; return (cache *)res; } libgpuarray-0.7.6/src/cache/lru.c000066400000000000000000000151001326743622600166550ustar00rootroot00000000000000#include #include "cache.h" #include "private_config.h" typedef struct _node node; typedef struct _list list; typedef struct _hash hash; typedef struct _lru_cache lru_cache; struct _node { node *prev; node *next; node *h_next; cache_key_t key; cache_value_t val; }; static inline void node_init(node *n, const cache_key_t k, const cache_value_t v) { n->prev = NULL; n->next = NULL; n->h_next = NULL; n->key = k; n->val = v; } static inline node *node_alloc(const cache_key_t key, const cache_value_t val) { node *res = malloc(sizeof(node)); if (res != NULL) node_init(res, key, val); return res; } static inline void node_free(node *n, cache_freek_fn kfree, cache_freev_fn vfree) { kfree(n->key); vfree(n->val); if (n->h_next != NULL) node_free(n->h_next, kfree, vfree); free(n); } static inline void node_unlink(node *n) { if (n->next != NULL) n->next->prev = n->prev; if (n->prev != NULL) n->prev->next = n->next; n->next = NULL; n->prev = NULL; } struct _list { node *head; node *tail; size_t size; }; static inline void list_init(list *l) { l->head = NULL; l->tail = NULL; l->size = 0; } static inline void list_clear(list *l) { l->head = NULL; l->tail = NULL; l->size = 0; } static inline node *list_pop(list *l) { if (l->head == NULL) return NULL; else { node *oldHead = l->head; l->head = l->head->next; node_unlink(oldHead); l->size--; if (l->size == 0) { l->tail = NULL; } return oldHead; } } static inline node *list_remove(list *l, node *n) { if (n == l->head) l->head = n->next; if (n == l->tail) l->tail = n->prev; node_unlink(n); l->size--; return n; } static inline void list_push(list *l, node *n) { node_unlink(n); if (l->head == NULL) { l->head = n; } else if (l->head == l->tail) { l->head->next = n; n->prev = l->head; } else { l->tail->next = n; n->prev = l->tail; } l->tail = n; l->size++; } struct _hash { node **keyval; size_t nbuckets; size_t size; }; static inline unsigned long long roundup2(unsigned long long s) { s--; s |= s >> 1; s |= s >> 2; s |= s >> 4; s |= s >> 8; s |= s >> 16; s |= s >> 32; s++; return s; } static inline int hash_init(hash *h, size_t size, error *e) { h->nbuckets = roundup2(size + (size/6)); h->keyval = calloc(h->nbuckets, sizeof(*h->keyval)); if (h->keyval == NULL) { error_sys(e, "calloc"); return -1; } h->size = 0; return 0; } static inline 
void hash_clear(hash *h, cache_freek_fn kfree, cache_freev_fn vfree) { size_t i; for (i = 0; i < h->nbuckets; i++) { if (h->keyval[i] != NULL) node_free(h->keyval[i], kfree, vfree); } free(h->keyval); h->nbuckets = 0; h->size = 0; h->keyval = NULL; } static inline node *hash_find(hash *h, const cache_key_t key, cache_eq_fn keq, cache_hash_fn khash) { size_t p = khash(key) & (h->nbuckets - 1); node *n; if (h->keyval[p] != NULL) { n = h->keyval[p]; do { if (keq(n->key, key)) return n; n = n->h_next; } while (n != NULL); } return NULL; } static inline node *hash_add(hash *h, const cache_key_t key, const cache_value_t val, cache_hash_fn khash) { size_t p = khash(key) & (h->nbuckets - 1); node *n = node_alloc(key, val); if (n == NULL) return NULL; if (h->keyval[p] == NULL) { h->keyval[p] = n; } else { n->h_next = h->keyval[p]; h->keyval[p] = n; } h->size++; return n; } static inline void hash_del(hash *h, node *n, cache_freek_fn kfree, cache_freev_fn vfree, cache_hash_fn khash) { size_t p = khash(n->key) & (h->nbuckets - 1); node *np; if (n == h->keyval[p]) { h->keyval[p] = n->h_next; n->h_next = NULL; node_free(n, kfree, vfree); h->size--; } else { np = h->keyval[p]; while (np->h_next != NULL) { if (np->h_next == n) { np->h_next = n->h_next; n->h_next = NULL; node_free(n, kfree, vfree); h->size--; break; } np = np->h_next; } } } static inline size_t hash_size(hash *h) { return h->size; } struct _lru_cache { cache c; hash data; list order; size_t maxSize; size_t elasticity; }; static inline void lru_prune(lru_cache *c) { if (c->maxSize > 0 && hash_size(&c->data) > (c->maxSize + c->elasticity)) { while (hash_size(&c->data) > c->maxSize) { node *n = list_pop(&c->order); hash_del(&c->data, n, c->c.kfree, c->c.vfree, c->c.khash); } } } static int lru_del(cache *_c, const cache_key_t k) { lru_cache *c = (lru_cache *)_c; node *n = hash_find(&c->data, k, c->c.keq, c->c.khash); if (n != NULL) { list_remove(&c->order, n); hash_del(&c->data, n, c->c.kfree, c->c.vfree, c->c.khash); return 1; } return 0; } static int lru_add(cache *_c, cache_key_t key, cache_value_t val) { lru_cache *c = (lru_cache *)_c; node *n; /* XXX: possible optimization here to combine remove and add. 
currently needs to be done this way since hash_add does not overwrite previous values */ lru_del(_c, key); n = hash_add(&c->data, key, val, c->c.khash); if (n == NULL) { return -1; } list_push(&c->order, n); lru_prune(c); return 0; } static cache_value_t lru_get(cache *_c, const cache_key_t key) { lru_cache *c = (lru_cache *)_c; node *n = hash_find(&c->data, key, c->c.keq, c->c.khash); if (n == NULL) { return NULL; } else { list_remove(&c->order, n); list_push(&c->order, n); return n->val; } } static void lru_destroy(cache *_c) { lru_cache *c = (lru_cache *)_c; hash_clear(&c->data, c->c.kfree, c->c.vfree); list_clear(&c->order); } cache *cache_lru(size_t max_size, size_t elasticity, cache_eq_fn keq, cache_hash_fn khash, cache_freek_fn kfree, cache_freev_fn vfree, error *e) { lru_cache *res = malloc(sizeof(*res)); if (res == NULL) { error_sys(e, "malloc"); return NULL; } if (hash_init(&res->data, max_size+elasticity, e)) { free(res); return NULL; } list_init(&res->order); res->maxSize = max_size; res->elasticity = elasticity; res->c.add = lru_add; res->c.del = lru_del; res->c.get = lru_get; res->c.destroy = lru_destroy; res->c.keq = keq; res->c.khash = khash; res->c.kfree = kfree; res->c.vfree = vfree; return (cache *)res; } libgpuarray-0.7.6/src/cache/twoq.c000066400000000000000000000175701326743622600170620ustar00rootroot00000000000000#include #include #include #include "cache.h" #include "private_config.h" typedef struct _node node; typedef struct _list list; typedef struct _hash hash; typedef struct _twoq_cache twoq_cache; #define HOT 0 #define WARM 1 #define COLD 2 struct _node { node *prev; node *next; node *h_next; cache_key_t key; cache_value_t val; int temp; }; static inline void node_init(node *n, const cache_key_t k, const cache_value_t v) { n->prev = NULL; n->next = NULL; n->h_next = NULL; n->key = k; n->val = v; n->temp = HOT; } static inline node *node_alloc(const cache_key_t key, const cache_value_t val) { node *res = malloc(sizeof(node)); if (res != NULL) node_init(res, key, val); return res; } static inline void node_free(node *n, cache_freek_fn kfree, cache_freev_fn vfree) { kfree(n->key); vfree(n->val); if (n->h_next != NULL) node_free(n->h_next, kfree, vfree); free(n); } static inline void node_unlink(node *n) { if (n->next != NULL) n->next->prev = n->prev; if (n->prev != NULL) n->prev->next = n->next; n->next = NULL; n->prev = NULL; } struct _list { node *head; node *tail; size_t size; }; static inline void list_init(list *l) { l->head = NULL; l->tail = NULL; l->size = 0; } static inline void list_clear(list *l) { l->head = NULL; l->tail = NULL; l->size = 0; } static inline node *list_pop(list *l) { if (l->head == NULL) return NULL; else { node *oldHead = l->head; l->head = l->head->next; node_unlink(oldHead); l->size--; if (l->size == 0) { l->tail = NULL; } return oldHead; } } static inline node *list_remove(list *l, node *n) { if (n == l->head) l->head = n->next; if (n == l->tail) l->tail = n->prev; node_unlink(n); l->size--; return n; } static inline void list_push(list *l, node *n) { node_unlink(n); if (l->head == NULL) { l->head = n; } else if (l->head == l->tail) { l->head->next = n; n->prev = l->head; } else { l->tail->next = n; n->prev = l->tail; } l->tail = n; l->size++; } struct _hash { node **keyval; size_t nbuckets; size_t size; }; static inline unsigned long long roundup2(unsigned long long s) { s--; s |= s >> 1; s |= s >> 2; s |= s >> 4; s |= s >> 8; s |= s >> 16; s |= s >> 32; s++; return s; } static inline int hash_init(hash *h, size_t size, error *e) { 
h->nbuckets = roundup2(size + (size/6)); h->keyval = calloc(h->nbuckets, sizeof(*h->keyval)); if (h->keyval == NULL) { error_sys(e, "calloc"); return -1; } h->size = 0; return 0; } static inline void hash_clear(hash *h, cache_freek_fn kfree, cache_freev_fn vfree) { size_t i; for (i = 0; i < h->nbuckets; i++) { if (h->keyval[i] != NULL) node_free(h->keyval[i], kfree, vfree); } free(h->keyval); h->nbuckets = 0; h->size = 0; h->keyval = NULL; } static inline node *hash_find(hash *h, const cache_key_t key, cache_eq_fn keq, cache_hash_fn khash) { size_t p = khash(key) & (h->nbuckets - 1); node *n; if (h->keyval[p] != NULL) { n = h->keyval[p]; do { if (keq(n->key, key)) return n; n = n->h_next; } while (n != NULL); } return NULL; } static inline node *hash_add(hash *h, const cache_key_t key, const cache_value_t val, cache_hash_fn khash) { size_t p = khash(key) & (h->nbuckets - 1); node *n = node_alloc(key, val); if (n == NULL) return NULL; if (h->keyval[p] == NULL) { h->keyval[p] = n; } else { n->h_next = h->keyval[p]; h->keyval[p] = n; } h->size++; return n; } static inline void hash_del(hash *h, node *n, cache_freek_fn kfree, cache_freev_fn vfree, cache_hash_fn khash) { size_t p = khash(n->key) & (h->nbuckets - 1); node *np; if (n == h->keyval[p]) { h->keyval[p] = n->h_next; n->h_next = NULL; node_free(n, kfree, vfree); h->size--; } else { np = h->keyval[p]; while (np->h_next != NULL) { if (np->h_next == n) { np->h_next = n->h_next; n->h_next = NULL; node_free(n, kfree, vfree); h->size--; break; } np = np->h_next; } } } struct _twoq_cache { cache c; hash data; list hot; list warm; list cold; size_t hot_size; size_t warm_size; size_t cold_size; size_t elasticity; }; static inline void twoq_prune(twoq_cache *c) { while (c->hot.size > c->hot_size) { node *n = list_pop(&c->hot); n->temp = COLD; list_push(&c->cold, n); } if (c->cold.size > c->cold_size + c->elasticity) { while (c->cold.size > c->cold_size) { node *n = list_pop(&c->cold); hash_del(&c->data, n, c->c.kfree, c->c.vfree, c->c.khash); } } } static int twoq_del(cache *_c, const cache_key_t k) { twoq_cache *c = (twoq_cache *)_c; node *n = hash_find(&c->data, k, c->c.keq, c->c.khash); if (n != NULL) { switch (n->temp) { case HOT: list_remove(&c->hot, n); break; case WARM: list_remove(&c->warm, n); break; case COLD: list_remove(&c->cold, n); break; default: assert(0 && "node temperature is not within expected values"); } hash_del(&c->data, n, c->c.kfree, c->c.vfree, c->c.khash); return 1; } return 0; } static int twoq_add(cache *_c, cache_key_t key, cache_value_t val) { twoq_cache *c = (twoq_cache *)_c; node *n; /* XXX: possible optimization here to combine remove and add. 
currently needs to be done this way since hash_add does not overwrite previous values */ twoq_del(_c, key); n = hash_add(&c->data, key, val, c->c.khash); if (n == NULL) { return -1; } list_push(&c->hot, n); twoq_prune(c); return 0; } static cache_value_t twoq_get(cache *_c, const cache_key_t key) { twoq_cache *c = (twoq_cache *)_c; node *nn; node *n = hash_find(&c->data, key, c->c.keq, c->c.khash); if (n == NULL) { return NULL; } else { switch (n->temp) { case HOT: list_remove(&c->hot, n); list_push(&c->hot, n); break; case WARM: list_remove(&c->warm, n); list_push(&c->warm, n); break; case COLD: list_remove(&c->cold, n); n->temp = WARM; list_push(&c->warm, n); if (c->warm.size > c->warm_size) { nn = list_pop(&c->warm); nn->temp = COLD; list_push(&c->cold, nn); } break; default: assert(0 && "node temperature is not within expected values"); } return n->val; } } static void twoq_destroy(cache *_c) { twoq_cache *c = (twoq_cache *)_c; hash_clear(&c->data, c->c.kfree, c->c.vfree); list_clear(&c->hot); list_clear(&c->warm); list_clear(&c->cold); } cache *cache_twoq(size_t hot_size, size_t warm_size, size_t cold_size, size_t elasticity, cache_eq_fn keq, cache_hash_fn khash, cache_freek_fn kfree, cache_freev_fn vfree, error *e) { twoq_cache *res; if (hot_size == 0 || warm_size == 0 || cold_size == 0) { error_set(e, GA_VALUE_ERROR, "cache_twoq: section size is 0"); return NULL; } res = malloc(sizeof(*res)); if (res == NULL) { error_sys(e, "malloc"); return NULL; } if (hash_init(&res->data, hot_size+warm_size+cold_size+elasticity, e)) { free(res); return NULL; } list_init(&res->hot); list_init(&res->warm); list_init(&res->cold); res->hot_size = hot_size; res->warm_size = warm_size; res->cold_size = cold_size; res->elasticity = elasticity; res->c.add = twoq_add; res->c.del = twoq_del; res->c.get = twoq_get; res->c.destroy = twoq_destroy; res->c.keq = keq; res->c.khash = khash; res->c.kfree = kfree; res->c.vfree = vfree; return (cache *)res; } libgpuarray-0.7.6/src/cluda_cuda.h000066400000000000000000000126551326743622600171150ustar00rootroot00000000000000#ifndef CLUDA_H #define CLUDA_H #define local_barrier() __syncthreads() #define WITHIN_KERNEL extern "C" __device__ #define KERNEL extern "C" __global__ #define GLOBAL_MEM /* empty */ #define LOCAL_MEM __shared__ #define LOCAL_MEM_ARG /* empty */ #define MAXFLOAT 3.402823466E+38F #ifdef NAN #undef NAN #endif #define NAN __int_as_float(0x7fffffff) /* NULL */ #ifdef INFINITY #undef INFINITY #endif #define INFINITY __int_as_float(0x7f800000) #define HUGE_VALF INFINITY #define HUGE_VAL __longlong_as_double(0x7ff0000000000000) #define M_E 2.7182818284590452354 #define M_LOG2E 1.4426950408889634074 #define M_LOG10E 0.43429448190325182765 #define M_LN2 0.69314718055994530942 #define M_LN10 2.30258509299404568402 #define M_PI 3.14159265358979323846 #define M_PI_2 1.57079632679489661923 #define M_PI_4 0.78539816339744830962 #define M_1_PI 0.31830988618379067154 #define M_2_PI 0.63661977236758134308 #define M_2_SQRTPI 1.12837916709551257390 #define M_SQRT2 1.41421356237309504880 #define M_SQRT1_2 0.70710678118654752440 #define LID_0 threadIdx.x #define LID_1 threadIdx.y #define LID_2 threadIdx.z #define LDIM_0 blockDim.x #define LDIM_1 blockDim.y #define LDIM_2 blockDim.z #define GID_0 blockIdx.x #define GID_1 blockIdx.y #define GID_2 blockIdx.z #define GDIM_0 gridDim.x #define GDIM_1 gridDim.y #define GDIM_2 gridDim.z #define ga_bool unsigned char #define ga_byte signed char #define ga_ubyte unsigned char #define ga_short short #define ga_ushort unsigned 
short #define ga_int int #define ga_uint unsigned int #define ga_long long long #define ga_ulong unsigned long long #define ga_float float #define ga_double double #define ga_size size_t #define ga_ssize ptrdiff_t #define GA_DECL_SHARED_PARAM(type, name) #define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[]; #define GA_WARP_SIZE warpSize struct ga_half { ga_ushort data; }; static __device__ inline float ga_half2float(ga_half h) { float r; asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(r) : "h"(h.data)); return r; } static __device__ inline ga_half ga_float2half(float f) { ga_half r; asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(r.data) : "f"(f)); return r; } /* ga_int */ #define atom_add_ig(a, b) atomicAdd(a, b) #define atom_add_il(a, b) atomicAdd(a, b) #define atom_xchg_ig(a, b) atomicExch(a, b) #define atom_xchg_il(a, b) atomicExch(a, b) /* ga_uint */ #define atom_add_Ig(a, b) atomicAdd(a, b) #define atom_add_Il(a, b) atomicAdd(a, b) #define atom_xchg_Ig(a, b) atomicExch(a, b) #define atom_xchg_Il(a, b) atomicExch(a, b) /* ga_long */ __device__ ga_long atom_add_lg(ga_long *addr, ga_long val) { unsigned long long *waddr = (unsigned long long *)addr; unsigned long long old = *waddr; unsigned long long assumed; do { assumed = old; old = atomicCAS(waddr, assumed, (val + (ga_long)(assumed))); } while (assumed != old); return (ga_long)old; } #define atom_add_ll(a, b) atom_add_lg(a, b) __device__ ga_long atom_xchg_lg(ga_long *addr, ga_long val) { unsigned long long res; res = atomicExch((unsigned long long *)addr, val); return (ga_long)res; } #define atom_xchg_ll(a, b) atom_xchg_lg(a, b) /* ga_ulong */ #define atom_add_Lg(a, b) atomicAdd(a, b) #define atom_add_Ll(a, b) atomicAdd(a, b) #define atom_xchg_Lg(a, b) atomicExch(a, b) #define atom_xchg_Ll(a, b) atomicExch(a, b) /* ga_float */ #define atom_add_fg(a, b) atomicAdd(a, b) #define atom_add_fl(a, b) atomicAdd(a, b) #define atom_xchg_fg(a, b) atomicExch(a, b) #define atom_xchg_fl(a, b) atomicExch(a, b) /* ga_double */ #if __CUDA_ARCH__ < 600 __device__ ga_double atom_add_dg(ga_double *addr, ga_double val) { unsigned long long *waddr = (unsigned long long *)addr; unsigned long long old = *waddr; unsigned long long assumed; do { assumed = old; old = atomicCAS(waddr, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); return __longlong_as_double(old); } #define atom_add_dl(a, b) atom_add_dg(a, b) #else #define atom_add_dg(a, b) atomicAdd(a, b) #define atom_add_dl(a, b) atomicAdd(a, b) #endif __device__ ga_double atom_xchg_dg(ga_double *addr, ga_double val) { unsigned long long res; res = atomicExch((unsigned long long *)addr, __double_as_longlong(val)); return __longlong_as_double(res); } #define atom_xchg_dl(a, b) atom_xchg_dg(a, b) /* ga_half */ __device__ ga_half atom_add_eg(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, sum, new_; ga_half tmp; old = *base; do { assumed = old; tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410); sum = ga_float2half(ga_half2float(val) + ga_half2float(tmp)).data; new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254); old = atomicCAS(base, assumed, new_); } while (assumed != old); tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 
0x4432 : 0x4410); return tmp; } #define atom_add_el(a, b) atom_add_eg(a, b) __device__ ga_half atom_xchg_eg(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, new_; ga_half tmp; old = *base; do { assumed = old; new_ = __byte_perm(old, val.data, ((ga_size)addr & 2) ? 0x5410 : 0x3254); old = atomicCAS(base, assumed, new_); } while (assumed != old); tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410); return tmp; } #define atom_xchg_el(a, b) atom_xchg_eg(a, b) #endif libgpuarray-0.7.6/src/cluda_cuda.h.c000066400000000000000000001010721326743622600173260ustar00rootroot00000000000000static const char cluda_cuda_h[] = { 0x23, 0x69, 0x66, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x43, 0x4c, 0x55, 0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x43, 0x4c, 0x55, 0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x29, 0x20, 0x5f, 0x5f, 0x73, 0x79, 0x6e, 0x63, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x28, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, 0x4e, 0x5f, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x22, 0x43, 0x22, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x22, 0x43, 0x22, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x4c, 0x4f, 0x42, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x2f, 0x2a, 0x20, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x41, 0x52, 0x47, 0x20, 0x2f, 0x2a, 0x20, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x41, 0x58, 0x46, 0x4c, 0x4f, 0x41, 0x54, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x2e, 0x34, 0x30, 0x32, 0x38, 0x32, 0x33, 0x34, 0x36, 0x36, 0x45, 0x2b, 0x33, 0x38, 0x46, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x41, 0x4e, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x41, 0x4e, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x41, 0x4e, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x38, 0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x48, 0x55, 0x47, 0x45, 0x5f, 0x56, 0x41, 0x4c, 0x46, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 
0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x48, 0x55, 0x47, 0x45, 0x5f, 0x56, 0x41, 0x4c, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x30, 0x78, 0x37, 0x66, 0x66, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x45, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x2e, 0x37, 0x31, 0x38, 0x32, 0x38, 0x31, 0x38, 0x32, 0x38, 0x34, 0x35, 0x39, 0x30, 0x34, 0x35, 0x32, 0x33, 0x35, 0x34, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, 0x4f, 0x47, 0x32, 0x45, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x34, 0x34, 0x32, 0x36, 0x39, 0x35, 0x30, 0x34, 0x30, 0x38, 0x38, 0x38, 0x39, 0x36, 0x33, 0x34, 0x30, 0x37, 0x34, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, 0x4f, 0x47, 0x31, 0x30, 0x45, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x34, 0x33, 0x34, 0x32, 0x39, 0x34, 0x34, 0x38, 0x31, 0x39, 0x30, 0x33, 0x32, 0x35, 0x31, 0x38, 0x32, 0x37, 0x36, 0x35, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, 0x4e, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x36, 0x39, 0x33, 0x31, 0x34, 0x37, 0x31, 0x38, 0x30, 0x35, 0x35, 0x39, 0x39, 0x34, 0x35, 0x33, 0x30, 0x39, 0x34, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, 0x4e, 0x31, 0x30, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x2e, 0x33, 0x30, 0x32, 0x35, 0x38, 0x35, 0x30, 0x39, 0x32, 0x39, 0x39, 0x34, 0x30, 0x34, 0x35, 0x36, 0x38, 0x34, 0x30, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x2e, 0x31, 0x34, 0x31, 0x35, 0x39, 0x32, 0x36, 0x35, 0x33, 0x35, 0x38, 0x39, 0x37, 0x39, 0x33, 0x32, 0x33, 0x38, 0x34, 0x36, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x5f, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x35, 0x37, 0x30, 0x37, 0x39, 0x36, 0x33, 0x32, 0x36, 0x37, 0x39, 0x34, 0x38, 0x39, 0x36, 0x36, 0x31, 0x39, 0x32, 0x33, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x5f, 0x34, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x37, 0x38, 0x35, 0x33, 0x39, 0x38, 0x31, 0x36, 0x33, 0x33, 0x39, 0x37, 0x34, 0x34, 0x38, 0x33, 0x30, 0x39, 0x36, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x31, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x33, 0x31, 0x38, 0x33, 0x30, 0x39, 0x38, 0x38, 0x36, 0x31, 0x38, 0x33, 0x37, 0x39, 0x30, 0x36, 0x37, 0x31, 0x35, 0x34, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x32, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x36, 0x33, 0x36, 0x36, 0x31, 0x39, 0x37, 0x37, 0x32, 0x33, 0x36, 0x37, 0x35, 0x38, 0x31, 0x33, 0x34, 0x33, 0x30, 0x38, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x32, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x31, 0x32, 0x38, 0x33, 0x37, 0x39, 0x31, 0x36, 0x37, 0x30, 0x39, 0x35, 0x35, 0x31, 0x32, 0x35, 0x37, 0x33, 0x39, 0x30, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x34, 0x31, 0x34, 0x32, 
0x31, 0x33, 0x35, 0x36, 0x32, 0x33, 0x37, 0x33, 0x30, 0x39, 0x35, 0x30, 0x34, 0x38, 0x38, 0x30, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x31, 0x5f, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x37, 0x30, 0x37, 0x31, 0x30, 0x36, 0x37, 0x38, 0x31, 0x31, 0x38, 0x36, 0x35, 0x34, 0x37, 0x35, 0x32, 0x34, 0x34, 0x30, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 
0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x70, 0x74, 0x72, 0x64, 0x69, 0x66, 0x66, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x5f, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x77, 0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x61, 0x73, 0x6d, 0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, 0x66, 0x33, 0x32, 0x2e, 0x66, 0x31, 0x36, 0x20, 0x25, 0x30, 0x2c, 0x20, 0x25, 0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x3d, 0x66, 0x22, 0x28, 0x72, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x68, 0x22, 0x28, 0x68, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x61, 0x73, 0x6d, 0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, 0x72, 0x6e, 0x2e, 0x66, 0x31, 0x36, 0x2e, 0x66, 0x33, 0x32, 0x20, 0x25, 0x30, 0x2c, 0x20, 0x25, 0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x3d, 0x68, 0x22, 0x28, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x66, 0x22, 0x28, 0x66, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 
0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 
0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 
0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 
0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x72, 0x65, 0x73, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 
0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x2b, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, 0x29, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 
0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00}; libgpuarray-0.7.6/src/cluda_opencl.h000066400000000000000000000203621326743622600174530ustar00rootroot00000000000000#ifndef CLUDA_H #define CLUDA_H #define local_barrier() barrier(CLK_LOCAL_MEM_FENCE) #define WITHIN_KERNEL /* empty */ #define KERNEL __kernel #define GLOBAL_MEM __global #define LOCAL_MEM __local #define LOCAL_MEM_ARG __local /* NAN */ #ifndef NULL #define NULL ((void*)0) #endif /* INFINITY */ #define LID_0 get_local_id(0) #define LID_1 get_local_id(1) #define LID_2 get_local_id(2) #define LDIM_0 get_local_size(0) #define LDIM_1 get_local_size(1) #define LDIM_2 get_local_size(2) #define GID_0 get_group_id(0) #define GID_1 get_group_id(1) #define GID_2 get_group_id(2) #define GDIM_0 get_num_groups(0) #define GDIM_1 get_num_groups(1) #define GDIM_2 get_num_groups(2) #define ga_bool uchar #define ga_byte char #define ga_ubyte uchar #define ga_short short #define ga_ushort ushort #define ga_int int #define ga_uint uint #define ga_long long #define ga_ulong ulong #define ga_float float #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64: enable #define ga_double double #endif #define ga_size ulong #define ga_ssize long #define GA_DECL_SHARED_PARAM(type, name) , __local type *name #define GA_DECL_SHARED_BODY(type, name) #define GA_WARP_SIZE __GA_WARP_SIZE typedef struct _ga_half { half data; } ga_half; #define ga_half2float(p) vload_half(0, &((p).data)) static inline ga_half ga_float2half(ga_float f) { ga_half r; vstore_half_rte(f, 0, &r.data); return r; } #pragma OPENCL_EXTENSION 
cl_khr_int64_base_atomics: enable #define gen_atom32_add(name, argtype, aspace) \ argtype name(volatile aspace argtype *, argtype); \ argtype name(volatile aspace argtype *addr, argtype val) { \ union { \ argtype a; \ int w; \ } p, n; \ int a; \ p.a = *addr; \ do { \ a = p.w; \ n.a = p.a + val; \ p.w = atomic_cmpxchg((volatile aspace int *)addr, a, n.w); \ } while (p.w != a); \ return n.a; \ } #define gen_atom64_add(name, argtype, aspace) \ argtype name(volatile aspace argtype *, argtype); \ argtype name(volatile aspace argtype *addr, argtype val) { \ union { \ argtype a; \ long w; \ } p, n; \ long a; \ p.a = *addr; \ do { \ a = p.w; \ n.a = p.a + val; \ p.w = atom_cmpxchg((volatile aspace long *)addr, a, n.w); \ } while (p.w != a); \ return n.a; \ } #define gen_atom64_xchg(name, argtype, aspace) \ argtype name(volatile aspace argtype *, argtype); \ argtype name(volatile aspace argtype *addr, argtype val) { \ union { \ argtype a; \ long w; \ } p, n; \ n.a = val; \ p.w = atom_xchg((volatile aspace long *)addr, n.w); \ return p.a; \ } /* ga_int */ #define atom_add_ig(a, b) atomic_add(a, b) #define atom_add_il(a, b) atomic_add(a, b) #define atom_xchg_ig(a, b) atomic_xchg(a, b) #define atom_xchg_il(a, b) atomic_xchg(a, b) /* ga_uint */ #define atom_add_Ig(a, b) atomic_add(a, b) #define atom_add_Il(a, b) atomic_add(a, b) #define atom_xchg_Ig(a, b) atomic_xchg(a, b) #define atom_xchg_Il(a, b) atomic_xchg(a, b) /* ga_float */ gen_atom32_add(atom_add_fg, ga_float, global) gen_atom32_add(atom_add_fl, ga_float, local) #define atom_xchg_fg(a, b) atomic_xchg(a, b) #define atom_xchg_fl(a, b) atomic_xchg(a, b) #ifdef cl_khr_int64_base_atomics #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable /* ga_long */ #define atom_add_lg(a, b) atom_add(a, b) #define atom_add_ll(a, b) atom_add(a, b) #define atom_xchg_lg(a, b) atom_xchg(a, b) #define atom_xchg_ll(a, b) atom_xchg(a, b) /* ga_ulong */ #define atom_add_Lg(a, b) atom_add(a, b) #define atom_add_Ll(a, b) atom_add(a, b) #define atom_xchg_Lg(a, b) atom_xchg(a, b) #define atom_xchg_Ll(a, b) atom_xchg(a, b) /* ga_double */ #ifdef cl_khr_fp64 gen_atom64_add(atom_add_dg, ga_double, global) gen_atom64_add(atom_add_dl, ga_double, local) gen_atom64_xchg(atom_xchg_dg, ga_double, global) gen_atom64_xchg(atom_xchg_dl, ga_double, local) #endif #endif /* ga_half */ #define gen_atomh_add(name, aspace) \ ga_half name(volatile aspace ga_half *addr, ga_half val); \ ga_half name(volatile aspace ga_half *addr, ga_half val) { \ ga_uint idx = ((ga_size)addr & 2) >> 1; \ volatile aspace int *base = (volatile aspace int *)((ga_size)addr & ~2); \ union { \ int i; \ ga_half h[2]; \ } o, a, n; \ float fo; \ float fval; \ fval = ga_half2float(val); \ o.i = *base; \ do { \ a.i = o.i; \ fo = ga_half2float(o.h[idx]); \ n.i = o.i; \ n.h[idx] = ga_float2half(fval + fo); \ o.i = atomic_cmpxchg(base, a.i, n.i); \ } while (o.i != a.i); \ return n.h[idx]; \ } #define gen_atomh_xchg(name, aspace) \ ga_half name(volatile aspace ga_half *addr, ga_half val); \ ga_half name(volatile aspace ga_half *addr, ga_half val) { \ ga_uint idx = ((ga_size)addr & 2) >> 1; \ volatile aspace int *base = (volatile aspace int *)((ga_size)addr & ~2); \ union { \ int i; \ ga_half h[2]; \ } o, a, n; \ o.i = *base; \ do { \ a.i = o.i; \ n.i = o.i; \ n.h[idx] = val; \ o.i = atomic_cmpxchg(base, a.i, n.i); \ } while (o.i != a.i); \ return o.h[idx]; \ } gen_atomh_add(atom_add_eg, global) gen_atomh_add(atom_add_el, local) gen_atomh_xchg(atom_xchg_eg, global) gen_atomh_xchg(atom_xchg_el, local) #endif 
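/* Illustrative sketch, not part of the original sources: a minimal kernel written
   against the CLUDA macros defined in cluda_cuda.h and cluda_opencl.h above, showing
   how one body can target both backends. The kernel name "vector_add" and its
   parameters are hypothetical and used only for illustration. Under the CUDA header,
   KERNEL expands to extern "C" __global__, GLOBAL_MEM to nothing, LID_0/GID_0/LDIM_0
   to threadIdx.x/blockIdx.x/blockDim.x and ga_size to size_t; under the OpenCL header
   they expand to __kernel, __global, get_local_id(0)/get_group_id(0)/get_local_size(0)
   and ulong. */
KERNEL void vector_add(GLOBAL_MEM ga_float *out,
                       GLOBAL_MEM const ga_float *a,
                       GLOBAL_MEM const ga_float *b,
                       ga_size n) {
  /* Flatten the launch grid into one global element index. */
  ga_size i = (ga_size)GID_0 * LDIM_0 + LID_0;
  if (i < n)
    out[i] = a[i] + b[i];
}
/* The atomic helpers defined by both headers follow the same naming scheme: a type
   suffix (i/I/l/L/f/d/e for int/uint/long/ulong/float/double/half) followed by an
   address-space suffix (g for global memory, l for local/shared memory), e.g.
   atom_add_fg adds a float in global memory and atom_xchg_el exchanges a half in
   local memory. */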
libgpuarray-0.7.6/src/cluda_opencl.h.c000066400000000000000000001427321326743622600177020ustar00rootroot00000000000000static const char cluda_opencl_h[] = { 0x23, 0x69, 0x66, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x43, 0x4c, 0x55, 0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x43, 0x4c, 0x55, 0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x29, 0x20, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x43, 0x4c, 0x4b, 0x5f, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x46, 0x45, 0x4e, 0x43, 0x45, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, 0x4e, 0x5f, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x2f, 0x2a, 0x20, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x4c, 0x4f, 0x42, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x41, 0x52, 0x47, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x2f, 0x2a, 0x20, 0x4e, 0x41, 0x4e, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x0a, 0x20, 0x20, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x20, 0x28, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x2a, 0x29, 0x30, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 
0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, 0x75, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x75, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x66, 0x70, 0x36, 0x34, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x20, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x66, 0x70, 0x36, 0x34, 0x3a, 0x20, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x2c, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 
0x45, 0x20, 0x5f, 0x5f, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x0a, 0x0a, 0x74, 0x79, 0x70, 0x65, 0x64, 0x65, 0x66, 0x20, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x5f, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, 0x26, 0x28, 0x28, 0x70, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x74, 0x65, 0x28, 0x66, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x26, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 
0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 
0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 
0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x20, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x66, 0x70, 0x36, 0x34, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 
0x64, 0x64, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x66, 0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00}; libgpuarray-0.7.6/src/gen_types.py000066400000000000000000000112531326743622600172200ustar00rootroot00000000000000import sys from mako import exceptions from mako.template import Template TYPEMAP = {} i = 0 def add_type(name, C, sz): global i TYPEMAP[i] = ("ga_"+name, sz), name, C i+=1 add_type("bool", "uint8_t", 1) add_type("byte", "int8_t", 1) add_type("ubyte", "uint8_t", 1) for name, sz in [("short", 2), ("int", 4), ("long", 8)]: add_type(name, "int%s_t"%(sz*8,), sz) add_type("u"+name, "uint%s_t"%(sz*8,), sz) add_type("longlong", "int128_t", 16) add_type("ulonglong", "uint128_t", 16) add_type("float", "float", 4) add_type("double", "double", 8) add_type("quad", "ga_quad", 16) add_type("cfloat", "ga_cfloat", 8) add_type("cdouble", "ga_cdouble", 16) add_type("cquad", "ga_cquad", 32) assert i <= 23 i=23 # to sync with numpy. add_type("half", "half_t", 2); add_type("size", "size_t", "sizeof(size_t)"); add_type("ssize", "ssize_t", "sizeof(ssize_t)"); decls = """ #ifdef _MSC_VER typedef signed __int8 int8_t; typedef unsigned __int8 uint8_t; typedef signed __int16 int16_t; typedef unsigned __int16 uint16_t; typedef signed __int32 int32_t; typedef unsigned __int32 uint32_t; typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; #else #include #endif typedef struct _int128 { union int128_u { int8_t as_int8[16]; int16_t as_int16[8]; int32_t as_int32[4]; int64_t as_int64[2]; } value; } int128_t; typedef struct _uint128 { union uint128_u { uint8_t as_uint8[16]; uint16_t as_uint16[8]; uint32_t as_uint32[4]; uint64_t as_uint64[2]; } value; } uint128_t; typedef struct _quad { union { struct { int16_t exp; uint16_t hi; uint32_t lo; } s; uint128_t raw; } u; } ga_quad; typedef uint16_t half_t; typedef struct _cfloat { float r; float i; } ga_cfloat; typedef struct _cdouble { double r; double i; } ga_cdouble; typedef struct _cquad { ga_quad r; ga_quad i; } ga_cquad; """ ntypes = i VECTORMAP = {} i = 0 def add_type(name, sz): global i VECTORMAP[i] = ("ga_"+name, sz, "GA_"+name.upper()), name i+=1 for s in [2, 3, 4, 8, 16]: add_type("byte"+str(s), s) add_type("ubyte"+str(s), s) for name, sz in [("short", 2), ("int", 4), ("long", 8)]: for s in [2, 3, 4, 8, 16]: add_type(name+str(s), sz*s) add_type("u"+name+str(s), sz*s) for name, sz in [("float", 4), ("double", 8), ("half", 2)]: for s in [2, 4, 8, 16]: add_type(name+str(s), sz*s) nvec = i head_tmpl = Template(""" /* This file is generated by gen_types.py */ /** \\file types.h * \\brief Type declarations and access. 
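 *
 * (Generated content: the GPUARRAY_TYPES enum below and the
 * scalar_types table in gpuarray_types.c are both derived from the
 * TYPEMAP built by add_type() in gen_types.py.  For instance,
 * add_type("float", "float", 4) yields the enum entry GA_FLOAT and
 * the table row {"ga_float", 4, FLOAT_ALIGN, GA_FLOAT}.)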
*/ #ifndef GPUARRAY_TYPES_H #define GPUARRAY_TYPES_H #include #include #include #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif /** * Structure that holds the properties of a type. */ typedef struct _gpuarray_type { /** * Type name to use in the buffers. */ const char *cluda_name; /** * Size of one element (in bytes). */ size_t size; /** * Alignement requirement for the type. */ size_t align; /** * Code for the type. */ int typecode; } gpuarray_type; /** * List of all built-in types. */ enum GPUARRAY_TYPES { GA_BUFFER = -1, % for i, v in sorted(TYPEMAP.items()): GA_${v[1].upper()} = ${i}, % endfor /** \\cond INTERNAL_DOCS */ GA_NBASE = ${ntypes}, GA_DELIM = 255, /* To be forward-compatible with numpy */ /** \\endcond */ % for i, v in sorted(VECTORMAP.items()): GA_${v[1].upper()}, % endfor /** \\cond INTERNAL_DOCS */ GA_NVEC, GA_ENDVEC = 512 /** \\endcond */ }; #ifdef __cplusplus } #endif #endif /* GPUARRAY_TYPES */ """) impl_tmpl = Template(""" /* This file is generated by gen_types.py */ #include "gpuarray/types.h" #include /* For NULL */ ${decls} % for k, v in TYPEMAP.items(): typedef struct {char c; ${v[2]} x; } st_${v[1]}; #define ${v[1].upper()}_ALIGN (sizeof(st_${v[1]}) - sizeof(${v[2]})) % endfor const gpuarray_type scalar_types[] = { % for i in range(ntypes): % if i in TYPEMAP: {"${TYPEMAP[i][0][0]}", ${TYPEMAP[i][0][1]}, ${TYPEMAP[i][1].upper()}_ALIGN, GA_${TYPEMAP[i][1].upper()}}, % else: {NULL, 0, 0, -1}, % endif % endfor }; const gpuarray_type vector_types[] = { % for i, v in sorted(VECTORMAP.items()): {"${v[0][0]}", ${v[0][1]}, 0, GA_${v[1].upper()}}, % endfor }; """) try: header = head_tmpl.render(TYPEMAP=TYPEMAP, VECTORMAP=VECTORMAP, ntypes=ntypes) impl = impl_tmpl.render(TYPEMAP=TYPEMAP, VECTORMAP=VECTORMAP, ntypes=ntypes, decls=decls) except Exception: print(exceptions.text_error_template().render()) sys.exit(1) with open("gpuarray/types.h", "w") as f: f.write(header) with open("gpuarray_types.c", "w") as f: f.write(impl) libgpuarray-0.7.6/src/gpuarray/000077500000000000000000000000001326743622600165015ustar00rootroot00000000000000libgpuarray-0.7.6/src/gpuarray/array.h000066400000000000000000000520541326743622600177760ustar00rootroot00000000000000#ifndef GPUARRAY_ARRAY_H #define GPUARRAY_ARRAY_H /** * \file array.h * \brief Array functions. */ #include #include #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif /** * Main array structure. */ typedef struct _GpuArray { /** * Device data buffer. */ gpudata *data; /** * Size of each dimension. The number of elements is #nd. */ size_t *dimensions; /** * Stride for each dimension. The number of elements is #nd. */ ssize_t *strides; /** * Offset to the first array element into the device data buffer. */ size_t offset; /** * Number of dimensions. */ unsigned int nd; /** * Flags for this array (see \ref aflags). */ int flags; /** * Type of the array elements. */ int typecode; /** * \defgroup aflags Array Flags * @{ */ /* Try to keep in sync with numpy values for now */ /** * Array is C-contiguous. */ #define GA_C_CONTIGUOUS 0x0001 /** * Array is Fortran-contiguous. */ #define GA_F_CONTIGUOUS 0x0002 /** * Buffer data is properly aligned for the type. This should always * be true for arrays allocated through this library. * * If this isn't true you can't use kernels on the data, since they * require aligned access. */ #define GA_ALIGNED 0x0100 /** * Can write to the data buffer. (This is always true for arrays * allocated through this library). 
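 *
 * A quick way to test this flag is the GpuArray_ISWRITEABLE() macro
 * defined further below; an illustrative sketch (update_inplace is a
 * hypothetical caller-provided function):
 * \code
 * if (GpuArray_ISWRITEABLE(&a))
 *   update_inplace(&a);
 * \endcode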
*/ #define GA_WRITEABLE 0x0400 /** * Array data is behaved (properly aligned and writable). */ #define GA_BEHAVED (GA_ALIGNED|GA_WRITEABLE) /** * Array layout is that of a C array. */ #define GA_CARRAY (GA_C_CONTIGUOUS|GA_BEHAVED) /** * Array layout is that of a Fortran array. */ #define GA_FARRAY (GA_F_CONTIGUOUS|GA_BEHAVED) /** * @} */ /* Numpy flags that will not be supported at this level (and why): NPY_OWNDATA: data is refcounted NPY_NOTSWAPPED: data is alway native endian NPY_FORCECAST: no casts NPY_ENSUREARRAY: no inherited classes NPY_UPDATEIFCOPY: cannot support without refcount (or somesuch) Maybe will define other flags later */ } GpuArray; /** * Type used to specify the desired order to some functions */ typedef enum _ga_order { /** * Any order is fine. */ GA_ANY_ORDER=-1, /** * C order is desired. */ GA_C_ORDER=0, /** * Fortran order is desired. */ GA_F_ORDER=1 } ga_order; /** * Checks if all the specified flags are set. * * \param a array * \param flags flags to check * * \returns true if all flags in `flags` are set and false otherwise. */ static inline int GpuArray_CHKFLAGS(const GpuArray *a, int flags) { return (a->flags & flags) == flags; } /* Add tests here when you need them */ /** * Checks if the array data is writable. * * \param a array * * \returns true if the data area of `a` is writable */ #define GpuArray_ISWRITEABLE(a) GpuArray_CHKFLAGS(a, GA_WRITEABLE) /** * Checks if the array elements are aligned. * * \param a array * * \returns true if the elements of `a` are aligned. */ #define GpuArray_ISALIGNED(a) GpuArray_CHKFLAGS(a, GA_ALIGNED) /** * Checks if the array elements are contiguous in memory. * * \param a array * * \returns true if the data area of `a` is contiguous */ #define GpuArray_ISONESEGMENT(a) ((a)->flags & (GA_C_CONTIGUOUS|GA_F_CONTIGUOUS)) /** * Checks if the array elements are c contiguous in memory. * * \param a array * * \returns true if the data area of `a` is contiguous */ #define GpuArray_IS_C_CONTIGUOUS(a) ((a)->flags & GA_C_CONTIGUOUS) /** * Checks if the array elements are f contiguous in memory. * * \param a array * * \returns true if the data area of `a` is contiguous */ #define GpuArray_IS_F_CONTIGUOUS(a) ((a)->flags & GA_F_CONTIGUOUS) /** * This is the same as GpuArray_IS_F_CONTIGUOUS, but not the same as PyArray_ISFORTRAN. * * PyArray_ISFORTRAN checks if the array elements are laid out if * Fortran order and NOT c order. * * \param a array * * \returns true if the data area of `a` is Fortran-contiguous */ #define GpuArray_ISFORTRAN(a) (GpuArray_CHKFLAGS(a, GA_F_CONTIGUOUS)) /** * Retrive the size of the elements in the array. * * \param a array * * \returns the size of the array elements. */ #define GpuArray_ITEMSIZE(a) gpuarray_get_elsize((a)->typecode) /** * Fix the flags of an array using the current strides and shape. * * \param a GpuArray to fix flags for */ GPUARRAY_PUBLIC void GpuArray_fix_flags(GpuArray *a); /** * Initialize and allocate a new empty (uninitialized data) array. * * \param a the GpuArray structure to initialize. Content will be * ignored so make sure to deallocate any previous array first. * \param ctx context in which to allocate array data. Must come from * the same backend as the operations vector. * \param typecode type of the elements in the array * \param nd desired order (number of dimensions) * \param dims size for each dimension. * \param ord desired layout of data. 
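 *
 * A minimal usage sketch (illustrative only; assumes a valid
 * gpucontext `ctx` obtained elsewhere):
 * \code
 * GpuArray a;
 * size_t dims[2] = {128, 64};
 * int err = GpuArray_empty(&a, ctx, GA_FLOAT, 2, dims, GA_C_ORDER);
 * \endcode
 * Once the array is no longer needed it is released with
 * GpuArray_clear().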
* * \returns A return of GA_NO_ERROR means that the structure is * properly initialized and that the memory requested is reserved on * the device. Any other error code means that the structure is * left uninitialized. */ GPUARRAY_PUBLIC int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode, unsigned int nd, const size_t *dims, ga_order ord); /** * Initialize and allocate a new zero-initialized array. * * \param a the GpuArray structure to initialize. Content will be * ignored so make sure to deallocate any previous array first. * \param ctx context in which to allocate array data. Must come from * the same backend as the operations vector. * \param typecode type of the elements in the array * \param nd desired order (number of dimensions) * \param dims size for each dimension. * \param ord desired layout of data. * * \returns A return of GA_NO_ERROR means that the structure is * properly initialized and that the memory requested is reserved on * the device. Any other error code means that the structure is * left uninitialized. */ GPUARRAY_PUBLIC int GpuArray_zeros(GpuArray *a, gpucontext *ctx, int typecode, unsigned int nd, const size_t *dims, ga_order ord); /** * Initialize and allocate a new array structure from a pre-existing buffer. * * The array will be considered to own the gpudata structure after the * call is made and will free it when deallocated. An error return * from this function will deallocate `data`. * This increment the ref count of gpudata. This seem to contradict the above. * * \param a the GpuArray structure to initialize. Content will be * ignored so make sure to deallocate any previous array first. * \param data buffer to user. * \param offset position of the first data element of the array in the buffer. * \param typecode type of the elements in the array * \param nd order of the data (number of dimensions). * \param dims size for each dimension. * \param strides stride for each dimension. * \param writeable true if the buffer is writable false otherwise. * * \returns A return of GA_NO_ERROR means that the structure is * properly initialized. Any other error code means that the structure * is left uninitialized and the provided buffer is deallocated. */ GPUARRAY_PUBLIC int GpuArray_fromdata(GpuArray *a, gpudata *data, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, int writeable); /** * Initialize an array structure to provide a view of another. * * The new structure will point to the same data area and have the * same values of properties as the source one. The data area is * shared and writes from one array will be reflected in the other. * The properties are copied and not shared and can be modified * independantly. * * \param v the result array * \param a the source array * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_view(GpuArray *v, const GpuArray *a); /** * Blocks until all operations (kernels, copies) involving `a` are finished. * * \param a the array to synchronize * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_sync(GpuArray *a); /** * Returns a sub-view of a source array. * * The indexing follows simple basic model where each dimension is * indexed separately. For a single dimension the indexing selects * from the start index (included) to the end index (excluded) while * selecting one over step elements. 
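 *
 * In NumPy-like notation this is a per-dimension a[start:stop:step].
 * A 1-d sketch (values illustrative, matching the example below):
 * \code
 * ssize_t start = 1, stop = 8, step = 2;
 * GpuArray r;
 * int err = GpuArray_index(&r, &a, &start, &stop, &step);
 * \endcode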
As an example for the array `[ 0 * 1 2 3 4 5 6 7 8 9 ]` indexed with start index 1 stop index 8 and * step 2 the result would be `[ 1 3 5 7 ]`. * * The special value 0 for step means that only one element * corresponding to the start index and the resulting array order will * be one smaller. * * \param r the result array * \param a the source array * \param starts the start of the subsection for each dimension (length must be a->nd) * \param stops the end of the subsection for each dimension (length must be a->nd) * \param steps the steps for the subsection for each dimension (length must be a->nd) * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_index(GpuArray *r, const GpuArray *a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps); GPUARRAY_PUBLIC int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps); /** * Take a portion of an array along axis 0. * * This operation allows arbitrary indexing of an array along its * first axis. The indexed array `v` can be of any dimension or * strides. The result and index array (`a` and `i` respectively) need * to be C contiguous. * * The dimension 0 of `a` has to match dimension 0 of `i` and the * others have to match their equivalent on `v`. `i` has to have a * single dimension. * * If `check_error` is not 0, the function will check for indexing * errors in the kernel and will return GA_VALUE_ERROR in that * case. No other error will produce that error code. This is not * always done because it introduces a synchronization point which may * affect performance. * * \param a the result array (nd) * \param v the source array (nd) * \param i the index array (1d) * \param check_error whether to check for index errors or not * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, int check_error); /** * Sets the content of an array to the content of another array. * * The value array must be smaller or equal in number of dimensions to * the destination array. Each of its dimensions' size must be either * exactly equal to the destination array's corresponding dimensions * or 1. Dimensions of size 1 will be repeated to fill the full size * of the destination array. Extra size 1 dimensions will be added at * the end to make the two arrays shape-equivalent. * * \param a the destination array * \param v the value array * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_setarray(GpuArray *a, const GpuArray *v); /** * Change the dimensions of an array. * * Return a new array with the desired dimensions. The new dimensions * must have the same total size as the old ones. A copy of the * underlying data may be performed if necessary, unless `nocopy` is * 0. * * \param res the result array * \param a the source array * \param nd new dimensions order * \param newdims new dimensions (length is nd) * \param ord the desired resulting order * \param nocopy if 0 error out if a data copy is required. * * \return GA_NO_ERROR if the operation was succesful. 
* \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_reshape(GpuArray *res, const GpuArray *a, unsigned int nd, const size_t *newdims, ga_order ord, int nocopy); GPUARRAY_PUBLIC int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, const size_t *newdims, ga_order ord); /** * Rearrange the axes of an array. * * Return a new array with its shape and strides swapped accordingly * to the `new_axes` parameter. If `new_axes` is NULL then the order * is reversed. The returned array is a view on the data of the old * one. * * \param res the result array * \param a the source array * \param new_axes either NULL or a list of a->nd elements * * \return GA_NO_ERROR if the operation was successful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_transpose(GpuArray *res, const GpuArray *a, const unsigned int *new_axes); GPUARRAY_PUBLIC int GpuArray_transpose_inplace(GpuArray *a, const unsigned int *new_axes); /** * Release all device and host memory associated with `a`. * * This function frees all host memory, and releases the device memory * if it is the owner. In case an array has views it is the * responsability of the caller to ensure a base array is not cleared * before its views. * * This function will also zero out the structure to prevent * accidental reuse. * * \param a the array to clear */ GPUARRAY_PUBLIC void GpuArray_clear(GpuArray *a); /** * Checks if two arrays may share device memory. * * \param a an array * \param b an array * * \returns 1 if `a` and `b` may share a portion of their data. */ GPUARRAY_PUBLIC int GpuArray_share(const GpuArray *a, const GpuArray *b); /** * Retursns the context of an array. * * \param a an array * * \returns the context in which `a` was allocated. */ GPUARRAY_PUBLIC gpucontext *GpuArray_context(const GpuArray *a); /** * Copies all the elements of one array to another. * * The arrays `src` and `dst` must have the same size (total number of * elements) and be in the same context. * * \param dst destination array * \param src source array * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_move(GpuArray *dst, const GpuArray *src); /** * Copy data from the host memory to the device memory. * * \param dst destination array (must be contiguous) * \param src source host memory (contiguous block) * \param src_sz size of data to copy (in bytes) * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_write(GpuArray *dst, const void *src, size_t src_sz); /** * Copy data from the device memory to the host memory. * * \param dst destination host memory (contiguous block) * \param dst_sz size of data to copy (in bytes) * \param src source array (must be contiguous) * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_read(void *dst, size_t dst_sz, const GpuArray *src); /** * Set all of an array's data to a byte pattern. * * \param a an array (must be contiguous) * \param data the byte to repeat * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_memset(GpuArray *a, int data); /** * Make a copy of an array. * * This is analogue to GpuArray_view() except it copies the device * memory and no data is shared. * * \return GA_NO_ERROR if the operation was succesful. 
* \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_copy(GpuArray *res, const GpuArray *a, ga_order order); /** * Copy between arrays in different contexts. * * This works like GpuArray_move() except it will work between arrays * that aren't in the same context. * * Source and target arrays must be contiguous. This restriction may * be lifted in the future. * * \param res result array * \param a array to transfer * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_transfer(GpuArray *res, const GpuArray *a); /** * Split an array into multiple views. * * The created arrays will be sub-portions of `a` where `axis` is * divided according to the values in `p`. No checks are performed on * the values in `p` except to make sure that they don't reference * values outside of the bounds of the source array. * * If an error occurs partway during the operation, the created arrays * will be cleared before returning. * * \param rs list of array pointers to store results (must be of length n+1) * \param a array to split * \param n number of splits (length of p) * \param p list of split points * \param axis axis to split * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_split(GpuArray **rs, const GpuArray *a, size_t n, size_t *p, unsigned int axis); /** * Concatenate the arrays in `as` along the axis `axis`. * * If an error occurs during the operation, the result array may be * cleared before returning. * * \param r the result array * \param as list of pointer to arrays to concatenate * \param n number of array in list `as` * \param axis the axis along which to concatenate * \param restype the typecode of the result array * * \return GA_NO_ERROR if the operation was succesful. * \return an error code otherwise */ GPUARRAY_PUBLIC int GpuArray_concatenate(GpuArray *r, const GpuArray **as, size_t n, unsigned int axis, int restype); /** * Get a description of the last error in the context of `a`. * * The description may reflect operations with other arrays in the * same context if other operations were performed between the * occurence of the error and the call to this function. * * Operations in other contexts, however have no incidence on the * return value. * * \param a an array * \param err the error code returned * * \returns A user-readable string describing the nature of the error. */ GPUARRAY_PUBLIC const char *GpuArray_error(const GpuArray *a, int err); /** * Print a textual description of `a` to the specified file * descriptor. * * \param fd a file descriptior open for writing * \param a an array */ GPUARRAY_PUBLIC void GpuArray_fprintf(FILE *fd, const GpuArray *a); GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a); /** * @brief Computes simultaneously the maxima and the arguments of maxima over * specified axes of the tensor. * * Returns two tensors of identical shape. Both tensors' axes are a subset of * the axes of the original tensor. The axes to be reduced are specified by * the caller, and the maxima and arguments of maxima are computed over them. * * @param [out] dstMax The resulting tensor of maxima * @param [out] dstArgmax the resulting tensor of arguments at maxima * @param [in] src The source tensor. * @param [in] reduxLen The number of axes reduced. Must be >= 1 and * <= src->nd. * @param [in] reduxList A list of integers of length reduxLen, indicating * the axes to be reduced. 
The order of the axes * matters for dstArgmax index calculations. All * entries in the list must be unique, >= 0 and * < src->nd. * * For example, if a 5D-tensor is reduced with an axis * list of [3,4,1], then reduxLen shall be 3, and the * index calculation in every point shall take the form * * dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] + * i4 * src.shape[1] + * i1 * * where (i3,i4,i1) are the coordinates of the maximum- * valued element within subtensor [i0,:,i2,:,:] of src. * @return GA_NO_ERROR if the operation was successful, or a non-zero error * code otherwise. */ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, GpuArray* dstArgmax, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList); #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/gpuarray/blas.h000066400000000000000000000033401326743622600175730ustar00rootroot00000000000000#ifndef GPUARRAY_BLAS_H #define GPUARRAY_BLAS_H #include #include #ifdef __cplusplus extern "C" { #endif // only for vector-vector dot GPUARRAY_PUBLIC int GpuArray_rdot(GpuArray *X, GpuArray *Y, GpuArray *Z, int nocopy); #define GpuArray_hdot GpuArray_rdot #define GpuArray_sdot GpuArray_rdot #define GpuArray_ddot GpuArray_rdot GPUARRAY_PUBLIC int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, double beta, GpuArray *Y, int nocopy); #define GpuArray_hgemv GpuArray_rgemv #define GpuArray_sgemv GpuArray_rgemv #define GpuArray_dgemv GpuArray_rgemv GPUARRAY_PUBLIC int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuArray *A, GpuArray *B, double beta, GpuArray *C, int nocopy); #define GpuArray_hgemm GpuArray_rgemm #define GpuArray_sgemm GpuArray_rgemm #define GpuArray_dgemm GpuArray_rgemm GPUARRAY_PUBLIC int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, int nocopy); #define GpuArray_hger GpuArray_rger #define GpuArray_sger GpuArray_rger #define GpuArray_dger GpuArray_rger GPUARRAY_PUBLIC int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alpha, GpuArray *A, GpuArray *B, double beta, GpuArray *C, int nocopy); #define GpuArray_hgemmBatch_3d GpuArray_rgemmBatch_3d #define GpuArray_sgemmBatch_3d GpuArray_rgemmBatch_3d #define GpuArray_dgemmBatch_3d GpuArray_rgemmBatch_3d #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/gpuarray/buffer.h000066400000000000000000000551661326743622600201400ustar00rootroot00000000000000/** \file buffer.h * \brief This file contains the interface definition for the backends. * * For normal use you should not call the functions defined in this * file directly. * * \see array.h For managing buffers * \see kernel.h For using kernels */ #ifndef GPUARRAY_BUFFER_H #define GPUARRAY_BUFFER_H #include #include #include #include #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif struct _gpudata; /** * Opaque struct for buffer data. */ typedef struct _gpudata gpudata; struct _gpucontext; /** * Opaque struct for context data. */ typedef struct _gpucontext gpucontext; struct _gpukernel; /** * Opaque struct for kernel data. */ typedef struct _gpukernel gpukernel; /** * Gets information about the number of available platforms for the * backend specified in `name`. 
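 *
 * A minimal sketch (the backend name "opencl" is just an example;
 * error handling omitted):
 * \code
 * unsigned int nplat = 0;
 * int err = gpu_get_platform_count("opencl", &nplat);
 * \endcode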
* * \param name the backend name * \param platcount will contain number of compatible * platforms in host * * \return #GA_NO_ERROR, if success */ GPUARRAY_PUBLIC int gpu_get_platform_count(const char* name, unsigned int* platcount); /** * Gets information about the number of compatible devices on a * specific host's `platform` for the backend specified in `name`. * * \param name the backend name * \param platform number for a platform in host * \param devcount will contain number of compatible devices in * `platform` * * \return #GA_NO_ERROR, if success */ GPUARRAY_PUBLIC int gpu_get_device_count(const char* name, unsigned int platform, unsigned int* devcount); /** * Opaque structure that holds properties for the context. */ typedef struct _gpucontext_props gpucontext_props; /** * Allocate and initialized an instance of gpucontext_props. * * Initialization is done with default values. * * \param res pointer to storage space for the created object * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpucontext_props_new(gpucontext_props **res); /** * Set the device number for a CUDA device. * * \param p properties object * \param devno device number * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpucontext_props_cuda_dev(gpucontext_props *p, int devno); /** * Set the platform and device for OpenCL. * * \param p properties object * \param platno platform number * \param devno device number * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpucontext_props_opencl_dev(gpucontext_props *p, int platno, int devno); /** * Set the scheduling mode for the device. * * \param p properties object * \param sched scheduling mode. One of \ref sched_modes "these". * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpucontext_props_sched(gpucontext_props *p, int sched); /** \defgroup sched_modes * @{ */ /** * Automatic scheduling, decide what to do depending on the workload, * number of cores in the computer and other relevant factors. (default) */ #define GA_CTX_SCHED_AUTO 0 /** * Single-work scheduling. Optimize for speed in a single process, * with a single thread. This is the fastest mode, but it may keep * the CPU busy more than necessary. */ #define GA_CTX_SCHED_SINGLE 1 /** * Multi-work scheduling. Try to not keep the CPU busy more than * necessary and let other threads a chance at some CPU time. This * may increase the latency when waiting for GPU operations. */ #define GA_CTX_SCHED_MULTI 2 /** @}*/ /** * Set single-stream mode. * * All operations on the device will be serialized on a single stream. * This will also disable most of the interlocking normally done * between multiple streams to keep everything in order. * * This mode can be faster if you don't have a lot of device-level * parallelism in your workload. * * \param p properties object * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpucontext_props_set_single_stream(gpucontext_props *p); /** * Set the path for the kernel cache. * * The cache can be shared with other running instances, even on * shared drives. * * \param p properties object * \param path desired location of the kernel cache * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpucontext_props_kernel_cache(gpucontext_props *p, const char *path); /** * Configure the allocation cache. 
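 *
 * A sketch of the properties workflow this call fits into (illustrative
 * values; error checking omitted):
 * \code
 * gpucontext_props *p;
 * gpucontext_props_new(&p);
 * gpucontext_props_cuda_dev(p, 0);                        // device 0
 * gpucontext_props_alloc_cache(p, 0, 512 * 1024 * 1024);  // cache up to 512 MiB
 * gpucontext *ctx;
 * gpucontext_init(&ctx, "cuda", p);   // takes ownership of p
 * \endcode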
* * The maximum size is also a limit on the total amount of memory * allocated on the device. * * \param p properties object * \param initial initial size of the cache * \param max maximum size of the cache * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpucontext_props_alloc_cache(gpucontext_props *p, size_t initial, size_t max); /** * Free a properties object. * * This should not be called on a properties object that has been * passed to gpucontext_init(). * * \param p properties object * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC void gpucontext_props_del(gpucontext_props *p); /** * Create a context on the specified device. * * \warning This function is not thread-safe. * * The passed-in properties pointer will be managed by this function * and needs not be freed. This means that you shouldn't touch the * properties object after passing it to this function. * * \param res a pointer to a location that will be allocated * \param name the backend name. * \param props a properties object for the context. Can be NULL for * defaults. * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpucontext_init(gpucontext **res, const char *name, gpucontext_props *props); /** * Dereference a context. * * This removes a reference to the context and as soon as the * reference count drops to zero the context is destroyed. The * context can stay alive after you call this function because some * object keep a reference to their context. * * \param ctx a valid context pointer. */ GPUARRAY_PUBLIC void gpucontext_deref(gpucontext *ctx); /** * Fetch a context property. * * The property must be a context property. The currently defined * properties and their type are defined in \ref props "Properties". * * \param ctx context * \param prop_id property id (from \ref props "Properties") * \param res pointer to the return space of the appropriate type * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpucontext_property(gpucontext *ctx, int prop_id, void *res); /** * Get a string describing `err`. * * If you need to get a description of a error that occurred during * context creation, call this function using NULL as the context. * This version of the call is not thread-safe. * * \param ctx the context in which the error occured * \param err error code * * \returns string description of error */ GPUARRAY_PUBLIC const char *gpucontext_error(gpucontext *ctx, int err); /** * Allocates a buffer of size `sz` in context `ctx`. * * Buffers are reference counted internally and start with a * reference count of 1. * * \param ctx a context pointer * \param sz the requested size * \param flags see \ref alloc_flags "Allocation flags" * \param data optional pointer to host buffer * \param ret error return pointer * * \returns A non-NULL pointer to a gpudata structure. This * structure is intentionally opaque as its content may change * according to the backend used. */ GPUARRAY_PUBLIC gpudata *gpudata_alloc(gpucontext *ctx, size_t sz, void *data, int flags, int *ret); /** * \defgroup alloc_flags Allocation flags * @{ */ /** * The buffer is available for reading and writing from kernels. * * This is the default (0) value. */ #define GA_BUFFER_READ_WRITE 0x00 /** * Allocate the buffer in device-only memory. * * This is the default (0) value. */ #define GA_BUFFER_DEV 0x00 /** * Signal that the memory in this buffer will only be read by kernels. 
* * You can use gpudata_write() to set the contents. * * You may not call gpudata_memset() with the resulting buffer as the * destination. */ #define GA_BUFFER_READ_ONLY 0x01 /** * Signal that the memory in this buffer will only be written by * kernels (i.e. it is an output buffer). * * You can read the contents with gpudata_read(). */ #define GA_BUFFER_WRITE_ONLY 0x02 /** * Initialize the contents of the buffer with the user-supplied host * buffer (`data`). This buffer must be at least `sz` large. */ #define GA_BUFFER_INIT 0x04 /** * Allocate the buffer in host-reachable memory enabling you to * retrieve a pointer to the contents as the * `GA_BUFFER_PROP_HOSTPOINTER` property. */ #define GA_BUFFER_HOST 0x08 /*#define GA_BUFFER_USE_DATA 0x10*/ /* The upper 16 bits are private flags */ #define GA_BUFFER_MASK 0xffff /** * @} */ /** * Increase the reference count of the passed buffer by 1. * * \param b a buffer */ GPUARRAY_PUBLIC void gpudata_retain(gpudata *b); /** * Release a buffer. * * This will decrement the reference count of the buffer by 1. If * that count reaches 0 all associated resources will be released. * * Even if your application does not have any references left to a * buffer it may still hang around if it is in use by internal * mechanisms (kernel call, ...) */ GPUARRAY_PUBLIC void gpudata_release(gpudata *b); /** * Check if two buffers may overlap. * * Both buffers must have been created with the same backend. * * \param a first buffer * \param b second buffer * \param ret error return pointer * * \retval 1 The buffers may overlap * \retval 0 The buffers do not overlap. * \retval -1 An error was encountered, `ret` contains a detailed * error code if not NULL. */ GPUARRAY_PUBLIC int gpudata_share(gpudata *a, gpudata *b, int *ret); /** * Copy the content of one buffer to another. * * Both buffers must be in the same context and contiguous. * Additionally, the buffers must not overlap; otherwise the content of * the destination buffer is undefined. * * \param dst destination buffer * \param dstoff offset inside the destination buffer * \param src source buffer * \param srcoff offset inside the source buffer * \param sz size of data to copy (in bytes) * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpudata_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz); /** * Transfer the content of a buffer across contexts. * * If possible it will try to do the transfer in an efficient way * using backend-specific tricks. If those fail or can't be used, it * will fall back to a copy through the host. * * \param dst buffer to transfer to * \param dstoff offset in the destination buffer * \param src buffer to transfer from * \param srcoff offset in the source buffer * \param sz size of the region to transfer * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpudata_transfer(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz); /** * Transfer data from a buffer to memory. * * The buffer and the memory region must be contiguous. * * \param dst destination in memory * \param src source buffer * \param srcoff offset inside the source buffer * \param sz size of data to copy (in bytes) * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpudata_read(void *dst, gpudata *src, size_t srcoff, size_t sz); /** * Transfer data from memory to a buffer. * * The buffer and the memory region must be contiguous.
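 *
 * A minimal round-trip sketch (illustrative only; `ctx` is assumed to be a
 * valid context and error checking is omitted):
 * \code
 * float host[4] = {1.f, 2.f, 3.f, 4.f};
 * float back[4];
 * gpudata *d = gpudata_alloc(ctx, sizeof(host), NULL, 0, NULL);
 * gpudata_write(d, 0, host, sizeof(host));   // host -> device
 * gpudata_read(back, d, 0, sizeof(back));    // device -> host
 * gpudata_release(d);
 * \endcode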
* * \param dst destination buffer * \param dstoff offset inside the destination buffer * \param src source in memory * \param sz size of data to copy (in bytes) * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpudata_write(gpudata *dst, size_t dstoff, const void *src, size_t sz); /** * Set a buffer to a byte pattern. * * This function acts like the C function memset() for device buffers. * * \param dst destination buffer * \param dstoff offset into the destination buffer * \param data byte value to write into the destination. * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpudata_memset(gpudata *dst, size_t dstoff, int data); /** * Synchronize a buffer. * * Waits for all previous read, writes, copies and kernel calls * involving this buffer to be finished. * * This call is not required for normal use of the library as all * exposed operations will properly synchronize amongst themselves. * This call may be useful in a performance timing context to ensure * that the work is really done, or before interaction with another * library to wait for pending operations. */ GPUARRAY_PUBLIC int gpudata_sync(gpudata *b); /** * Fetch a buffer property. * * Can be used for buffer properties and context properties. Context * properties will fetch the value for the context associated with the * buffer. The currently defined properties and their type are * defined in \ref props "Properties". * * \param buf buffer * \param prop_id property id (from \ref props "Properties") * \param res pointer to the return space of the appropriate type * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpudata_property(gpudata *buf, int prop_id, void *res); GPUARRAY_PUBLIC gpucontext *gpudata_context(gpudata *b); /** * Compile a kernel. * * Compile the kernel composed of the concatenated strings in * `strings` and return a callable kernel. If lengths is NULL then * all the strings must be NUL-terminated. Otherwise, it doesn't * matter (but the lengths must not include the final NUL byte if * provided). * * \param ctx context to work in * \param count number of input strings * \param strings table of string pointers * \param lengths (optional) length for each string in the table * \param fname name of the kernel function (as defined in the code) * \param numargs number of kernel arguments * \param typecodes the type of each argument * \param flags flags for compilation (see #ga_usefl) * \param ret error return pointer * \param err_str returns pointer to debug message from GPU backend * (if provided a non-NULL err_str) * * If `*err_str` is not NULL on return, the caller must call * `free(*err_str)` after use. * * \returns Allocated kernel structure or NULL if an error occured. * `ret` will be updated with the error code if not NULL. */ GPUARRAY_PUBLIC gpukernel *gpukernel_init(gpucontext *ctx, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int numargs, const int *typecodes, int flags, int *ret, char **err_str); /** * Retain a kernel. * * Increase the reference count of the passed kernel by 1. * * \param k a kernel */ GPUARRAY_PUBLIC void gpukernel_retain(gpukernel *k); /** * Release a kernel. * * Decrease the reference count of a kernel. If it reaches 0, all * resources associated with `k` will be released. * * If the reference count of a kernel reaches 0 while it is running, * this call will block until completion. 
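 *
 * A minimal lifetime sketch (illustrative; the arguments to
 * gpukernel_init() are placeholders and error handling is omitted):
 * \code
 * gpukernel *k = gpukernel_init(ctx, 1, &src, NULL, "fn",
 *                               2, types, 0, NULL, NULL);
 * // ... set arguments and launch with gpukernel_call() ...
 * gpukernel_release(k);   // drops the reference taken by gpukernel_init()
 * \endcode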
*/ GPUARRAY_PUBLIC void gpukernel_release(gpukernel *k); /** * Set kernel argument. * * Buffer arguments will not be retained and it is the * responsability of the caller to ensure that the value is still * valid whenever a call is made. * * \param k kernel * \param i argument index (starting at 0) * \param a pointer to argument * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpukernel_setarg(gpukernel *k, unsigned int i, void *a); /** * Call a kernel. * * If args is NULL, it will be assumed that the arguments have * previously been set with kernel_setarg(). * * \param k kernel * \param n number of dimensions of grid/block * \param gs grid sizes for this call (also known as global size) * \param ls block sizes for this call (also known as local size) * \param shared amount of dynamic shared memory to reserve * \param args table of pointers to each argument (optional). * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); /** * Fetch a property. * * Can be used for kernel and context properties. The context * properties will fetch the value for the context associated with the * kernel. The currently defined properties and their type are * defined in \ref props "Properties". * * \param k kernel * \param prop_id property id (from \ref props "Properties") * \param res pointer to the return space of the appropriate type * * \returns GA_NO_ERROR or an error code if an error occurred. */ GPUARRAY_PUBLIC int gpukernel_property(gpukernel *k, int prop_id, void *res); GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k); /** * \defgroup props Properties * @{ */ /* Start at 1 for GA_CTX_PROP_ */ /** * Get the device name for the context. * * Type: `char [256]` */ #define GA_CTX_PROP_DEVNAME 1 /* UNUSED: 2 */ /** * Get the local memory size available for a call in the context. * * Type: `size_t` */ #define GA_CTX_PROP_LMEMSIZE 3 /** * Number of compute units in this context. * * compute units times local size is more or less the expected * parallelism available on the device, but this is a very rough * estimate. * * Type: `unsigned int` */ #define GA_CTX_PROP_NUMPROCS 4 /* UNUSED: 5 */ /* UNUSED: 6 */ /** * Get the compatibility ID for the binaries generated with this context. * * Those binaries should work with any context which has the same ID. * * Type: `const char *` */ #define GA_CTX_PROP_BIN_ID 7 /** * Get a pre-allocated 8 byte buffer for kernel ops. * * This buffer is initialized to 0 on allocation and must always be * returned to that state after using it. * * This only to avoid the overhead of an allocation when calling a * kernel that may error out. It does not preclude the need for * synchronization and transfers. * * Type: `gpudata *` */ #define GA_CTX_PROP_ERRBUF 8 /** * Get the total size of global memory on the device. * * Type: `size_t` */ #define GA_CTX_PROP_TOTAL_GMEM 9 /** * Get the size of free global memory on the device. * * Type: `size_t` */ #define GA_CTX_PROP_FREE_GMEM 10 /** * Get the status of native float16 support on the device. * * Type: `int` */ #define GA_CTX_PROP_NATIVE_FLOAT16 11 /** * Get the maximum global size for dimension 0. * * Type: `size_t` */ #define GA_CTX_PROP_MAXGSIZE0 12 /** * Get the maximum global size for dimension 1. * * Type: `size_t` */ #define GA_CTX_PROP_MAXGSIZE1 13 /** * Get the maximum global size for dimension 2. 
* * Type: `size_t` */ #define GA_CTX_PROP_MAXGSIZE2 14 /** * Get the maximum local size for dimension 0. * * Type: `size_t` */ #define GA_CTX_PROP_MAXLSIZE0 15 /** * Get the maximum local size for dimension 1. * * Type: `size_t` */ #define GA_CTX_PROP_MAXLSIZE1 16 /** * Get the maximum local size for dimension 2. * * Type: `size_t` */ #define GA_CTX_PROP_MAXLSIZE2 17 /* UNUSED: 18 */ /** * Get a unique ID for the device behind the context. * * Type: `char [16]` */ #define GA_CTX_PROP_UNIQUE_ID 19 /** * Get the largest single block of memory that can be allocated. * * Type: `size_t` */ #define GA_CTX_PROP_LARGEST_MEMBLOCK 20 /* Start at 512 for GA_BUFFER_PROP_ */ #define GA_BUFFER_PROP_START 512 /** * Get the context in which this buffer was allocated. * * Type: `gpucontext *` */ #define GA_BUFFER_PROP_CTX 512 /** * The reference count of the buffer. Use only for debugging purposes. * * Type: `unsigned int` */ #define GA_BUFFER_PROP_REFCNT 513 /** * Size of the buffer on the device. * * This may be larger than the requested allocation size due to a * number of factors. * * Type: `size_t` */ #define GA_BUFFER_PROP_SIZE 514 /* Start at 1024 for GA_KERNEL_PROP_ */ #define GA_KERNEL_PROP_START 1024 /** * Get the context for which this kernel was compiled. * * Type: `gpucontext *` */ #define GA_KERNEL_PROP_CTX 1024 /** * Get the maximum block size (also known as local size) for a call to * this kernel. * * Type: `size_t` */ #define GA_KERNEL_PROP_MAXLSIZE 1025 /** * Get the preferred multiple of the block size for a call to this * kernel. * * Type: `size_t` */ #define GA_KERNEL_PROP_PREFLSIZE 1026 /** * Get the number of kernel arguments. * * Type: `unsigned int` */ #define GA_KERNEL_PROP_NUMARGS 1027 /** * Get the list of argument types for a kernel. * * This list is the same length as the number of arguments to the * kernel. Do not modify the returned list. * * Type: `const int *` */ #define GA_KERNEL_PROP_TYPES 1028 /** * @} */ /** * Flags for gpukernel_init(). * * It is important to specify these properly as the compilation * machinery will ensure that the proper configuration is made to * support the requested features or error out if the demands cannot * be met. * * \warning Failure to properly specify the feature flags will in most * cases result in silent data corruption (especially on ATI cards). */ typedef enum _ga_usefl { /* UNUSED: 0x01 */ /** * The kernel makes use of small (size is smaller than 4 bytes) types. */ GA_USE_SMALL = 0x02, /** * The kernel makes use of double or complex doubles. */ GA_USE_DOUBLE = 0x04, /** * The kernel makes use of complex or complex doubles. */ GA_USE_COMPLEX = 0x08, /** * The kernel makes use of half-floats (also known as float16). */ GA_USE_HALF = 0x10, /* If you add a new flag, don't forget to update both gpuarray_buffer_{cuda,opencl}.c with the implementation of your flag */ /** * The kernel is made of CUDA code. */ GA_USE_CUDA = 0x2000, /** * The kernel is made of OpenCL code.
*/ GA_USE_OPENCL = 0x4000, } ga_usefl; #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/gpuarray/buffer_blas.h000066400000000000000000000154451326743622600211350ustar00rootroot00000000000000#ifndef GPUARRAY_BUFFER_BLAS_H #define GPUARRAY_BUFFER_BLAS_H #include #include #ifdef __cplusplus extern "C" { #endif typedef enum _cb_order { cb_row, cb_column } cb_order; #define cb_c cb_row #define cb_fortran cb_column typedef enum _cb_side { cb_left, cb_right } cb_side; typedef enum _cb_transpose { cb_no_trans, cb_trans, cb_conj_trans } cb_transpose; typedef enum _cb_uplo { cb_upper, cb_lower } cb_uplo; GPUARRAY_PUBLIC int gpublas_setup(gpucontext *ctx); GPUARRAY_PUBLIC void gpublas_teardown(gpucontext *ctx); GPUARRAY_PUBLIC const char *gpublas_error(gpucontext *ctx); GPUARRAY_PUBLIC int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); GPUARRAY_PUBLIC int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); GPUARRAY_PUBLIC int gpublas_ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY); GPUARRAY_PUBLIC int gpublas_sgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY); GPUARRAY_PUBLIC int gpublas_dgemv( cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY); GPUARRAY_PUBLIC int gpublas_hgemm( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc); GPUARRAY_PUBLIC int gpublas_sgemm( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc); GPUARRAY_PUBLIC int gpublas_dgemm( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc); GPUARRAY_PUBLIC int gpublas_hger( cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); GPUARRAY_PUBLIC int gpublas_sger( cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); GPUARRAY_PUBLIC int gpublas_dger( cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); GPUARRAY_PUBLIC int gpublas_hgemmBatch( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount, int flags); 
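/*
 * Example (an illustrative sketch, not part of the original header): a
 * single-precision GEMM on row-major M x K and K x N buffers already
 * resident on the device.  `ctx`, `A`, `B` and `C` are assumed to be
 * valid; error checking is omitted.
 *
 *   gpublas_setup(ctx);
 *   gpublas_sgemm(cb_c, cb_no_trans, cb_no_trans,
 *                 M, N, K,
 *                 1.0f, A, 0, K,     // A: M x K, lda = K
 *                       B, 0, N,     // B: K x N, ldb = N
 *                 0.0f, C, 0, N);    // C: M x N, ldc = N
 */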
GPUARRAY_PUBLIC int gpublas_hgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_sgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_dgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_sgemmBatch( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_dgemmBatch( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_hgemvBatch( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_sgemvBatch( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_dgemvBatch( cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_hgerBatch( cb_order order, size_t M, size_t N, float alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_sgerBatch( cb_order order, size_t M, size_t N, float alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_dgerBatch( cb_order order, size_t M, size_t N, double alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/gpuarray/buffer_collectives.h000066400000000000000000000210161326743622600225170ustar00rootroot00000000000000#ifndef GPUARRAY_BUFFER_COLLECTIVES_H #define GPUARRAY_BUFFER_COLLECTIVES_H #include #include #ifdef __cplusplus extern "C" { #endif // __cplusplus #ifdef CONFUSE_EMACS } #endif // CONFUSE_EMACS 
/***************************************************************************** * Multi-gpu collectives buffer interface * ******************************************************************************/ /** * Multi-gpu communicator structure. */ struct _gpucomm; typedef struct _gpucomm gpucomm; /* * \enum gpucomm_reduce_ops * * \brief Reduction operations */ enum gpucomm_reduce_ops { GA_SUM = 0, //!< to sum (elemwise) arrays across ranks GA_PROD = 1, //!< to multiply (elemwise) arrays across ranks GA_MAX = 2, //!< to find max (elemwise) of arrays across ranks GA_MIN = 3, //!< to find min (elemwise) of arrays across ranks }; #define GA_COMM_ID_BYTES 128 //!< sizeof(gpucommCliqueId) /** * Dummy struct to define byte-array's length through a type */ typedef struct _gpucommCliqueId { char internal[GA_COMM_ID_BYTES]; } gpucommCliqueId; /** * Create a new gpu communicator instance. * * This must be called in parallel by all participants in the same * world. The call will block until all participants have joined in. * The world is defined by a shared comm_id. * * \param comm pointer to get a new gpu communicator * \param ctx gpu context in which `comm` will be used * (contains device information) * \param comm_id id unique to communicators consisting a world * \param ndev number of communicators/devices participating in the world * \param rank user-defined rank, from 0 to `ndev`-1. Must be unique * for the world. * * \returns error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_new(gpucomm** comm, gpucontext* ctx, gpucommCliqueId comm_id, int ndev, int rank); /** * Destroy a gpu communicator instance. * * \param comm gpu communicator to be destroyed */ GPUARRAY_PUBLIC void gpucomm_free(gpucomm* comm); /** * Returns nice error message concerning \ref GA_COMM_ERROR. * * \param ctx gpu context in which communicator was used * * \returns useful backend error message */ GPUARRAY_PUBLIC const char* gpucomm_error(gpucontext* ctx); /** * Returns gpu context in which `comm` is used. * * \param comm gpu communicator * * \returns gpu context */ GPUARRAY_PUBLIC gpucontext* gpucomm_context(gpucomm* comm); /** * Creates a unique `comm_id`. * * The id is guarenteed to be unique in the same host, but not * necessarily across hosts. * * \param ctx gpu context * \param comm_id pointer to instance containing id * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_gen_clique_id(gpucontext* ctx, gpucommCliqueId* comm_id); /** * Returns total number of devices participating in `comm`'s world. * * \param comm gpu communicator * \param devcount pointer to store the number of devices * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_get_count(gpucomm* comm, int* devcount); /** * Returns the rank of `comm` inside its world. * * \param comm gpu communicator * \param rank pointer to store the rank * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_get_rank(gpucomm* comm, int* rank); /** * Reduce collective operation for ranks in a communicator world * [buffer level]. 
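 *
 * A rough per-process setup sketch (illustrative; one process per device,
 * distribution of the clique id between processes — e.g. over MPI — is
 * assumed and not shown, error checking omitted):
 * \code
 * gpucommCliqueId id;
 * gpucomm *comm;
 * if (rank == 0) gpucomm_gen_clique_id(ctx, &id);
 * // ... share `id` with the other processes by some external means ...
 * gpucomm_new(&comm, ctx, id, ndev, rank);
 * gpucomm_reduce(src, 0, dest, 0, count, GA_FLOAT, GA_SUM, 0, comm);
 * \endcode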
* * \param src data in device's buffer to be reduced * \param offsrc memory offset after which data is saved in buffer * `src` * \param dest data in device's buffer to collect result * \param offdest memory offset after which data will be saved in * buffer `dest` * \param count number of elements to be reduced in each array * \param typecode elements' data type * \param opcode reduce operation code * \param root rank in `comm` which will collect result * \param comm gpu communicator * * \note Non root ranks can call this, using a NULL `dest`. In this * case, `offdest` will not be used. * * \note Must be called separately for each rank in `comm`. * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, int opcode, int root, gpucomm* comm); /** * AllReduce collective operation for ranks in a communicator world * [buffer level]. * * Reduces data pointed by `src` using op operation and leaves * identical copies of result in data pointed by `dest` on each rank * of `comm`. * * \param src data in device's buffer to be reduced * \param offsrc memory offset after which data is saved in buffer * `src` * \param dest data in device's buffer to collect result * \param offdest memory offset after which data will be saved in * buffer `dest` * \param count number of elements to be reduced in each array * \param typecode elements' data type * \param opcode reduce operation code (see #gpucomm_reduce_ops) * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_all_reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm* comm); /** * ReduceScatter collective operation for ranks in a communicator * world [buffer level]. * * Reduces data pointed by `src` using `opcode` operation and leaves * reduced result scattered over data pointed by `dest` in the * user-defined rank order in `comm`. * * \param src data in device's buffer to be reduced * \param offsrc memory offset after which data is saved in buffer * `src` * \param dest data in device's buffer to collect scattered result * \param offdest memory offset after which data will be saved in * buffer `dest` * \param count number of elements to be contained in result `dest` * \param typecode elements' data type * \param opcode reduce operation code (see #gpucomm_reduce_ops) * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm* comm); /** * Broadcast collective operation for ranks in a communicator world * [buffer level]. * * Copies data pointed by `array` to all ranks in `comm`. * * \param array data in device's buffer to get copied or be received * \param offset memory offset after which data in `array` begin * \param count number of elements to be contained in `array` * \param typecode elements' data type * \param root rank in `comm` which broadcasts its array * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. 
* * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_broadcast(gpudata* array, size_t offset, size_t count, int typecode, int root, gpucomm* comm); /** * AllGather collective operation for ranks in a communicator world. * * Each rank receives all data pointed by `src` of every rank in the * user-defined rank order in `comm`. * * \param src data in device's buffer to be gathered * \param offsrc memory offset after which data in `src` begin * \param dest data in device's buffer to gather from all ranks * \param offdest memory offset after which data in `dest` begin * \param count number of elements to be gathered from each rank in * `src` * \param typecode elements' data type * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_all_gather(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, gpucomm* comm); #ifdef __cplusplus } #endif #endif // GPUARRAY_BUFFER_COLLECTIVES_H libgpuarray-0.7.6/src/gpuarray/collectives.h000066400000000000000000000104521326743622600211700ustar00rootroot00000000000000#ifndef GPUARRAY_COLLECTIVES_H #define GPUARRAY_COLLECTIVES_H #include #include #include #ifdef __cplusplus extern "C" { #endif // __cplusplus #ifdef CONFUSE_EMACS } #endif // CONFUSE_EMACS /***************************************************************************** * Multi-gpu collectives interface * ******************************************************************************/ /** * Reduce collective operation for non root participant ranks in a * communicator world. * * \param src array to be reduced * \param opcode reduce operation code, see #gpucomm_reduce_ops * \param root rank in `comm` which will collect result * \param comm gpu communicator * * \note Root rank of reduce operation must call GpuArray_reduce(). * \note Must be called separately for each rank in `comm`, except root rank. * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_reduce_from(const GpuArray* src, int opcode, int root, gpucomm* comm); /** * Reduce collective operation for ranks in a communicator world. * * \param src array to be reduced * \param dest array to collect reduce operation result * \param opcode reduce operation code, see #gpucomm_reduce_ops * \param root rank in `comm` which will collect result * \param comm gpu communicator * * \note Can be used by root and non root ranks alike. * * \note Non root ranks can call this, using a NULL `dest`. * \note Must be called separately for each rank in `comm` (non root * can call GpuArray_reduce_from() instead). * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_reduce(const GpuArray* src, GpuArray* dest, int opcode, int root, gpucomm* comm); /** * AllReduce collective operation for ranks in a communicator world. * * Reduces `src` using op operation and leaves identical copies of * result in `dest` on each rank of `comm`. * * \param src array to be reduced * \param dest array to collect reduce operation result * \param opcode reduce operation code, see #gpucomm_reduce_ops * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_all_reduce(const GpuArray* src, GpuArray* dest, int opcode, gpucomm* comm); /** * ReduceScatter collective operation for ranks in a communicator world. 
* * Reduces data in `src` using `opcode` operation and leaves reduced * result scattered over `dest` in the user-defined rank order in * `comm`. * * \param src array to be reduced * \param dest array to collect reduce operation scattered result * \param opcode reduce operation code, see #gpucomm_reduce_ops * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_reduce_scatter(const GpuArray* src, GpuArray* dest, int opcode, gpucomm* comm); /** * Broadcast collective operation for ranks in a communicator world. * * Copies `array` to all ranks in `comm`. * * \param array array to be broadcasted, if root rank, else to receive * \param root rank in `comm` which broadcasts its array * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_broadcast(GpuArray* array, int root, gpucomm* comm); /** * AllGather collective operation for ranks in a communicator world. * * Each rank receives all `src` arrays from every rank in the * user-defined rank order in `comm`. * * \param src array to be gathered * \param dest array to receive all gathered arrays from ranks in * `comm` * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. * * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_all_gather(const GpuArray* src, GpuArray* dest, gpucomm* comm); #ifdef __cplusplus } #endif #endif // GPUARRAY_COLLECTIVES_H libgpuarray-0.7.6/src/gpuarray/config.h000066400000000000000000000015661326743622600201270ustar00rootroot00000000000000#ifndef GPUARRAY_CONFIG #define GPUARRAY_CONFIG /* The following included file should have been generated by CMake. */ #include #define GPUARRAY_API_VERSION 2 #ifdef GPUARRAY_SHARED #ifdef _WIN32 #ifdef GPUARRAY_BUILDING_DLL #define GPUARRAY_PUBLIC __declspec(dllexport) #else #define GPUARRAY_PUBLIC __declspec(dllimport) #endif #else #if __GNUC__ >= 4 #define GPUARRAY_PUBLIC __attribute__((visibility ("default"))) #else #error "Don't know how to export symbols on this platform" #endif #endif #else #define GPUARRAY_PUBLIC #endif #ifdef _MSC_VER #include #ifndef inline #define inline __inline #endif #if _MSC_VER < 1600 #include #else #include #endif #define ssize_t intptr_t #define SSIZE_MAX INTPTR_MAX #else #include #include #endif #endif libgpuarray-0.7.6/src/gpuarray/elemwise.h000066400000000000000000000072771326743622600205010ustar00rootroot00000000000000#ifndef GPUARRAY_ELEMWISE_H #define GPUARRAY_ELEMWISE_H /** \file elemwise.h * \brief Custom elementwise operations generator. */ #include #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif struct _GpuElemwise; /** * Elementwise generator structure. * * The contents are private. */ typedef struct _GpuElemwise GpuElemwise; /** * Argument information structure for GpuElemwise. */ typedef struct _gpuelemwise_arg { /** * Name of this argument in the associated expression, mandatory. */ const char *name; /** * Type of argument, mandatory (not GA_BUFFER, the content dtype) */ int typecode; /** * Argument flags, mandatory (see \ref eflags). */ int flags; /** * \defgroup eflags GpuElemwise argument flags * @{ */ /** * Argument is a scalar passed from the CPU, requires nd == 0. */ #define GE_SCALAR 0x0001 /** * Array is read from in the expression. */ #define GE_READ 0x0002 /** * Array is written to in the expression. 
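 *
 * For example (an illustrative sketch, not from the original docs), the
 * arguments of the expression "z = x + y" could be described as:
 * \code
 * gpuelemwise_arg args[3] = {
 *   {"x", GA_FLOAT, GE_READ},
 *   {"y", GA_FLOAT, GE_READ},
 *   {"z", GA_FLOAT, GE_WRITE}
 * };
 * \endcode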
*/ #define GE_WRITE 0x0004 /** * }@ */ } gpuelemwise_arg; /** * Create a new GpuElemwise. * * This will allocate and initialized a new GpuElemwise object. This * object can be used to run the specified operation on different sets * of arrays. * * The argument descriptor name the arguments and provide their data * types and geometry (arrays or scalars). They also specify if the * arguments are used for reading or writing. An argument can be used * for both. * * The expression is a C-like string performing an operation with * scalar values named according to the argument descriptors. All of * the indexing and selection of the right values is handled by the * GpuElemwise code. * * \param ctx the context in which to run the operations * \param preamble code to be inserted before the kernel code * \param expr the expression to compute * \param n the number of arguments * \param args the argument descriptors * \param nd the number of dimensions to precompile for * \param flags see \ref elem_flags "GpuElemwise flags" * * \returns a new GpuElemwise object or NULL */ GPUARRAY_PUBLIC GpuElemwise *GpuElemwise_new(gpucontext *ctx, const char *preamble, const char *expr, unsigned int n, gpuelemwise_arg *args, unsigned int nd, int flags); /** * \defgroup elem_flags GpuElemwise flags * @{ */ /** * Don't precompile kernels for 64-bits addressing. */ #define GE_NOADDR64 0x0001 /** * Convert float16 inputs to float32 for computation. */ #define GE_CONVERT_F16 0x0002 /** * @} */ /** * Free all storage associated with a GpuElemwise. * * \param ge the GpuElemwise object to free. */ GPUARRAY_PUBLIC void GpuElemwise_free(GpuElemwise *ge); /** * Run a GpuElemwise on some inputs. * * \param ge the GpuElemwise to run * \param args pointers to the arguments (must macth what was described by * the argument descriptors) * \param flags see \ref elem_call_flags "GpuElemwise call flags" */ GPUARRAY_PUBLIC int GpuElemwise_call(GpuElemwise *ge, void **args, int flags); /** * \defgroup elem_call_flags GpuElemwise call flags * @{ */ /** * Allow broadcasting of dimensions of size 1. */ #define GE_BROADCAST 0x0100 /** * Disable dimension collapsing (not recommended). */ #define GE_NOCOLLAPSE 0x0200 /** * Allow implicit left-padding of shape with dimensions of size 1. */ #define GE_PADSHAPE 0x0400 /** * @} */ #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/gpuarray/error.h000066400000000000000000000021311326743622600200000ustar00rootroot00000000000000#ifndef GPUARRAY_ERROR_H #define GPUARRAY_ERROR_H /** \file gpuarray/error.h * \brief Error functions. */ #include #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif /** * List of all the possible error codes. */ enum ga_error { GA_NO_ERROR = 0, GA_MEMORY_ERROR, GA_VALUE_ERROR, GA_IMPL_ERROR, /* call buffer_error() for more details */ GA_INVALID_ERROR, GA_UNSUPPORTED_ERROR, GA_SYS_ERROR, /* look at errno for more details */ GA_RUN_ERROR, GA_DEVSUP_ERROR, GA_READONLY_ERROR, GA_WRITEONLY_ERROR, GA_BLAS_ERROR, GA_UNALIGNED_ERROR, GA_COPY_ERROR, GA_NODEV_ERROR, GA_MISC_ERROR, GA_COMM_ERROR, GA_XLARGE_ERROR, GA_LOAD_ERROR, /* Add more error types if needed, but at the end */ /* Don't forget to sync with Gpu_error() */ }; /** * Returns a user-readable description for most error codes. * * Some errors only happen in a context and in those cases Gpu_error() * will provide more details as to the reason for the error. 
*/ GPUARRAY_PUBLIC const char *gpuarray_error_str(int err); #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/gpuarray/ext_cuda.h000066400000000000000000000034231326743622600204500ustar00rootroot00000000000000#ifndef LIBGPU_EXT_CUDA #define LIBGPU_EXT_CUDA #include #include #include #include #ifdef __cplusplus extern "C" { #endif /** @cond NEVER */ static void (*cuda_enter)(gpucontext *); static void (*cuda_exit)(gpucontext *); static gpucontext *(*cuda_make_ctx)(CUcontext, int); static CUstream (*cuda_get_stream)(void *); static gpudata *(*cuda_make_buf)(void *, CUdeviceptr, size_t); static size_t (*cuda_get_sz)(gpudata *); static int (*cuda_wait)(gpudata *, int); static int (*cuda_record)(gpudata *, int); static CUipcMemHandle (*cuda_get_ipc_handle)(gpudata *d); static gpudata *(*cuda_open_ipc_handle)(gpucontext *c, CUipcMemHandle h, size_t sz); /** @endcond */ static void setup_ext_cuda(void) { // The casts are necessary to reassure C++ compilers cuda_enter = (void (*)(gpucontext *))gpuarray_get_extension("cuda_enter"); cuda_exit = (void (*)(gpucontext *))gpuarray_get_extension("cuda_exit"); cuda_make_ctx = (gpucontext *(*)(CUcontext, int))gpuarray_get_extension("cuda_make_ctx"); cuda_get_stream = (CUstream (*)(void *))gpuarray_get_extension("cuda_get_stream"); cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))gpuarray_get_extension("cuda_make_buf"); cuda_get_sz = (size_t (*)(gpudata *))gpuarray_get_extension("cuda_get_sz"); cuda_wait = (int (*)(gpudata *, int))gpuarray_get_extension("cuda_wait"); cuda_record = (int (*)(gpudata *, int))gpuarray_get_extension("cuda_record"); cuda_get_ipc_handle = (CUipcMemHandle (*)(gpudata *))gpuarray_get_extension("cuda_get_ipc_handle"); cuda_open_ipc_handle = (gpudata *(*)(gpucontext *c, CUipcMemHandle h, size_t sz))gpuarray_get_extension("cuda_open_ipc_handle"); } #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/gpuarray/extension.h000066400000000000000000000013771326743622600206760ustar00rootroot00000000000000#ifndef GPUARRAY_EXTENSIONS_H #define GPUARRAY_EXTENSIONS_H /** \file extension.h * \brief Extensions access. */ #include #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif /* Keep in sync with the flags in private_cuda.h */ #define GPUARRAY_CUDA_CTX_NOFREE 0x10000000 /* DONTFREE */ #define GPUARRAY_CUDA_WAIT_READ 0x10000 /* CUDA_WAIT_READ */ #define GPUARRAY_CUDA_WAIT_WRITE 0x20000 /* CUDA_WAIT_WRITE */ typedef struct _GpuArrayIpcMemHandle { char priv[64]; } GpuArrayIpcMemHandle; /** * Obtain a function pointer for an extension. * * \returns A function pointer or NULL if the extension was not found. */ GPUARRAY_PUBLIC void * gpuarray_get_extension(const char *name); #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/gpuarray/kernel.h000066400000000000000000000065521326743622600201420ustar00rootroot00000000000000#ifndef GPUARRAY_KERNEL_H #define GPUARRAY_KERNEL_H /** \file kernel.h * \brief Kernel functions. */ #include #include #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif /** * Kernel information structure. */ typedef struct _GpuKernel { /** * Device kernel reference. */ gpukernel *k; /** * Argument buffer. */ void **args; } GpuKernel; /** * Initialize a kernel structure. * * `lens` holds the size of each source string. If is it NULL or an * element has a value of 0 the length will be determined using strlen() * or equivalent code. 
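 *
 * A rough usage sketch (illustrative only: the CLUDA-style source below and
 * the chosen names are assumptions, not a verbatim excerpt, and error
 * checking is omitted):
 * \code
 * static const char *src =
 *   "KERNEL void axpb(GLOBAL_MEM float *x, ga_size n) {"
 *   "  ga_size i = GID_0 * LDIM_0 + LID_0;"
 *   "  if (i < n) x[i] = 2.0f * x[i] + 1.0f;"
 *   "}";
 * int types[2] = {GA_BUFFER, GA_SIZE};
 * GpuKernel k;
 * size_t gs = 0, ls = 0;
 * GpuKernel_init(&k, ctx, 1, &src, NULL, "axpb", 2, types, 0, NULL);
 * GpuKernel_sched(&k, n, &gs, &ls);
 * // the launch itself is left out here; see GpuKernel_call() below
 * GpuKernel_clear(&k);
 * \endcode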
* * \param k a kernel structure * \param ctx context in which to build the kernel * \param count number of source code strings * \param strs C array of source code strings * \param lens C array with the size of each string or NULL * \param name name of the kernel function * \param argcount number of kerner arguments * \param types typecode for each argument * \param flags kernel use flags (see \ref ga_usefl) * \param err_str (if not NULL) location to write GPU-backend provided debug info * * If `*err_str` is returned not NULL then it must be free()d by the caller * * \return GA_NO_ERROR if the operation is successful * \return any other value if an error occured */ GPUARRAY_PUBLIC int GpuKernel_init(GpuKernel *k, gpucontext *ctx, unsigned int count, const char **strs, const size_t *lens, const char *name, unsigned int argcount, const int *types, int flags, char **err_str); /** * Clear and release data associated with a kernel. * * \param k the kernel to release */ GPUARRAY_PUBLIC void GpuKernel_clear(GpuKernel *k); /** * Returns the context in which a kernel was built. * * \param k a kernel * * \returns a context pointer */ GPUARRAY_PUBLIC gpucontext *GpuKernel_context(GpuKernel *k); GPUARRAY_PUBLIC int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *val); /** * Do a scheduling of local and global size for a kernel. * * This function will find an optimal grid and block size for the * number of elements specified in n when running kernel k. The * parameters may run a bit more instances than n for efficiency * reasons, so your kernel must be ready to deal with that. * * If either gs or ls is not 0 on entry its value will not be altered * and will be taken into account when choosing the other value. * * \param k the kernel to schedule for * \param n number of elements to handle * \param gs grid size (in/out) * \param ls local size (in/out) */ GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls); /** * Launch the execution of a kernel. * * \param k the kernel to launch * \param n dimensionality of the grid/blocks * \param gs sizes of launch grid * \param ls sizes of launch blocks * \param shared amount of dynamic shared memory to allocate * \param args table of pointers to arguments */ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); GPUARRAY_PUBLIC const char *GpuKernel_error(const GpuKernel *k, int err); #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/gpuarray/types.h000066400000000000000000000037161326743622600200250ustar00rootroot00000000000000 /* This file is generated by gen_types.py */ /** \file types.h * \brief Type declarations and access. */ #ifndef GPUARRAY_TYPES_H #define GPUARRAY_TYPES_H #include #include #include #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif /** * Structure that holds the properties of a type. */ typedef struct _gpuarray_type { /** * Type name to use in the buffers. */ const char *cluda_name; /** * Size of one element (in bytes). */ size_t size; /** * Alignement requirement for the type. */ size_t align; /** * Code for the type. */ int typecode; } gpuarray_type; /** * List of all built-in types. 
*/ enum GPUARRAY_TYPES { GA_BUFFER = -1, GA_BOOL = 0, GA_BYTE = 1, GA_UBYTE = 2, GA_SHORT = 3, GA_USHORT = 4, GA_INT = 5, GA_UINT = 6, GA_LONG = 7, GA_ULONG = 8, GA_LONGLONG = 9, GA_ULONGLONG = 10, GA_FLOAT = 11, GA_DOUBLE = 12, GA_QUAD = 13, GA_CFLOAT = 14, GA_CDOUBLE = 15, GA_CQUAD = 16, GA_HALF = 23, GA_SIZE = 24, GA_SSIZE = 25, /** \cond INTERNAL_DOCS */ GA_NBASE = 26, GA_DELIM = 255, /* To be forward-compatible with numpy */ /** \endcond */ GA_BYTE2, GA_UBYTE2, GA_BYTE3, GA_UBYTE3, GA_BYTE4, GA_UBYTE4, GA_BYTE8, GA_UBYTE8, GA_BYTE16, GA_UBYTE16, GA_SHORT2, GA_USHORT2, GA_SHORT3, GA_USHORT3, GA_SHORT4, GA_USHORT4, GA_SHORT8, GA_USHORT8, GA_SHORT16, GA_USHORT16, GA_INT2, GA_UINT2, GA_INT3, GA_UINT3, GA_INT4, GA_UINT4, GA_INT8, GA_UINT8, GA_INT16, GA_UINT16, GA_LONG2, GA_ULONG2, GA_LONG3, GA_ULONG3, GA_LONG4, GA_ULONG4, GA_LONG8, GA_ULONG8, GA_LONG16, GA_ULONG16, GA_FLOAT2, GA_FLOAT4, GA_FLOAT8, GA_FLOAT16, GA_DOUBLE2, GA_DOUBLE4, GA_DOUBLE8, GA_DOUBLE16, GA_HALF2, GA_HALF4, GA_HALF8, GA_HALF16, /** \cond INTERNAL_DOCS */ GA_NVEC, GA_ENDVEC = 512 /** \endcond */ }; #ifdef __cplusplus } #endif #endif /* GPUARRAY_TYPES */ libgpuarray-0.7.6/src/gpuarray/util.h000066400000000000000000000117401326743622600176320ustar00rootroot00000000000000#ifndef GPUARRAY_UTIL #define GPUARRAY_UTIL /** \file util.h * \brief Utility functions. */ #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif #include #include #include /** * Registers a type with the kernel machinery. * * \param t is a preallocated and filled gpuarray_type structure. The * memory can be allocated from static memory as it will never be * freed. * \param ret is a pointer where the error code (if any) will be * stored. It can be NULL in which case no error code will be * returned. If there is no error then the memory pointed to by * `ret` will be untouched. * * \returns The type code that corresponds to the registered type. * This code is only valid for the duration of the application and * cannot be reused between invocation. * * On error this function will return -1. */ GPUARRAY_PUBLIC int gpuarray_register_type(gpuarray_type *t, int *ret); /** * Get the type structure for a type. * * The resulting structure MUST NOT be modified. * * \param typecode the typecode to get structure for * * \returns A type structure pointer or NULL */ GPUARRAY_PUBLIC const gpuarray_type *gpuarray_get_type(int typecode); /** * Get the size of one element of a type. * * If the type does not exists this function returns (size_t)-1. * * \param typecode the type to get the element size for * * \returns the size */ GPUARRAY_PUBLIC size_t gpuarray_get_elsize(int typecode); /** * Return the type use flags for the specified typecodes. * * The flags for each type passed in are OR-ed together. * * To check for a single typecode, you have to pass the final -1 also. * * Passing a -1 as the sole argument is allowed and returns 0, however * useful that is. * * \param init a typecode * \param ... list of typecodes terminated by -1 * * \returns flags for all passed-in types. */ GPUARRAY_PUBLIC int gpuarray_type_flags(int init, ...); GPUARRAY_PUBLIC int gpuarray_type_flagsa(unsigned int n, gpuelemwise_arg *arg); /** * Perform dimension collapsing on the specified arguments. * * This function will check for dimension that are next to each other * and contiguous for all inputs and merge them together. This allows * to reduce the complexity of the indexing code in kernels and * therefore enables faster runtime for kernels. 
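 *
 * A small illustration (values chosen for the example): a single
 * C-contiguous float32 operand of shape (128, 256) has byte strides
 * (1024, 4); its two dimensions can be merged into one of 32768 elements:
 * \code
 * unsigned int nd = 2;
 * size_t dims[2] = {128, 256};
 * ssize_t str0[2] = {1024, 4};
 * ssize_t *strs[1] = {str0};
 * gpuarray_elemwise_collapse(1, &nd, dims, strs);
 * // expected outcome: nd == 1, dims[0] == 32768, str0[0] == 4
 * \endcode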
* * On return the nd, dims and strs will be overwritten with the * collapsed versions. * * For scalar arguments, strs[k] can be NULL. * * \param n The number of arguments * \param nd The number of dimensions of all arguments * \param dim The compute shape * \param strs The strides for all arguments * */ GPUARRAY_PUBLIC void gpuarray_elemwise_collapse(unsigned int n, unsigned int *nd, size_t *dim, ssize_t **strs); typedef struct _ga_half_t { uint16_t h; } ga_half_t; /* code strongly inspired from https://github.com/numpy/numpy/blob/master/numpy/core/src/npymath/halffloat.c#L246 */ static inline ga_half_t ga_float2half(float f) { union { float f; uint32_t bits; } bf; union { ga_half_t h; uint16_t bits; } bh; uint32_t f_exp, f_sig; uint16_t h_sgn, h_exp, h_sig; bf.f = f; h_sgn = (bf.bits&0x80000000u) >> 16; f_exp = (bf.bits&0x7f800000u); /* Exponent overflow/NaN converts to signed inf/NaN */ if (f_exp >= 0x47800000u) { if (f_exp == 0x7f800000u) { /* Inf or NaN */ f_sig = (bf.bits&0x007fffffu); if (f_sig != 0) { /* NaN - propagate the flag in the significand... */ bh.bits = (uint16_t) (0x7c00u + (f_sig >> 13)); /* ...but make sure it stays a NaN */ if (bh.bits == 0x7c00u) { bh.bits++; } bh.bits += h_sgn; return bh.h; } else { /* signed inf */ bh.bits = h_sgn + 0x7c00u; return bh.h; } } else { bh.bits = h_sgn + 0x7c00u; return bh.h; } } if (f_exp <= 0x38000000u) { /* * Signed zeros, subnormal floats, and floats with small * exponents all convert to signed zero halfs. */ if (f_exp < 0x33000000u) { bh.bits = h_sgn; return bh.h; } /* Make the subnormal significand */ f_exp >>= 23; f_sig = (0x00800000u + (bf.bits&0x007fffffu)); f_sig >>= (113 - f_exp); /* Handle rounding by adding 1 to the bit beyond half precision */ f_sig += 0x00001000u; h_sig = (uint16_t) (f_sig >> 13); /* * If the rounding causes a bit to spill into h_exp, it will * increment h_exp from zero to one and h_sig will be zero. * This is the correct result. */ bh.bits = h_sgn + h_sig; return bh.h; } /* Regular case with no overflow or underflow */ h_exp = (uint16_t) ((f_exp - 0x38000000u) >> 13); /* Handle rounding by adding 1 to the bit beyond half precision */ f_sig = (bf.bits&0x007fffffu); f_sig += 0x00001000u; h_sig = (uint16_t) (f_sig >> 13); bh.bits = h_sgn + h_exp + h_sig; return bh.h; } #ifdef __cplusplus } #endif #endif /* GPUARRAY_UTIL */ libgpuarray-0.7.6/src/gpuarray/wincompat/000077500000000000000000000000001326743622600205025ustar00rootroot00000000000000libgpuarray-0.7.6/src/gpuarray/wincompat/stdint.h000066400000000000000000000171271326743622600221700ustar00rootroot00000000000000// ISO C9x compliant stdint.h for Microsoft Visual Studio // Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 // // Copyright (c) 2006-2008 Alexander Chemeris // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // 3. The name of the author may be used to endorse or promote products // derived from this software without specific prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // /////////////////////////////////////////////////////////////////////////////// #if !defined(_MSC_VER) || _MSC_VER > 1600 // [ #error "Use this header only with Microsoft Visual C++ 2010 or older compilers!" #endif // _MSC_VER ] #ifndef _MSC_STDINT_H_ // [ #define _MSC_STDINT_H_ #if _MSC_VER > 1000 #pragma once #endif #include // For Visual Studio 6 in C++ mode and for many Visual Studio versions when // compiling for ARM we should wrap include with 'extern "C++" {}' // or compiler give many errors like this: // error C2733: second C linkage of overloaded function 'wmemchr' not allowed #ifdef __cplusplus extern "C" { #endif # include #ifdef __cplusplus } #endif // Define _W64 macros to mark types changing their size, like intptr_t. #ifndef _W64 # if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 # define _W64 __w64 # else # define _W64 # endif #endif // 7.18.1 Integer types // 7.18.1.1 Exact-width integer types // Visual Studio 6 and Embedded Visual C++ 4 doesn't // realize that, e.g. char has the same size as __int8 // so we give up on __intX for them. 
#if (_MSC_VER < 1300) typedef signed char int8_t; typedef signed short int16_t; typedef signed int int32_t; typedef unsigned char uint8_t; typedef unsigned short uint16_t; typedef unsigned int uint32_t; #else typedef signed __int8 int8_t; typedef signed __int16 int16_t; typedef signed __int32 int32_t; typedef unsigned __int8 uint8_t; typedef unsigned __int16 uint16_t; typedef unsigned __int32 uint32_t; #endif typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; // 7.18.1.2 Minimum-width integer types typedef int8_t int_least8_t; typedef int16_t int_least16_t; typedef int32_t int_least32_t; typedef int64_t int_least64_t; typedef uint8_t uint_least8_t; typedef uint16_t uint_least16_t; typedef uint32_t uint_least32_t; typedef uint64_t uint_least64_t; // 7.18.1.3 Fastest minimum-width integer types typedef int8_t int_fast8_t; typedef int16_t int_fast16_t; typedef int32_t int_fast32_t; typedef int64_t int_fast64_t; typedef uint8_t uint_fast8_t; typedef uint16_t uint_fast16_t; typedef uint32_t uint_fast32_t; typedef uint64_t uint_fast64_t; // 7.18.1.4 Integer types capable of holding object pointers #ifdef _WIN64 // [ typedef signed __int64 intptr_t; typedef unsigned __int64 uintptr_t; #else // _WIN64 ][ typedef _W64 signed int intptr_t; typedef _W64 unsigned int uintptr_t; #endif // _WIN64 ] // 7.18.1.5 Greatest-width integer types typedef int64_t intmax_t; typedef uint64_t uintmax_t; // 7.18.2 Limits of specified-width integer types #if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 // 7.18.2.1 Limits of exact-width integer types #define INT8_MIN ((int8_t)_I8_MIN) #define INT8_MAX _I8_MAX #define INT16_MIN ((int16_t)_I16_MIN) #define INT16_MAX _I16_MAX #define INT32_MIN ((int32_t)_I32_MIN) #define INT32_MAX _I32_MAX #define INT64_MIN ((int64_t)_I64_MIN) #define INT64_MAX _I64_MAX #define UINT8_MAX _UI8_MAX #define UINT16_MAX _UI16_MAX #define UINT32_MAX _UI32_MAX #define UINT64_MAX _UI64_MAX // 7.18.2.2 Limits of minimum-width integer types #define INT_LEAST8_MIN INT8_MIN #define INT_LEAST8_MAX INT8_MAX #define INT_LEAST16_MIN INT16_MIN #define INT_LEAST16_MAX INT16_MAX #define INT_LEAST32_MIN INT32_MIN #define INT_LEAST32_MAX INT32_MAX #define INT_LEAST64_MIN INT64_MIN #define INT_LEAST64_MAX INT64_MAX #define UINT_LEAST8_MAX UINT8_MAX #define UINT_LEAST16_MAX UINT16_MAX #define UINT_LEAST32_MAX UINT32_MAX #define UINT_LEAST64_MAX UINT64_MAX // 7.18.2.3 Limits of fastest minimum-width integer types #define INT_FAST8_MIN INT8_MIN #define INT_FAST8_MAX INT8_MAX #define INT_FAST16_MIN INT16_MIN #define INT_FAST16_MAX INT16_MAX #define INT_FAST32_MIN INT32_MIN #define INT_FAST32_MAX INT32_MAX #define INT_FAST64_MIN INT64_MIN #define INT_FAST64_MAX INT64_MAX #define UINT_FAST8_MAX UINT8_MAX #define UINT_FAST16_MAX UINT16_MAX #define UINT_FAST32_MAX UINT32_MAX #define UINT_FAST64_MAX UINT64_MAX // 7.18.2.4 Limits of integer types capable of holding object pointers #ifdef _WIN64 // [ # define INTPTR_MIN INT64_MIN # define INTPTR_MAX INT64_MAX # define UINTPTR_MAX UINT64_MAX #else // _WIN64 ][ # define INTPTR_MIN INT32_MIN # define INTPTR_MAX INT32_MAX # define UINTPTR_MAX UINT32_MAX #endif // _WIN64 ] // 7.18.2.5 Limits of greatest-width integer types #define INTMAX_MIN INT64_MIN #define INTMAX_MAX INT64_MAX #define UINTMAX_MAX UINT64_MAX // 7.18.3 Limits of other integer types #ifdef _WIN64 // [ # define PTRDIFF_MIN _I64_MIN # define PTRDIFF_MAX _I64_MAX #else // _WIN64 ][ # define PTRDIFF_MIN _I32_MIN # define 
PTRDIFF_MAX _I32_MAX #endif // _WIN64 ] #define SIG_ATOMIC_MIN INT_MIN #define SIG_ATOMIC_MAX INT_MAX #ifndef SIZE_MAX // [ # ifdef _WIN64 // [ # define SIZE_MAX _UI64_MAX # else // _WIN64 ][ # define SIZE_MAX _UI32_MAX # endif // _WIN64 ] #endif // SIZE_MAX ] // WCHAR_MIN and WCHAR_MAX are also defined in #ifndef WCHAR_MIN // [ # define WCHAR_MIN 0 #endif // WCHAR_MIN ] #ifndef WCHAR_MAX // [ # define WCHAR_MAX _UI16_MAX #endif // WCHAR_MAX ] #define WINT_MIN 0 #define WINT_MAX _UI16_MAX #endif // __STDC_LIMIT_MACROS ] // 7.18.4 Limits of other integer types #if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 // 7.18.4.1 Macros for minimum-width integer constants #define INT8_C(val) val##i8 #define INT16_C(val) val##i16 #define INT32_C(val) val##i32 #define INT64_C(val) val##i64 #define UINT8_C(val) val##ui8 #define UINT16_C(val) val##ui16 #define UINT32_C(val) val##ui32 #define UINT64_C(val) val##ui64 // 7.18.4.2 Macros for greatest-width integer constants #define INTMAX_C INT64_C #define UINTMAX_C UINT64_C #endif // __STDC_CONSTANT_MACROS ] #endif // _MSC_STDINT_H_ ] libgpuarray-0.7.6/src/gpuarray_array.c000066400000000000000000001066631326743622600200570ustar00rootroot00000000000000#ifdef _MSC_VER #define _CRT_SECURE_NO_WARNINGS #endif #include #include #include #include #include #include #include "private.h" #include "gpuarray/config.h" #include "gpuarray/array.h" #include "gpuarray/error.h" #include "gpuarray/kernel.h" #include "gpuarray/util.h" #include "util/error.h" #include "util/strb.h" #include "util/xxhash.h" struct extcopy_args { int itype; int otype; }; static int extcopy_eq(cache_key_t _k1, cache_key_t _k2) { struct extcopy_args *k1 = _k1; struct extcopy_args *k2 = _k2; return k1->itype == k2->itype && k1->otype == k2->otype; } static void extcopy_free(cache_key_t k) { free(k); } static uint32_t extcopy_hash(cache_key_t k) { return XXH32(k, sizeof(struct extcopy_args), 42); } static int ga_extcopy(GpuArray *dst, const GpuArray *src) { struct extcopy_args a, *aa; gpucontext *ctx = GpuArray_context(dst); GpuElemwise *k = NULL; void *args[2]; if (ctx != GpuArray_context(src)) return error_set(ctx->err, GA_INVALID_ERROR, "src and dst context differ"); a.itype = src->typecode; a.otype = dst->typecode; if (ctx->extcopy_cache != NULL) k = cache_get(ctx->extcopy_cache, &a); if (k == NULL) { gpuelemwise_arg gargs[2]; gargs[0].name = "src"; gargs[0].typecode = src->typecode; gargs[0].flags = GE_READ; gargs[1].name = "dst"; gargs[1].typecode = dst->typecode; gargs[1].flags = GE_WRITE; k = GpuElemwise_new(ctx, "", "dst = src", 2, gargs, 0, GE_CONVERT_F16); if (k == NULL) return ctx->err->code; aa = memdup(&a, sizeof(a)); if (aa == NULL) { GpuElemwise_free(k); return error_sys(ctx->err, "memdup"); } if (ctx->extcopy_cache == NULL) ctx->extcopy_cache = cache_twoq(4, 8, 8, 2, extcopy_eq, extcopy_hash, extcopy_free, (cache_freev_fn)GpuElemwise_free, ctx->err); if (ctx->extcopy_cache == NULL) return ctx->err->code; if (cache_add(ctx->extcopy_cache, aa, k) != 0) return error_set(ctx->err, GA_MISC_ERROR, "Could not store GpuElemwise copy kernel in context cache"); } args[0] = (void *)src; args[1] = (void *)dst; return GpuElemwise_call(k, args, GE_BROADCAST); } /* Value below which a size_t multiplication will never overflow. 
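   (Both factors being strictly below 1 << (sizeof(size_t)*4), i.e. 2 to
   half the bit width of size_t, guarantees their product fits in a
   size_t, so the division-based overflow check in GpuArray_empty is only
   needed once a factor reaches this bound.)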
*/ #define MUL_NO_OVERFLOW (1ULL << (sizeof(size_t) * 4)) void GpuArray_fix_flags(GpuArray *a) { /* Only keep the writable flag */ a->flags &= GA_WRITEABLE; /* Set the other flags if applicable */ if (GpuArray_is_c_contiguous(a)) a->flags |= GA_C_CONTIGUOUS; if (GpuArray_is_f_contiguous(a)) a->flags |= GA_F_CONTIGUOUS; if (GpuArray_is_aligned(a)) a->flags |= GA_ALIGNED; } int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode, unsigned int nd, const size_t *dims, ga_order ord) { size_t size = gpuarray_get_elsize(typecode); unsigned int i; int res = GA_NO_ERROR; if (typecode == GA_SIZE || typecode == GA_SSIZE) return error_set(ctx->err, GA_VALUE_ERROR, "Cannot create array with size type"); if (ord == GA_ANY_ORDER) ord = GA_C_ORDER; if (ord != GA_C_ORDER && ord != GA_F_ORDER) return error_set(ctx->err, GA_VALUE_ERROR, "Invalid order"); for (i = 0; i < nd; i++) { size_t d = dims[i]; /* Check for overflow */ if ((d >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) && d > 0 && SIZE_MAX / d < size) return error_set(ctx->err, GA_XLARGE_ERROR, "Total array size greater than addressable space"); size *= d; } /* We add a offset of 64 to all arrays in DEBUG to help catch errors. */ #ifdef DEBUG assert(SIZE_MAX - size > 64); size += 64; #endif a->data = gpudata_alloc(ctx, size, NULL, 0, &res); if (a->data == NULL) return ctx->err->code; a->nd = nd; #ifdef DEBUG a->offset = 64; #else a->offset = 0; #endif a->typecode = typecode; a->dimensions = calloc(nd, sizeof(size_t)); a->strides = calloc(nd, sizeof(ssize_t)); /* F/C distinction comes later */ a->flags = GA_BEHAVED; if (a->dimensions == NULL || a->strides == NULL) { GpuArray_clear(a); return error_sys(ctx->err, "calloc"); } /* Mult will not overflow since calloc succeded */ memcpy(a->dimensions, dims, sizeof(size_t)*nd); size = gpuarray_get_elsize(typecode); /* mults will not overflow, checked on entry */ switch (ord) { case GA_C_ORDER: for (i = nd; i > 0; i--) { a->strides[i-1] = size; size *= a->dimensions[i-1]; } a->flags |= GA_C_CONTIGUOUS; break; case GA_F_ORDER: for (i = 0; i < nd; i++) { a->strides[i] = size; size *= a->dimensions[i]; } a->flags |= GA_F_CONTIGUOUS; break; default: assert(0); /* cannot be reached */ } if (a->nd <= 1) a->flags |= GA_F_CONTIGUOUS|GA_C_CONTIGUOUS; return GA_NO_ERROR; } int GpuArray_zeros(GpuArray *a, gpucontext *ctx, int typecode, unsigned int nd, const size_t *dims, ga_order ord) { int err; err = GpuArray_empty(a, ctx, typecode, nd, dims, ord); if (err != GA_NO_ERROR) return err; err = gpudata_memset(a->data, a->offset, 0); if (err != GA_NO_ERROR) { GpuArray_clear(a); } return err; } int GpuArray_fromdata(GpuArray *a, gpudata *data, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, int writeable) { gpucontext *ctx = gpudata_context(data); if (typecode == GA_SIZE || typecode == GA_SSIZE) return error_set(ctx->err, GA_VALUE_ERROR, "Cannot create array with size type"); assert(data != NULL); a->data = data; gpudata_retain(a->data); a->nd = nd; a->offset = offset; a->typecode = typecode; a->dimensions = calloc(nd, sizeof(size_t)); a->strides = calloc(nd, sizeof(ssize_t)); a->flags = (writeable ? 
GA_WRITEABLE : 0); if (a->dimensions == NULL || a->strides == NULL) { GpuArray_clear(a); return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } memcpy(a->dimensions, dims, nd*sizeof(size_t)); memcpy(a->strides, strides, nd*sizeof(ssize_t)); GpuArray_fix_flags(a); return GA_NO_ERROR; } int GpuArray_view(GpuArray *v, const GpuArray *a) { gpucontext *ctx = GpuArray_context(a); v->data = a->data; gpudata_retain(a->data); v->nd = a->nd; v->offset = a->offset; v->typecode = a->typecode; v->flags = a->flags; v->dimensions = calloc(v->nd, sizeof(size_t)); v->strides = calloc(v->nd, sizeof(ssize_t)); if (v->dimensions == NULL || v->strides == NULL) { GpuArray_clear(v); return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } memcpy(v->dimensions, a->dimensions, v->nd*sizeof(size_t)); memcpy(v->strides, a->strides, v->nd*sizeof(ssize_t)); return GA_NO_ERROR; } int GpuArray_sync(GpuArray *a) { return gpudata_sync(a->data); } int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps) { gpucontext *ctx = GpuArray_context(a); unsigned int i, new_i; unsigned int new_nd = a->nd; size_t *newdims; ssize_t *newstrs; size_t new_offset = a->offset; if ((starts == NULL) || (stops == NULL) || (steps == NULL)) return error_set(ctx->err, GA_VALUE_ERROR, "Invalid slice (contains NULL)"); for (i = 0; i < a->nd; i++) { if (steps[i] == 0) new_nd -= 1; } newdims = calloc(new_nd, sizeof(size_t)); newstrs = calloc(new_nd, sizeof(ssize_t)); if (newdims == NULL || newstrs == NULL) { free(newdims); free(newstrs); return error_sys(ctx->err, "calloc"); } new_i = 0; for (i = 0; i < a->nd; i++) { if (starts[i] < -1 || (starts[i] > 0 && (size_t)starts[i] > a->dimensions[i])) { free(newdims); free(newstrs); return error_fmt(ctx->err, GA_VALUE_ERROR, "Invalid slice value: slice(%lld, %lld, %lld) when " "indexing array on dimension %u of length %lld", starts[i], stops[i], steps[i], i, a->dimensions[i]); } if (steps[i] == 0 && (starts[i] == -1 || (size_t)starts[i] >= a->dimensions[i])) { free(newdims); free(newstrs); return error_fmt(ctx->err, GA_VALUE_ERROR, "Invalid slice value: slice(%lld, %lld, %lld) when " "indexing array on dimension %u of length %lld", starts[i], stops[i], steps[i], i, a->dimensions[i]); } new_offset += starts[i] * a->strides[i]; if (steps[i] != 0) { if ((stops[i] < -1 || (stops[i] > 0 && (size_t)stops[i] > a->dimensions[i])) || (stops[i]-starts[i])/steps[i] < 0) { free(newdims); free(newstrs); return error_fmt(ctx->err, GA_VALUE_ERROR, "Invalid slice value: slice(%lld, %lld, %lld) when " "indexing array on dimension %u of length %lld", starts[i], stops[i], steps[i], i, a->dimensions[i]); } newstrs[new_i] = steps[i] * a->strides[i]; newdims[new_i] = (stops[i]-starts[i]+steps[i]- (steps[i] < 0? 
-1 : 1))/steps[i]; new_i++; } } a->nd = new_nd; a->offset = new_offset; free(a->dimensions); a->dimensions = newdims; free(a->strides); a->strides = newstrs; GpuArray_fix_flags(a); return GA_NO_ERROR; } int GpuArray_index(GpuArray *r, const GpuArray *a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps) { int err; err = GpuArray_view(r, a); if (err != GA_NO_ERROR) return err; err = GpuArray_index_inplace(r, starts, stops, steps); if (err != GA_NO_ERROR) GpuArray_clear(r); return err; } static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, GpuArray *a, const GpuArray *v, const GpuArray *ind, int addr32) { strb sb = STRB_STATIC_INIT; int *atypes; char *sz, *ssz; unsigned int i, i2; unsigned int nargs, apos; int flags = 0; int res; nargs = 9 + 2 * v->nd; atypes = calloc(nargs, sizeof(int)); if (atypes == NULL) return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); if (addr32) { sz = "ga_uint"; ssz = "ga_int"; } else { sz = "ga_size"; ssz = "ga_ssize"; } apos = 0; strb_appendf(&sb, "#include \"cluda.h\"\n" "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, " "GLOBAL_MEM const %s *v, ga_size v_off,", gpuarray_get_type(a->typecode)->cluda_name, gpuarray_get_type(v->typecode)->cluda_name); atypes[apos++] = GA_BUFFER; atypes[apos++] = GA_SIZE; atypes[apos++] = GA_BUFFER; atypes[apos++] = GA_SIZE; for (i = 0; i < v->nd; i++) { strb_appendf(&sb, " ga_ssize s%u, ga_size d%u,", i, i); atypes[apos++] = GA_SSIZE; atypes[apos++] = GA_SIZE; } strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size i_off, " "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n", gpuarray_get_type(ind->typecode)->cluda_name); atypes[apos++] = GA_BUFFER; atypes[apos++] = GA_SIZE; atypes[apos++] = GA_SIZE; atypes[apos++] = GA_SIZE; atypes[apos++] = GA_BUFFER; assert(apos == nargs); strb_appendf(&sb, " const %s idx0 = LDIM_0 * GID_0 + LID_0;\n" " const %s numThreads0 = LDIM_0 * GDIM_0;\n" " const %s idx1 = LDIM_1 * GID_1 + LID_1;\n" " const %s numThreads1 = LDIM_1 * GDIM_1;\n" " %s i0, i1;\n", sz, sz, sz, sz, sz); strb_appends(&sb, " if (idx0 >= n0 || idx1 >= n1) return;\n"); strb_appendf(&sb, " r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n" " ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n", gpuarray_get_type(a->typecode)->cluda_name, gpuarray_get_type(ind->typecode)->cluda_name); strb_appendf(&sb, " for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n" " %s ii0 = ind[i0];\n" " %s pos0 = v_off;\n" " if (ii0 < 0) ii0 += d0;\n" " if ((ii0 < 0) || (ii0 >= (%s)d0)) {\n" " *err = -1;\n" " continue;\n" " }\n" " pos0 += ii0 * (%s)s0;\n" " for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n" " %s p = pos0;\n", ssz, sz, ssz, sz, sz); if (v->nd > 1) { strb_appendf(&sb, " %s pos, ii = i1;\n", sz); for (i2 = v->nd; i2 > 1; i2--) { i = i2 - 1; if (i > 1) strb_appendf(&sb, " pos = ii %% (%s)d%u;\n" " ii /= (%s)d%u;\n", sz, i, sz, i); else strb_appends(&sb, " pos = ii;\n"); strb_appendf(&sb, " p += pos * (%s)s%u;\n", ssz, i); } } strb_appendf(&sb, " r[i0*((%s)n1) + i1] = *((GLOBAL_MEM %s *)(((GLOBAL_MEM char *)v) + p));\n", sz, gpuarray_get_type(v->typecode)->cluda_name); strb_appends(&sb, " }\n" " }\n" "}\n"); if (strb_error(&sb)) { res = error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); goto bail; } flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1); res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "take1", nargs, atypes, flags, err_str); bail: free(atypes); strb_clear(&sb); return res; } int GpuArray_take1(GpuArray *a, const GpuArray *v, const 
GpuArray *i, int check_error) { gpucontext *ctx = GpuArray_context(a); size_t n[2], ls[2] = {0, 0}, gs[2] = {0, 0}; size_t pl; gpudata *errbuf; #if DEBUG char *errstr = NULL; #endif GpuKernel k; unsigned int j; unsigned int argp; int err, kerr = 0; int addr32 = 0; if (!GpuArray_ISWRITEABLE(a)) return error_set(ctx->err, GA_VALUE_ERROR, "Destination array not writeable"); if (!GpuArray_ISALIGNED(a) || !GpuArray_ISALIGNED(v) || !GpuArray_ISALIGNED(i)) return error_fmt(ctx->err, GA_UNALIGNED_ERROR, "Not all arrays are aligned: a (%d), b (%d), i (%d)", GpuArray_ISALIGNED(a), GpuArray_ISALIGNED(v), GpuArray_ISALIGNED(i)); /* a and i have to be C contiguous */ if (!GpuArray_IS_C_CONTIGUOUS(a)) return error_set(ctx->err, GA_INVALID_ERROR, "Destination array (a) not C-contiguous"); if (!GpuArray_IS_C_CONTIGUOUS(i)) return error_set(ctx->err, GA_INVALID_ERROR, "Index array (i) not C-contiguous"); /* Check that the dimensions match namely a[0] == i[0] and a[>0] == v[>0] */ if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd) return error_fmt(ctx->err, GA_INVALID_ERROR, "Dimension mismatch. " "v->nd = %llu, a->nd = %llu, i->nd = %llu", v->nd, a->nd, i->nd); if (a->dimensions[0] != i->dimensions[0]) return error_fmt(ctx->err, GA_INVALID_ERROR, "Dimension mismatch. " "a->dimensions[0] = %llu, i->dimensions[0] = %llu", a->dimensions[0], i->dimensions[0]); n[0] = i->dimensions[0]; n[1] = 1; for (j = 1; j < v->nd; j++) { if (a->dimensions[j] != v->dimensions[j]) return error_fmt(ctx->err, GA_INVALID_ERROR, "Dimension mismatch. " "a->dimensions[%llu] = %llu, i->dimensions[%llu] = %llu", j, a->dimensions[j], j, i->dimensions[j]); n[1] *= v->dimensions[j]; } if (n[0] * n[1] < SADDR32_MAX) { addr32 = 1; } err = gpudata_property(v->data, GA_CTX_PROP_ERRBUF, &errbuf); if (err != GA_NO_ERROR) return err; err = gen_take1_kernel(&k, ctx, #if DEBUG &errstr, #else NULL, #endif a, v, i, addr32); #if DEBUG if (errstr != NULL) { fprintf(stderr, "%s\n", errstr); free(errstr); } #endif if (err != GA_NO_ERROR) return err; err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]); if (err != GA_NO_ERROR) goto out; /* This may not be the best scheduling, but it's good enough */ err = gpukernel_property(k.k, GA_KERNEL_PROP_PREFLSIZE, &pl); ls[0] = ls[1] / pl; ls[1] = pl; if (n[1] > n[0]) { pl = ls[0]; ls[0] = ls[1]; ls[1] = pl; gs[0] = 1; } else { gs[0] = gs[1]; gs[1] = 1; } argp = 0; GpuKernel_setarg(&k, argp++, a->data); GpuKernel_setarg(&k, argp++, (void *)&a->offset); GpuKernel_setarg(&k, argp++, v->data); /* The cast is to avoid a warning about const */ GpuKernel_setarg(&k, argp++, (void *)&v->offset); for (j = 0; j < v->nd; j++) { GpuKernel_setarg(&k, argp++, &v->strides[j]); GpuKernel_setarg(&k, argp++, &v->dimensions[j]); } GpuKernel_setarg(&k, argp++, i->data); GpuKernel_setarg(&k, argp++, (void *)&i->offset); GpuKernel_setarg(&k, argp++, &n[0]); GpuKernel_setarg(&k, argp++, &n[1]); GpuKernel_setarg(&k, argp++, errbuf); err = GpuKernel_call(&k, 2, gs, ls, 0, NULL); if (check_error && err == GA_NO_ERROR) { err = gpudata_read(&kerr, errbuf, 0, sizeof(int)); if (err == GA_NO_ERROR && kerr != 0) { err = error_set(ctx->err, GA_VALUE_ERROR, "Index out of bounds"); kerr = 0; /* We suppose this will not fail */ gpudata_write(errbuf, 0, &kerr, sizeof(int)); } } out: GpuKernel_clear(&k); return err; } int GpuArray_setarray(GpuArray *a, const GpuArray *v) { gpucontext *ctx = GpuArray_context(a); GpuArray tv; size_t sz; ssize_t *strs; unsigned int i, off; int err = GA_NO_ERROR; int simple_move = 1; if (a->nd < v->nd) return 
error_fmt(ctx->err, GA_VALUE_ERROR, "Dimension error. " "a->nd = %llu, v->nd = %llu", a->nd, v->nd); if (!GpuArray_ISWRITEABLE(a)) return error_set(ctx->err, GA_VALUE_ERROR, "Destination array not writable"); if (!GpuArray_ISALIGNED(v) || !GpuArray_ISALIGNED(a)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "One of the inputs is unaligned"); off = a->nd - v->nd; for (i = 0; i < v->nd; i++) { if (v->dimensions[i] != a->dimensions[i+off]) { if (v->dimensions[i] != 1) return error_fmt(ctx->err, GA_VALUE_ERROR, "Shape error. " "v->dimensions[%u] = %llu, a->dimesions[%u + %u] = %llu", i, v->dimensions[i], i, off, a->dimensions[i + off]); else simple_move = 0; } } if (simple_move && GpuArray_ISONESEGMENT(a) && GpuArray_ISONESEGMENT(v) && GpuArray_ISFORTRAN(a) == GpuArray_ISFORTRAN(v) && a->typecode == v->typecode && a->nd == v->nd) { sz = gpuarray_get_elsize(a->typecode); for (i = 0; i < a->nd; i++) sz *= a->dimensions[i]; return gpudata_move(a->data, a->offset, v->data, v->offset, sz); } strs = calloc(a->nd, sizeof(ssize_t)); if (strs == NULL) return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); for (i = off; i < a->nd; i++) { if (v->dimensions[i-off] == a->dimensions[i]) { strs[i] = v->strides[i-off]; } } memcpy(&tv, v, sizeof(GpuArray)); tv.nd = a->nd; tv.dimensions = a->dimensions; tv.strides = strs; if (tv.nd != 0) GpuArray_fix_flags(&tv); err = ga_extcopy(a, &tv); free(strs); return err; } int GpuArray_reshape(GpuArray *res, const GpuArray *a, unsigned int nd, const size_t *newdims, ga_order ord, int nocopy) { int err; err = GpuArray_view(res, a); if (err != GA_NO_ERROR) return err; err = GpuArray_reshape_inplace(res, nd, newdims, ord); if (err == GA_COPY_ERROR && !nocopy) { GpuArray_clear(res); err = GpuArray_copy(res, a, ord); if (err != GA_NO_ERROR) return err; err = GpuArray_reshape_inplace(res, nd, newdims, ord); } if (err != GA_NO_ERROR) GpuArray_clear(res); return err; } int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, const size_t *newdims, ga_order ord) { gpucontext *ctx = GpuArray_context(a); ssize_t *newstrides; size_t *tmpdims; size_t np; size_t op; size_t newsize = 1; size_t oldsize = 1; unsigned int ni = 0; unsigned int oi = 0; unsigned int nj = 1; unsigned int oj = 1; unsigned int nk; unsigned int ok; unsigned int i; if (ord == GA_ANY_ORDER && GpuArray_ISFORTRAN(a) && a->nd > 1) ord = GA_F_ORDER; for (i = 0; i < a->nd; i++) { oldsize *= a->dimensions[i]; } for (i = 0; i < nd; i++) { size_t d = newdims[i]; /* Check for overflow */ if ((d >= MUL_NO_OVERFLOW || newsize >= MUL_NO_OVERFLOW) && d > 0 && SIZE_MAX / d < newsize) return error_set(ctx->err, GA_XLARGE_ERROR, "Output array size greater than addressable space"); newsize *= d; } if (newsize != oldsize) return error_set(ctx->err, GA_INVALID_ERROR, "New shape differs in total size"); /* If the source and desired layouts are the same, then just copy strides and dimensions */ if ((ord == GA_C_ORDER && GpuArray_CHKFLAGS(a, GA_C_CONTIGUOUS)) || (ord == GA_F_ORDER && GpuArray_CHKFLAGS(a, GA_F_CONTIGUOUS))) { goto do_final_copy; } newstrides = calloc(nd, sizeof(ssize_t)); if (newstrides == NULL) return error_sys(ctx->err, "calloc"); if (newsize != 0) { while (ni < nd && oi < a->nd) { np = newdims[ni]; op = a->dimensions[oi]; while (np != op) { if (np < op) { np *= newdims[nj++]; } else { op *= a->dimensions[oj++]; } } for (ok = oi; ok < oj - 1; ok++) { if (ord == GA_F_ORDER) { if (a->strides[ok+1] != (ssize_t)a->dimensions[ok]*a->strides[ok]) goto need_copy; } else { if (a->strides[ok] != 
(ssize_t)a->dimensions[ok+1]*a->strides[ok+1]) goto need_copy; } } if (ord == GA_F_ORDER) { newstrides[ni] = a->strides[oi]; for (nk = ni + 1; nk < nj; nk++) { newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1]; } } else { newstrides[nj-1] = a->strides[oj-1]; for (nk = nj-1; nk > ni; nk--) { newstrides[nk-1] = newstrides[nk]*newdims[nk]; } } ni = nj++; oi = oj++; } } /* Fixup trailing ones */ if (ord == GA_F_ORDER) { for (i = nj-1; i < nd; i++) { newstrides[i] = newstrides[i-1] * newdims[i-1]; } } else { for (i = nj-1; i < nd; i++) { newstrides[i] = gpuarray_get_elsize(a->typecode); } } /* We can reuse newstrides since it was allocated in this function. Can't do the same with newdims (which is a parameter). */ tmpdims = calloc(nd, sizeof(size_t)); if (tmpdims == NULL) { return error_sys(ctx->err, "calloc"); } memcpy(tmpdims, newdims, nd*sizeof(size_t)); a->nd = nd; free(a->dimensions); free(a->strides); a->dimensions = tmpdims; a->strides = newstrides; goto fix_flags; need_copy: free(newstrides); return error_set(ctx->err, GA_COPY_ERROR, "Copy is needed but disallowed by parameters"); do_final_copy: tmpdims = calloc(nd, sizeof(size_t)); newstrides = calloc(nd, sizeof(ssize_t)); if (tmpdims == NULL || newstrides == NULL) { free(tmpdims); free(newstrides); return error_sys(ctx->err, "calloc"); } memcpy(tmpdims, newdims, nd*sizeof(size_t)); if (nd > 0) { if (ord == GA_F_ORDER) { newstrides[0] = gpuarray_get_elsize(a->typecode); for (i = 1; i < nd; i++) { newstrides[i] = newstrides[i-1] * tmpdims[i-1]; } } else { newstrides[nd-1] = gpuarray_get_elsize(a->typecode); for (i = nd-1; i > 0; i--) { newstrides[i-1] = newstrides[i] * tmpdims[i]; } } } free(a->dimensions); free(a->strides); a->nd = nd; a->dimensions = tmpdims; a->strides = newstrides; fix_flags: GpuArray_fix_flags(a); return GA_NO_ERROR; } int GpuArray_transpose(GpuArray *res, const GpuArray *a, const unsigned int *new_axes) { int err; err = GpuArray_view(res, a); if (err != GA_NO_ERROR) return err; err = GpuArray_transpose_inplace(res, new_axes); if (err != GA_NO_ERROR) GpuArray_clear(res); return err; } int GpuArray_transpose_inplace(GpuArray *a, const unsigned int *new_axes) { gpucontext *ctx = GpuArray_context(a); size_t *newdims; ssize_t *newstrs; unsigned int i; unsigned int j; unsigned int k; newdims = calloc(a->nd, sizeof(size_t)); newstrs = calloc(a->nd, sizeof(ssize_t)); if (newdims == NULL || newstrs == NULL) { free(newdims); free(newstrs); return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } for (i = 0; i < a->nd; i++) { if (new_axes == NULL) { j = a->nd - i - 1; } else { j = new_axes[i]; // Repeated axes will lead to a broken output for (k = 0; k < i; k++) if (j == new_axes[k]) { free(newdims); free(newstrs); return error_fmt(ctx->err, GA_VALUE_ERROR, "Repeated axes in transpose: new_axes[%u] == new_axes[%u] == %u", i, k, j); } } newdims[i] = a->dimensions[j]; newstrs[i] = a->strides[j]; } free(a->dimensions); free(a->strides); a->dimensions = newdims; a->strides = newstrs; GpuArray_fix_flags(a); return GA_NO_ERROR; } void GpuArray_clear(GpuArray *a) { if (a->data) gpudata_release(a->data); free(a->dimensions); free(a->strides); memset(a, 0, sizeof(*a)); } int GpuArray_share(const GpuArray *a, const GpuArray *b) { if (a->data != b->data) return 0; /* XXX: redefine buffer_share to mean: is it possible to share? 
and use offset to make sure */ return gpudata_share(a->data, b->data, NULL); } gpucontext *GpuArray_context(const GpuArray *a) { return gpudata_context(a->data); } int GpuArray_move(GpuArray *dst, const GpuArray *src) { gpucontext *ctx = GpuArray_context(dst); size_t sz; unsigned int i; if (!GpuArray_ISWRITEABLE(dst)) return error_set(ctx->err, GA_VALUE_ERROR, "Destination array (dst) not writeable"); if (!GpuArray_ISALIGNED(src)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Source array (src) not aligned"); if (!GpuArray_ISALIGNED(dst)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Destination array (dst) not aligned"); if (src->nd != dst->nd) return error_fmt(ctx->err, GA_VALUE_ERROR, "Dimension mismatch. src->nd = %llu, dst->nd = %llu", src->nd, dst->nd); for (i = 0; i < src->nd; i++) { if (src->dimensions[i] != dst->dimensions[i]) return error_fmt(ctx->err, GA_VALUE_ERROR, "Dimension mismatch. src->dimensions[%u] = %llu, dst->dimensions[%u] = %llu", i, src->dimensions[i], i, dst->dimensions[i]); } if (!GpuArray_ISONESEGMENT(dst) || !GpuArray_ISONESEGMENT(src) || GpuArray_ISFORTRAN(dst) != GpuArray_ISFORTRAN(src) || dst->typecode != src->typecode) { return ga_extcopy(dst, src); } sz = gpuarray_get_elsize(dst->typecode); for (i = 0; i < dst->nd; i++) sz *= dst->dimensions[i]; return gpudata_move(dst->data, dst->offset, src->data, src->offset, sz); } int GpuArray_write(GpuArray *dst, const void *src, size_t src_sz) { gpucontext *ctx = GpuArray_context(dst); if (!GpuArray_ISWRITEABLE(dst)) return error_set(ctx->err, GA_VALUE_ERROR, "Destination array (dst) not writeable"); if (!GpuArray_ISONESEGMENT(dst)) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Destination array (dst) not one segment"); return gpudata_write(dst->data, dst->offset, src, src_sz); } int GpuArray_read(void *dst, size_t dst_sz, const GpuArray *src) { gpucontext *ctx = GpuArray_context(src); if (!GpuArray_ISONESEGMENT(src)) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (src) not one segment"); return gpudata_read(dst, src->data, src->offset, dst_sz); } int GpuArray_memset(GpuArray *a, int data) { gpucontext *ctx = GpuArray_context(a); if (!GpuArray_ISONESEGMENT(a)) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (a) not one segment"); return gpudata_memset(a->data, a->offset, data); } int GpuArray_copy(GpuArray *res, const GpuArray *a, ga_order order) { int err; err = GpuArray_empty(res, GpuArray_context(a), a->typecode, a->nd, a->dimensions, order); if (err != GA_NO_ERROR) return err; err = GpuArray_move(res, a); if (err != GA_NO_ERROR) GpuArray_clear(res); return err; } int GpuArray_transfer(GpuArray *res, const GpuArray *a) { gpucontext *ctx = GpuArray_context(res); size_t sz; unsigned int i; if (!GpuArray_ISONESEGMENT(res)) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (res) not one segment"); if (!GpuArray_ISONESEGMENT(a)) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (a) not one segment"); if (res->typecode != a->typecode) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "typecode mismatch"); sz = gpuarray_get_elsize(a->typecode); for (i = 0; i < a->nd; i++) sz *= a->dimensions[i]; return gpudata_transfer(res->data, res->offset, a->data, a->offset, sz); } int GpuArray_split(GpuArray **rs, const GpuArray *a, size_t n, size_t *p, unsigned int axis) { gpucontext *ctx = GpuArray_context(a); size_t i; ssize_t *starts, *stops, *steps; int err; starts = calloc(a->nd, sizeof(ssize_t)); stops = calloc(a->nd, sizeof(ssize_t)); steps = calloc(a->nd, sizeof(ssize_t)); if (starts == 
NULL || stops == NULL || steps == NULL) { free(starts); free(stops); free(steps); return error_sys(ctx->err, "calloc"); } for (i = 0; i < a->nd; i++) { starts[i] = 0; stops[i] = a->dimensions[i]; steps[i] = 1; } for (i = 0; i <= n; i++) { if (i > 0) starts[axis] = p[i-1]; else starts[axis] = 0; if (i < n) stops[axis] = p[i]; else stops[axis] = a->dimensions[axis]; err = GpuArray_index(rs[i], a, starts, stops, steps); if (err != GA_NO_ERROR) break; } free(starts); free(stops); free(steps); if (err != GA_NO_ERROR) { size_t ii; for (ii = 0; ii < i; ii++) GpuArray_clear(rs[ii]); } return err; } int GpuArray_concatenate(GpuArray *r, const GpuArray **as, size_t n, unsigned int axis, int restype) { gpucontext *ctx = GpuArray_context(as[0]); size_t *dims, *res_dims; size_t i, res_off; unsigned int p; int res_flags; int err = GA_NO_ERROR; if (axis >= as[0]->nd) return error_fmt(ctx->err, GA_VALUE_ERROR, "Invalid axis. " "axis = %u, as[0]->nd = %llu", axis, as[0]->nd); dims = calloc(as[0]->nd, sizeof(size_t)); if (dims == NULL) return error_fmt(ctx->err, GA_MEMORY_ERROR, "Out of memory"); for (p = 0; p < as[0]->nd; p++) { dims[p] = as[0]->dimensions[p]; } if (!GpuArray_ISALIGNED(as[0])) { err = error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned array (as[0])."); goto afterloop; } for (i = 1; i < n; i++) { if (!GpuArray_ISALIGNED(as[i])) { err = error_fmt(ctx->err, GA_UNALIGNED_ERROR, "Unaligned array (as[%llu]).", i); goto afterloop; } if (as[i]->nd != as[0]->nd) { err = error_fmt(ctx->err, GA_VALUE_ERROR, "Shape mismatch. " "as[%llu]->nd = %llu, as[0]->nd = %llu", i, as[i]->nd, as[0]->nd); goto afterloop; } for (p = 0; p < as[0]->nd; p++) { if (p != axis && dims[p] != as[i]->dimensions[p]) { err = error_fmt(ctx->err, GA_VALUE_ERROR, "Dimension mismatch. " "as[%llu]->dimensions[%u] = %llu, as[0]->dimensions[%u] = %llu", i, p, as[i]->dimensions[p], p, dims[p]); goto afterloop; } else if (p == axis) { dims[p] += as[i]->dimensions[p]; } } } afterloop: if (err != GA_NO_ERROR) { free(dims); return err; } err = GpuArray_empty(r, GpuArray_context(as[0]), restype, as[0]->nd, dims, GA_ANY_ORDER); free(dims); if (err != GA_NO_ERROR) { return err; } res_off = r->offset; res_dims = r->dimensions; res_flags = r->flags; for (i = 0; i < n; i++) { r->dimensions = as[i]->dimensions; GpuArray_fix_flags(r); err = ga_extcopy(r, as[i]); if (err != GA_NO_ERROR) { r->dimensions = res_dims; goto fail; } r->offset += r->strides[axis] * as[i]->dimensions[axis]; } r->offset = res_off; r->dimensions = res_dims; r->flags = res_flags; return GA_NO_ERROR; fail: GpuArray_clear(r); return err; } const char *GpuArray_error(const GpuArray *a, int err) { return gpucontext_error(gpudata_context(a->data), err); } void GpuArray_fprintf(FILE *fd, const GpuArray *a) { unsigned int i; int comma = 0; fprintf(fd, "GpuArray <%p, data: %p (%p)> nd=%d\n", a, a->data, *((void **)a->data), a->nd); fprintf(fd, "\tdims: %p, str: %p\n", a->dimensions, a->strides); fprintf(fd, "\tITEMSIZE: %zd\n", GpuArray_ITEMSIZE(a)); fprintf(fd, "\tTYPECODE: %d\n", a->typecode); fprintf(fd, "\tOFFSET: %" SPREFIX "u\n", a->offset); fprintf(fd, "\tHOST_DIMS: "); for (i = 0; i < a->nd; ++i) { fprintf(fd, "%zu\t", a->dimensions[i]); } fprintf(fd, "\n\tHOST_STRIDES: "); for (i = 0; i < a->nd; ++i) { fprintf(fd, "%zd\t", a->strides[i]); } fprintf(fd, "\nFLAGS:"); #define PRINTFLAG(flag) if (a->flags & flag) { \ if (comma) fputc(',', fd); \ fprintf(fd, " " #flag); \ comma = 1; \ } PRINTFLAG(GA_C_CONTIGUOUS); if (!GpuArray_is_c_contiguous(a) && ISSET(a->flags, 
GA_C_CONTIGUOUS)) fputc('!', fd); PRINTFLAG(GA_F_CONTIGUOUS); if (!GpuArray_is_f_contiguous(a) && ISSET(a->flags, GA_F_CONTIGUOUS)) fputc('!', fd); PRINTFLAG(GA_ALIGNED); PRINTFLAG(GA_WRITEABLE); #undef PRINTFLAG fputc('\n', fd); } int GpuArray_fdump(FILE *fd, const GpuArray *a) { gpucontext *ctx = GpuArray_context(a); char *buf, *p; size_t s = GpuArray_ITEMSIZE(a); size_t k; unsigned int i; int err; for (i = 0; i < a->nd; i++) s *= a->dimensions[i]; buf = malloc(s); if (buf == NULL) return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); err = GpuArray_read(buf, s, a); if (err != GA_NO_ERROR) { free(buf); return err; } p = buf; k = 0; while (s) { fprintf(fd, "[%" SPREFIX "u] = ", k); switch (a->typecode) { case GA_UINT: fprintf(fd, "%u", *(unsigned int *)p); break; case GA_LONG: fprintf(fd, "%lld", (long long)*(int64_t *)p); break; case GA_FLOAT: fprintf(fd, "%f", *(float *)p); break; case GA_SSIZE: fprintf(fd, "%" SPREFIX "d", *(ssize_t *)p); break; default: free(buf); fprintf(fd, "\n", a->typecode); return error_fmt(ctx->err, GA_UNSUPPORTED_ERROR, "Unsupported data type for dump: %d", a->typecode); } s -= gpuarray_get_elsize(a->typecode); p += gpuarray_get_elsize(a->typecode); k++; fprintf(fd, "\n"); } free(buf); return GA_NO_ERROR; } int GpuArray_is_c_contiguous(const GpuArray *a) { size_t size = GpuArray_ITEMSIZE(a); int i; for (i = a->nd - 1; i >= 0; i--) { if (a->strides[i] != (ssize_t)size && a->dimensions[i] != 1) return 0; // We suppose that overflow will not happen since data has to fit in memory size *= a->dimensions[i]; } return 1; } int GpuArray_is_f_contiguous(const GpuArray *a) { size_t size = GpuArray_ITEMSIZE(a); unsigned int i; for (i = 0; i < a->nd; i++) { if (a->strides[i] != (ssize_t)size && a->dimensions[i] != 1) return 0; // We suppose that overflow will not happen since data has to fit in memory size *= a->dimensions[i]; } return 1; } int GpuArray_is_aligned(const GpuArray *a) { size_t align = gpuarray_get_type(a->typecode)->align; unsigned int i; if (a->offset % align != 0) return 0; for (i = 0; i < a->nd; i++) { if (a->strides[i] % align != 0) return 0; } return 1; } libgpuarray-0.7.6/src/gpuarray_array_blas.c000066400000000000000000000545151326743622600210560ustar00rootroot00000000000000#include #include "gpuarray/blas.h" #include "gpuarray/buffer_blas.h" #include "gpuarray/types.h" #include "gpuarray/util.h" #include "private.h" #include "util/error.h" int GpuArray_rdot(GpuArray *X, GpuArray *Y, GpuArray *Z, int nocopy) { GpuArray *Xp = X; GpuArray copyX; GpuArray *Yp = Y; GpuArray copyY; GpuArray *Zp = Z; size_t n; gpucontext *ctx = gpudata_context(Xp->data); size_t elsize; int err; if (X->typecode != GA_HALF && X->typecode != GA_FLOAT && X->typecode != GA_DOUBLE) return error_set(ctx->err, GA_INVALID_ERROR, "Data type not supported"); if (X->nd != 1 || Y->nd != 1 || Z->nd != 0) return error_fmt(ctx->err, GA_VALUE_ERROR, "Wrong number of dimensions: X->nd = %u (expected 1), Y->nd = %u (expected 1), Z->nd = %u (expected 0)", X->nd, Y->nd, Z->nd); if (X->typecode != Y->typecode || X->typecode != Z->typecode) error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes"); n = X->dimensions[0]; if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || !(Z->flags & GA_ALIGNED)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "One of the inputs is unaligned"); if (X->dimensions[0] != Y->dimensions[0]) return error_fmt(ctx->err, GA_VALUE_ERROR, "Shape mismatch: X->dimensions[0] = %d != Y->dimensions[0] = %d", X->dimensions[0], Y->dimensions[0]); elsize = 
gpuarray_get_elsize(X->typecode); if (X->strides[0] < 0) { if (nocopy) return error_set(ctx->err, GA_COPY_ERROR, "Copy required for X"); else { err = GpuArray_copy(©X, X, GA_ANY_ORDER); if (err != GA_NO_ERROR) goto cleanup; Xp = ©X; } } if (Y->strides[0] < 0) { if (nocopy) return error_set(ctx->err, GA_COPY_ERROR, "Copy required for Y"); else { err = GpuArray_copy(©Y, Y, GA_ANY_ORDER); if (err != GA_NO_ERROR) goto cleanup; Yp = ©Y; } } err = gpublas_setup(ctx); if (err != GA_NO_ERROR) goto cleanup; switch (Xp->typecode) { case GA_HALF: err = gpublas_hdot( n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, Zp->data, Zp->offset / elsize); break; case GA_FLOAT: err = gpublas_sdot( n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, Zp->data, Zp->offset / elsize); break; case GA_DOUBLE: err = gpublas_ddot( n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, Zp->data, Zp->offset / elsize); break; } cleanup: if (Xp == ©X) GpuArray_clear(©X); if (Yp == ©Y) GpuArray_clear(©Y); return err; } int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, double beta, GpuArray *Y, int nocopy) { GpuArray *Ap = A; GpuArray copyA; GpuArray *Xp = X; GpuArray copyX; GpuArray *Yp = Y; gpucontext *ctx = gpudata_context(Ap->data); size_t elsize; size_t m, n, lda; cb_order o; int err; if (A->typecode != GA_HALF && A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype"); if (A->nd != 2 || X->nd != 1 || Y->nd != 1) return error_fmt(ctx->err, GA_VALUE_ERROR, "Wrong number of dimensions: A->nd = %u (expected 2), X->nd = %u (expected 1), Y->nd = %u (expected 1)", A->nd, X->nd, Y->nd); if (X->typecode != A->typecode || Y->typecode != A->typecode) return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes"); if (!(A->flags & GA_ALIGNED) || !(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs"); if (transA == cb_no_trans) { m = A->dimensions[0]; n = A->dimensions[1]; } else { m = A->dimensions[1]; n = A->dimensions[0]; } if (Y->dimensions[0] != m || X->dimensions[0] != n) return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent shapes"); m = A->dimensions[0]; n = A->dimensions[1]; elsize = gpuarray_get_elsize(A->typecode); if (!GpuArray_ISONESEGMENT(A)) { if (nocopy) return error_set(ctx->err, GA_COPY_ERROR, "Copy required for A"); else { err = GpuArray_copy(©A, A, GA_F_ORDER); if (err != GA_NO_ERROR) goto cleanup; Ap = ©A; } } if (X->strides[0] < 0) { if (nocopy) return error_set(ctx->err, GA_COPY_ERROR, "Copy required for X"); else { err = GpuArray_copy(©X, X, GA_ANY_ORDER); if (err != GA_NO_ERROR) goto cleanup; Xp = ©X; } } if (Y->strides[0] < 0) { err = error_set(ctx->err, GA_VALUE_ERROR, "Negative strides for Y"); goto cleanup; } if (Ap->flags & GA_F_CONTIGUOUS) { o = cb_fortran; lda = Ap->dimensions[0]; } else if (Ap->flags & GA_C_CONTIGUOUS) { o = cb_c; lda = Ap->dimensions[1]; } else { /* Might be worth looking at making degenerate matrices (1xn) work here. 
*/ err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A"); goto cleanup; } err = gpublas_setup(ctx); if (err != GA_NO_ERROR) goto cleanup; switch (Ap->typecode) { case GA_HALF: err = gpublas_hgemv(o, transA, m, n, (float)alpha, Ap->data, Ap->offset / elsize, lda, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, (float)beta, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize); break; case GA_FLOAT: err = gpublas_sgemv(o, transA, m, n, (float)alpha, Ap->data, Ap->offset / elsize, lda, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, (float)beta, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize); break; case GA_DOUBLE: err = gpublas_dgemv(o, transA, m, n, (double)alpha, Ap->data, Ap->offset / elsize, lda, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, (double)beta, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize); break; } cleanup: if (Ap == ©A) GpuArray_clear(©A); if (Xp == ©X) GpuArray_clear(©X); return err; } int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuArray *A, GpuArray *B, double beta, GpuArray *C, int nocopy) { GpuArray *Ap = A; GpuArray copyA; GpuArray *Bp = B; GpuArray copyB; GpuArray *Cp = C; gpucontext *ctx = gpudata_context(Ap->data); size_t elsize; size_t m, n, k, lda, ldb, ldc; cb_order o; int err; if (A->typecode != GA_HALF && A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype"); if (A->nd != 2 || B->nd != 2 || C->nd != 2) return error_fmt(ctx->err, GA_VALUE_ERROR, "Wrong number of dimensions: A->nd = %u (expected 2), B->nd = %u (expected 2), C->nd = %u (expected 2)", A->nd, B->nd, C->nd); if (B->typecode != A->typecode || C->typecode != A->typecode) return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes"); if (!(A->flags & GA_ALIGNED) || !(B->flags & GA_ALIGNED) || !(C->flags & GA_ALIGNED)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs"); if (transA == cb_no_trans) { m = A->dimensions[0]; k = A->dimensions[1]; } else { m = A->dimensions[1]; k = A->dimensions[0]; } if (transB == cb_no_trans) { n = B->dimensions[1]; if (B->dimensions[0] != k) return error_set(ctx->err, GA_VALUE_ERROR, "mismatched shapes"); } else { n = B->dimensions[0]; if (B->dimensions[1] != k) return error_set(ctx->err, GA_VALUE_ERROR, "mismatched shapes"); } if (C->dimensions[0] != m || C->dimensions[1] != n) return error_set(ctx->err, GA_VALUE_ERROR, "mismatched shapes"); elsize = gpuarray_get_elsize(A->typecode); if (!GpuArray_ISONESEGMENT(A)) { if (nocopy) return error_set(ctx->err, GA_COPY_ERROR, "Need copy for A"); else { err = GpuArray_copy(©A, A, GA_F_ORDER); if (err != GA_NO_ERROR) goto cleanup; Ap = ©A; } } if (!GpuArray_ISONESEGMENT(B)) { if (nocopy) return error_set(ctx->err, GA_COPY_ERROR, "Need copy for B"); else { err = GpuArray_copy(©B, B, GA_F_ORDER); if (err != GA_NO_ERROR) goto cleanup; Bp = ©B; } } if (!GpuArray_ISONESEGMENT(C)) { err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous C"); goto cleanup; } if (Cp->flags & GA_F_CONTIGUOUS) { o = cb_fortran; ldc = Cp->dimensions[0]; } else if (Cp->flags & GA_C_CONTIGUOUS) { o = cb_c; ldc = Cp->dimensions[1]; } else { err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous C"); goto cleanup; } if (Ap->flags & GA_F_CONTIGUOUS) { lda = Ap->dimensions[0]; if (o == cb_c) { if (transA == cb_no_trans) transA = cb_trans; else transA = cb_no_trans; } } else if (Ap->flags & GA_C_CONTIGUOUS) { lda = Ap->dimensions[1]; if (o == cb_fortran) { if (transA == cb_no_trans) transA = 
cb_trans; else transA = cb_no_trans; } } else { err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A"); goto cleanup; } if (Bp->flags & GA_F_CONTIGUOUS) { ldb = Bp->dimensions[0]; if (o == cb_c) { if (transB == cb_no_trans) transB = cb_trans; else transB = cb_no_trans; } } else if (Bp->flags & GA_C_CONTIGUOUS) { ldb = Bp->dimensions[1]; if (o == cb_fortran) { if (transB == cb_no_trans) transB = cb_trans; else transB = cb_no_trans; } } else { err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous B"); goto cleanup; } ctx = gpudata_context(Ap->data); err = gpublas_setup(ctx); if (err != GA_NO_ERROR) goto cleanup; switch (Ap->typecode) { case GA_HALF: err = gpublas_hgemm(o, transA, transB, m, n, k, (float)alpha, Ap->data, Ap->offset / elsize, lda, Bp->data, Bp->offset / elsize, ldb, (float)beta, Cp->data, Cp->offset / elsize, ldc); break; case GA_FLOAT: err = gpublas_sgemm(o, transA, transB, m, n, k, (float)alpha, Ap->data, Ap->offset / elsize, lda, Bp->data, Bp->offset / elsize, ldb, (float)beta, Cp->data, Cp->offset / elsize, ldc); break; case GA_DOUBLE: err = gpublas_dgemm(o, transA, transB, m, n, k, (double)alpha, Ap->data, Ap->offset / elsize, lda, Bp->data, Bp->offset / elsize, ldb, (double)beta, Cp->data, Cp->offset / elsize, ldc); break; } cleanup: if (Ap == ©A) GpuArray_clear(©A); if (Bp == ©B) GpuArray_clear(©B); return err; } int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, int nocopy) { GpuArray *Xp = X; GpuArray copyX; GpuArray *Yp = Y; GpuArray copyY; GpuArray *Ap = A; gpucontext *ctx = gpudata_context(Xp->data); size_t elsize; size_t m, n, lda; cb_order o; int err; if (X->typecode != GA_HALF && X->typecode != GA_FLOAT && X->typecode != GA_DOUBLE) return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype"); if (X->nd != 1 || Y->nd != 1 || A->nd != 2) return error_fmt(ctx->err, GA_VALUE_ERROR, "Wrong number of dimensions: X->nd = %u (expected 1), Y->nd = %u (expected 1), A->nd = %u (expected 2)", X->nd, Y->nd, A->nd); if (Y->typecode != X->typecode || A->typecode != X->typecode) return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes"); if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || !(A->flags & GA_ALIGNED)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs"); m = X->dimensions[0]; n = Y->dimensions[0]; if (A->dimensions[0] != m || A->dimensions[1] != n) return error_set(ctx->err, GA_VALUE_ERROR, "Incompatible shapes"); elsize = gpuarray_get_elsize(X->typecode); if (X->strides[0] < 0) { if (nocopy) return error_set(ctx->err, GA_COPY_ERROR, "Need copy for X"); else { err = GpuArray_copy(©X, X, GA_ANY_ORDER); if (err != GA_NO_ERROR) goto cleanup; Xp = ©X; } } if (Y->strides[0] < 0) { if (nocopy) return error_set(ctx->err, GA_COPY_ERROR, "Need copy for Y"); else { err = GpuArray_copy(©Y, Y, GA_ANY_ORDER); if (err != GA_NO_ERROR) goto cleanup; Yp = ©Y; } } if (!GpuArray_ISONESEGMENT(A)) { err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A"); goto cleanup; } if (Ap->flags & GA_F_CONTIGUOUS) { o = cb_fortran; lda = Ap->dimensions[0]; } else if (Ap->flags & GA_C_CONTIGUOUS) { o = cb_c; lda = Ap->dimensions[1]; } else { /* Might be worth looking at making degenerate matrices (1xn) work here. 
*/ err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A"); goto cleanup; } ctx = gpudata_context(Xp->data); err = gpublas_setup(ctx); if (err != GA_NO_ERROR) goto cleanup; switch(Xp->typecode) { case GA_HALF: err = gpublas_hger(o, m, n, (float)alpha, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, Ap->data, Ap->offset / elsize, lda); break; case GA_FLOAT: err = gpublas_sger(o, m, n, (float)alpha, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, Ap->data, Ap->offset / elsize, lda); break; case GA_DOUBLE: err = gpublas_dger(o, m, n, (double)alpha, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, Ap->data, Ap->offset / elsize, lda); break; } cleanup: if (Xp == ©X) GpuArray_clear(©X); if (Yp == ©Y) GpuArray_clear(©Y); return err; } static inline int is_last_2d_contiguous(const GpuArray *a) { ssize_t size = GpuArray_ITEMSIZE(a); if (GpuArray_IS_C_CONTIGUOUS(a)) return 1; // C contiguous if (a->strides[a->nd - 2] <= 0 || a->strides[a->nd - 1] <= 0) return 0; if (a->strides[a->nd - 2] == size) return 2; // F contiguous if (a->strides[a->nd - 1] == size) return 1; // C contiguous return 0; } int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alpha, GpuArray *A, GpuArray *B, double beta, GpuArray *C, int nocopy) { GpuArray *Ap = A; GpuArray copyA; GpuArray *Bp = B; GpuArray copyB; GpuArray *Cp = C; gpucontext *ctx = gpudata_context(A->data); size_t elsize; size_t batchCount, m, n, k, lda, ldb, ldc; cb_order o; int cA, cB, cC; int err; if (A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE && A->typecode != GA_HALF) return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype"); if (A->nd != 3 || B->nd != 3 || C->nd != 3) return error_fmt(ctx->err, GA_VALUE_ERROR, "Wrong number of dimensions: A->nd = %u (expected 3), B->nd = %u (expected 3), C->nd = %u (expected 3)", A->nd, B->nd, C->nd); if (B->typecode != A->typecode || C->typecode != A->typecode) return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes"); if (!(A->flags & GA_ALIGNED) || !(B->flags & GA_ALIGNED) || !(C->flags & GA_ALIGNED)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input"); batchCount = A->dimensions[0]; if (B->dimensions[0] != batchCount || C->dimensions[0] != batchCount) return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched first dimension"); if (transA == cb_no_trans) { m = A->dimensions[1]; k = A->dimensions[2]; } else { m = A->dimensions[2]; k = A->dimensions[1]; } if (transB == cb_no_trans) { n = B->dimensions[2]; if (B->dimensions[1] != k) return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched shape"); } else { n = B->dimensions[1]; if (B->dimensions[2] != k) return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched shape"); } if (C->dimensions[1] != m || C->dimensions[2] != n) return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched shape"); elsize = gpuarray_get_elsize(A->typecode); cA = is_last_2d_contiguous(A); if (!cA) { if (nocopy) return error_set(ctx->err, GA_COPY_ERROR, "Need copy for A"); else { err = GpuArray_copy(©A, A, GA_C_ORDER); cA = 1; if (err != GA_NO_ERROR) goto cleanup; Ap = ©A; } } cB = is_last_2d_contiguous(B); if (!cB) { if (nocopy) return error_set(ctx->err, GA_COPY_ERROR, "Need copy for B"); else { err = GpuArray_copy(©B, B, GA_C_ORDER); cB = 1; if (err != GA_NO_ERROR) goto cleanup; Bp = ©B; } } cC = is_last_2d_contiguous(C); if (!cC) { err = 
error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous last 2d C"); goto cleanup; } if (cC == 2) { o = cb_fortran; ldc = Cp->dimensions[2] > 1 ? Cp->strides[2] / elsize : Cp->dimensions[1]; } else if (cC == 1) { o = cb_c; ldc = Cp->dimensions[1] > 1 ? Cp->strides[1] / elsize : Cp->dimensions[2]; } else { err = error_set(ctx->err, GA_MISC_ERROR, "Invalid internal result for C"); goto cleanup; } if (cA == 2) { lda = Ap->dimensions[2] > 1 ? Ap->strides[2] / elsize : Ap->dimensions[1]; if (o == cb_c) { if (transA == cb_no_trans) transA = cb_trans; else transA = cb_no_trans; } } else if (cA == 1) { lda = Ap->dimensions[1] > 1 ? Ap->strides[1] / elsize : Ap->dimensions[2]; if (o == cb_fortran) { if (transA == cb_no_trans) transA = cb_trans; else transA = cb_no_trans; } } else { err = error_set(ctx->err, GA_MISC_ERROR, "Invalid internal result for A"); goto cleanup; } if (cB == 2) { ldb = Bp->dimensions[2] > 1 ? Bp->strides[2] / elsize : Bp->dimensions[1]; if (o == cb_c) { if (transB == cb_no_trans) transB = cb_trans; else transB = cb_no_trans; } } else if (cB == 1) { ldb = Bp->dimensions[1] > 1 ? Bp->strides[1] / elsize : Bp->dimensions[2]; if (o == cb_fortran) { if (transB == cb_no_trans) transB = cb_trans; else transB = cb_no_trans; } } else { err = error_set(ctx->err, GA_MISC_ERROR, "Invalid internal result for B"); goto cleanup; } ctx = gpudata_context(Ap->data); err = gpublas_setup(ctx); if (err != GA_NO_ERROR) goto cleanup; switch (C->typecode) { case GA_HALF: err = gpublas_hgemm3D(o, transA, transB, m, n, k, (float)alpha, Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize, Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize, (float)beta, Cp->data, Cp->offset/elsize, ldc, Cp->strides[0]/elsize, batchCount, 0); break; case GA_FLOAT: err = gpublas_sgemm3D(o, transA, transB, m, n, k, (float)alpha, Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize, Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize, (float)beta, Cp->data, Cp->offset/elsize, ldc, Cp->strides[0]/elsize, batchCount, 0); break; case GA_DOUBLE: err = gpublas_dgemm3D(o, transA, transB, m, n, k, (double)alpha, Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize, Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize, (double)beta, Cp->data, Cp->offset/elsize, ldc, Cp->strides[0]/elsize, batchCount, 0); break; } if (err == GA_DEVSUP_ERROR) { gpudata **A_datas = NULL, **B_datas = NULL, **C_datas = NULL; size_t *A_offsets = NULL, *B_offsets = NULL, *C_offsets = NULL; size_t i; A_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); B_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); C_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); A_offsets = (size_t*)malloc(batchCount * sizeof(size_t)); B_offsets = (size_t*)malloc(batchCount * sizeof(size_t)); C_offsets = (size_t*)malloc(batchCount * sizeof(size_t)); if (A_datas == NULL || B_datas == NULL || C_datas == NULL || A_offsets == NULL || B_offsets == NULL || C_offsets == NULL) { err = error_sys(ctx->err, "malloc"); goto old_cleanup; } for (i = 0; i < batchCount; i++) { A_datas[i] = Ap->data; B_datas[i] = Bp->data; C_datas[i] = Cp->data; A_offsets[i] = (Ap->offset + i * Ap->strides[0]) / elsize; B_offsets[i] = (Bp->offset + i * Bp->strides[0]) / elsize; C_offsets[i] = (Cp->offset + i * Cp->strides[0]) / elsize; } switch (C->typecode) { case GA_HALF: err = gpublas_hgemmBatch(o, transA, transB, m, n, k, (float)alpha, A_datas, A_offsets, lda, B_datas, B_offsets, ldb, (float)beta, C_datas, C_offsets, ldc, batchCount, 0); break; case GA_FLOAT: err = 
gpublas_sgemmBatch(o, transA, transB, m, n, k, (float)alpha, A_datas, A_offsets, lda, B_datas, B_offsets, ldb, (float)beta, C_datas, C_offsets, ldc, batchCount, 0); break; case GA_DOUBLE: err = gpublas_dgemmBatch(o, transA, transB, m, n, k, (double)alpha, A_datas, A_offsets, lda, B_datas, B_offsets, ldb, (double)beta, C_datas, C_offsets, ldc, batchCount, 0); break; } old_cleanup: free(A_datas); free(B_datas); free(C_datas); free(A_offsets); free(B_offsets); free(C_offsets); } cleanup: if (Ap == ©A) GpuArray_clear(©A); if (Bp == ©B) GpuArray_clear(©B); return err; } libgpuarray-0.7.6/src/gpuarray_array_collectives.c000066400000000000000000000106001326743622600224340ustar00rootroot00000000000000#include "gpuarray/array.h" #include "gpuarray/buffer_collectives.h" #include "gpuarray/collectives.h" #include "gpuarray/error.h" #include "private.h" /** * \brief Finds total number of elements contained in `array`. */ static inline size_t find_total_elems(const GpuArray* array) { unsigned int i; size_t total_elems = 1; for (i = 0; i < array->nd; ++i) total_elems *= array->dimensions[i]; return total_elems; } /** * \brief Checks if `src` and `dest` arrays are appropriate to participate in a * collective operation. * * Checks to see if they contain the appropriate number of elements, if they are * properly aligned (contiguous) and writeable (for `dest`) and if they contain * elements of the same datatype. It returns the number of elements of the array * with * the less length. */ static inline int check_gpuarrays(int times_src, const GpuArray* src, int times_dest, const GpuArray* dest, size_t* count) { gpucontext *ctx = gpudata_context(src->data); size_t count_src, count_dest; count_src = find_total_elems(src); count_dest = find_total_elems(dest); if (times_src * count_src != times_dest * count_dest) return error_set(ctx->err, GA_VALUE_ERROR, "Size mismatch for transfer"); if (src->typecode != dest->typecode) return error_set(ctx->err, GA_VALUE_ERROR, "Type mismatch"); if (!GpuArray_ISALIGNED(src) || !GpuArray_ISALIGNED(dest)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned arrays"); if (!GpuArray_ISWRITEABLE(dest)) return error_set(ctx->err, GA_INVALID_ERROR, "Unwritable destination"); if (times_src >= times_dest) *count = count_src; else *count = count_dest; return GA_NO_ERROR; } int GpuArray_reduce_from(const GpuArray* src, int opcode, int root, gpucomm* comm) { gpucontext *ctx = gpudata_context(src->data); size_t total_elems; if (!GpuArray_ISALIGNED(src)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input"); total_elems = find_total_elems(src); return gpucomm_reduce(src->data, src->offset, NULL, 0, total_elems, src->typecode, opcode, root, comm); } int GpuArray_reduce(const GpuArray* src, GpuArray* dest, int opcode, int root, gpucomm* comm) { int rank = 0; GA_CHECK(gpucomm_get_rank(comm, &rank)); if (rank == root) { size_t count = 0; GA_CHECK(check_gpuarrays(1, src, 1, dest, &count)); return gpucomm_reduce(src->data, src->offset, dest->data, dest->offset, count, src->typecode, opcode, root, comm); } else { return GpuArray_reduce_from(src, opcode, root, comm); } } int GpuArray_all_reduce(const GpuArray* src, GpuArray* dest, int opcode, gpucomm* comm) { size_t count = 0; GA_CHECK(check_gpuarrays(1, src, 1, dest, &count)); return gpucomm_all_reduce(src->data, src->offset, dest->data, dest->offset, count, src->typecode, opcode, comm); } int GpuArray_reduce_scatter(const GpuArray* src, GpuArray* dest, int opcode, gpucomm* comm) { size_t count = 0; int ndev = 0; 
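  /* The size check below requires src to hold ndev times as many elements
     as dest; the element count passed to the collective is the per-rank
     (dest) count. */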
GA_CHECK(gpucomm_get_count(comm, &ndev)); GA_CHECK(check_gpuarrays(1, src, ndev, dest, &count)); return gpucomm_reduce_scatter(src->data, src->offset, dest->data, dest->offset, count, src->typecode, opcode, comm); } int GpuArray_broadcast(GpuArray *array, int root, gpucomm *comm) { gpucontext *ctx = gpudata_context(array->data); size_t total_elems; int rank = 0; GA_CHECK(gpucomm_get_rank(comm, &rank)); if (rank == root) { if (!GpuArray_CHKFLAGS(array, GA_BEHAVED)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input"); } else { if (!GpuArray_ISALIGNED(array)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input"); } total_elems = find_total_elems(array); return gpucomm_broadcast(array->data, array->offset, total_elems, array->typecode, root, comm); } int GpuArray_all_gather(const GpuArray* src, GpuArray* dest, gpucomm* comm) { size_t count = 0; int ndev = 0; GA_CHECK(gpucomm_get_count(comm, &ndev)); GA_CHECK(check_gpuarrays(ndev, src, 1, dest, &count)); return gpucomm_all_gather(src->data, src->offset, dest->data, dest->offset, count, src->typecode, comm); } libgpuarray-0.7.6/src/gpuarray_blas_cuda_cublas.c000066400000000000000000001637231326743622600222070ustar00rootroot00000000000000#include "private.h" #include "private_cuda.h" #include "gpuarray/buffer_blas.h" #include "gpuarray/kernel.h" #include "gpuarray/error.h" #include #include "loaders/libcublas.h" extern const gpuarray_buffer_ops cuda_ops; static inline cublasOperation_t convT(cb_transpose trans) { switch (trans) { case cb_no_trans: return CUBLAS_OP_N; case cb_trans: return CUBLAS_OP_T; case cb_conj_trans: return CUBLAS_OP_C; default: return -1; } } static const char *estr(cublasStatus_t err) { switch (err) { case CUBLAS_STATUS_SUCCESS: return "(cublas) Operation completed successfully."; case CUBLAS_STATUS_NOT_INITIALIZED: return "(cublas) Library not initialized."; case CUBLAS_STATUS_ALLOC_FAILED: return "(cublas) GPU ressource allocation failed."; case CUBLAS_STATUS_INVALID_VALUE: return "(cublas) Invalid value."; case CUBLAS_STATUS_ARCH_MISMATCH: return "(cublas) Operation not supported by device."; case CUBLAS_STATUS_MAPPING_ERROR: return "(cublas) Mapping error."; case CUBLAS_STATUS_EXECUTION_FAILED: return "(cublas) Execution failed."; case CUBLAS_STATUS_INTERNAL_ERROR: return "(cublas) Internal error."; case CUBLAS_STATUS_NOT_SUPPORTED: return "(cublas) Unsupported functionality."; case CUBLAS_STATUS_LICENSE_ERROR: return "(cublas) License error."; default: return "(cublas) Unknown error."; } } static inline int error_cublas(error *e, const char *msg, cublasStatus_t err) { return error_fmt(e, (err == CUBLAS_STATUS_ARCH_MISMATCH) ? GA_DEVSUP_ERROR : GA_BLAS_ERROR, "%s: %s%s", msg, estr(err), err == CUBLAS_STATUS_NOT_INITIALIZED ? 
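/*
 * Note on the error mapping above: CUBLAS_STATUS_ARCH_MISMATCH is turned
 * into GA_DEVSUP_ERROR rather than GA_BLAS_ERROR so callers can tell
 * "this device/library cannot do it" apart from a real failure -- the
 * batched GEMM dispatcher earlier in this tree uses exactly that code to
 * fall back from the strided-batched path to the pointer-array path.  The
 * sketch below shows only the caller pattern; fast_path/slow_path are
 * hypothetical stand-ins.
 */
#if 0  /* illustrative sketch, not part of the library sources */
#include <gpuarray/error.h>

static int fast_path(void) { return GA_DEVSUP_ERROR; }
static int slow_path(void) { return GA_NO_ERROR; }

/* Only a "not supported here" result triggers the fallback; any other
 * error is propagated to the caller unchanged. */
static int dispatch(void) {
  int err = fast_path();
  if (err == GA_DEVSUP_ERROR)
    err = slow_path();
  return err;
}
#endif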
" (Possibly because the driver version is too old for the cuda version)" : ""); } #define CUBLAS_EXIT_ON_ERROR(ctx, cmd) do { \ cublasStatus_t err = (cmd); \ if (err != CUBLAS_STATUS_SUCCESS) { \ cuda_exit(ctx); \ return error_cublas((ctx)->err, #cmd, err); \ } \ } while(0) typedef struct _blas_handle { cublasHandle_t h; GpuKernel sgemvBH_N_a1_b1_small; GpuKernel sgemvBH_T_a1_b1_small; GpuKernel dgemvBH_N_a1_b1_small; GpuKernel dgemvBH_T_a1_b1_small; GpuKernel sgerBH_gen_small; GpuKernel dgerBH_gen_small; uint8_t tensorCore; } blas_handle; #define LARGE_VAL(v) (v >= INT_MAX) static const char *code_sgemvBH_N_a1_b1_small = \ "#include \"cluda.h\"\n" \ "KERNEL void sgemv(const float *A[], size_t lda, " \ " const float *x[], size_t incx, " \ " float *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ " for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < m;" \ " i += gridDim.x * blockDim.x) {" \ " float yi = 0.0f;" \ " const float *Ap = A[p] + i;" \ " const float *xp = x[p];\n" \ " #pragma unroll 32\n" \ " for (size_t j = 0; j < n; j++) {" \ " yi += Ap[0] * xp[0];" \ " Ap += lda;" \ " xp += incx;" \ " }" \ " atom_add_fg(&y[p][i*incy], yi);" \ " }" \ " }" \ "}\n"; static const char *code_sgemvBH_T_a1_b1_small = \ "#include \"cluda.h\"\n" \ "KERNEL void sgemv(const float *A[], size_t lda, " \ " const float *x[], size_t incx, " \ " float *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ " if (i >= m || p >= b) return;" \ " float yi = 0.0f;" \ " const float *Ap = A[p] + i * lda;" \ " const float *xp = x[p];\n" \ " # pragma unroll 32\n" \ " for (size_t j = 0; j < n; j++) {" \ " yi += Ap[j] * xp[0];" \ " xp += incx;" \ " }" \ " atom_add_fg(&y[p][i*incy], yi);" \ "}\n"; static const char *code_dgemvBH_N_a1_b1_small = \ "#include \"cluda.h\"\n" \ "KERNEL void dgemv(const double *A[], size_t lda, " \ " const double *x[], size_t incx, " \ " double *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ " for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < m;" \ " i += gridDim.x * blockDim.x) {" \ " double yi = 0.0;" \ " const double *Ap = A[p] + i;" \ " const double *xp = x[p];\n" \ " #pragma unroll 32\n" \ " for (size_t j = 0; j < n; j++) {" \ " yi += Ap[0] * xp[0];" \ " Ap += lda;" \ " xp += incx;" \ " }" \ " atom_add_dg(&y[p][i*incy], yi);" \ " }" \ " }" \ "}\n"; static const char *code_dgemvBH_T_a1_b1_small = \ "#include \"cluda.h\"\n" \ "KERNEL void dgemv(const double *A[], size_t lda, " \ " const double *x[], size_t incx, " \ " double *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ " if (i >= m || p >= b) return;" \ " double yi = 0.0;" \ " const double *Ap = A[p] + i * lda;" \ " const double *xp = x[p];\n" \ " # pragma unroll 32\n" \ " for (size_t j = 0; j < n; j++) {" \ " yi += Ap[j] * xp[0];" \ " xp += incx;" \ " }" \ " atom_add_dg(&y[p][i*incy], yi);" \ "}\n"; static const char *code_sgerBH_gen_small = \ "#include \"cluda.h\"\n" \ "KERNEL void _sgerBH_gen_small(" \ " const float *x[], size_t incx," \ " const float *y[], size_t incy," \ " float alpha, float *A[], size_t lda," \ " size_t b, size_t m, size_t n) {" \ " size_t i = 
blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t j = blockIdx.y * blockDim.y + threadIdx.y;" \ " if (i >= m || j >= n) return;" \ " for (size_t p = blockIdx.z; p < b; p += gridDim.z) {" \ " atom_add_fg(&A[p][j * lda + i]," \ " alpha * x[p][i * incx] * y[p][j * incy]);" \ " }" \ "}\n"; static const char *code_dgerBH_gen_small = \ "#include \"cluda.h\"\n" \ "KERNEL void _dgerBH_gen_small(" \ " const double *x[], size_t incx, " \ " const double *y[], size_t incy," \ " double alpha, double *A[], size_t lda," \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t j = blockIdx.y * blockDim.y + threadIdx.y;" \ " if (i >= m || j >= n) return;" \ " for (size_t p = blockIdx.z; p < b; p += gridDim.z) {" \ " atom_add_dg(&A[p][j * lda + i]," \ " alpha * x[p][i * incx] * y[p][j * incy]);" \ " }" \ "}\n"; static int setup(gpucontext *c) { cuda_context *ctx = (cuda_context *)c; blas_handle *handle; CUdevice dev; cublasStatus_t err; int types[10]; int major, minor; int e; if (ctx->blas_handle != NULL) return GA_NO_ERROR; handle = calloc(1, sizeof(*handle)); if (handle == NULL) return error_sys(ctx->err, "calloc"); cuda_enter(ctx); { CUresult err; err = cuCtxGetDevice(&dev); if (err != CUDA_SUCCESS) { cuda_exit(ctx); return error_cuda(ctx->err, "cuCtxGetDevice", err); } } GA_CUDA_EXIT_ON_ERROR(ctx, get_cc(dev, &major, &minor, ctx->err)); /* Only try to use tensor core on cuda 9 and up */ if (ctx->major >= 9 && major >= 7 && minor >= 0) { handle->tensorCore = 1; } else { handle->tensorCore = 0; } err = cublasCreate(&handle->h); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); free(handle); return error_cublas(ctx->err, "cublasCreate", err); } err = cublasSetStream(handle->h, ctx->s); if (err != CUBLAS_STATUS_SUCCESS) { e = error_cublas(ctx->err, "cublasSetStream", err); goto e1; } err = cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST); if (err != CUBLAS_STATUS_SUCCESS) { e = error_cublas(ctx->err, "cublasSetPointerMode", err); goto e1; } types[0] = GA_BUFFER; types[1] = GA_SIZE; types[2] = GA_BUFFER; types[3] = GA_SIZE; types[4] = GA_BUFFER; types[5] = GA_SIZE; types[6] = GA_SIZE; types[7] = GA_SIZE; types[8] = GA_SIZE; e = GpuKernel_init(&handle->sgemvBH_N_a1_b1_small, c, 1, &code_sgemvBH_N_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL); if (e != GA_NO_ERROR) goto e1; e = GpuKernel_init(&handle->sgemvBH_T_a1_b1_small, c, 1, &code_sgemvBH_T_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL); if (e != GA_NO_ERROR) goto e2; e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, c, 1, &code_dgemvBH_N_a1_b1_small, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL); if (e != GA_NO_ERROR) goto e3; e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, c, 1, &code_dgemvBH_T_a1_b1_small, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL); if (e != GA_NO_ERROR) goto e4; types[0] = GA_BUFFER; types[1] = GA_SIZE; types[2] = GA_BUFFER; types[3] = GA_SIZE; types[4] = GA_FLOAT; types[5] = GA_BUFFER; types[6] = GA_SIZE; types[7] = GA_SIZE; types[8] = GA_SIZE; types[9] = GA_SIZE; e = GpuKernel_init(&handle->sgerBH_gen_small, c, 1, &code_sgerBH_gen_small, NULL, "_sgerBH_gen_small", 10, types, 0, NULL); if (e != GA_NO_ERROR) goto e5; types[4] = GA_DOUBLE; e = GpuKernel_init(&handle->dgerBH_gen_small, c, 1, &code_dgerBH_gen_small, NULL, "_dgerBH_gen_small", 10, types, GA_USE_DOUBLE, NULL); if (e != GA_NO_ERROR) goto e6; ctx->blas_handle = handle; cuda_exit(ctx); return GA_NO_ERROR; e6: GpuKernel_clear(&handle->sgerBH_gen_small); e5: GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small); e4: 
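/*
 * setup() above compiles each helper kernel once per context: the types[]
 * array describes the kernel's argument signature, GpuKernel_init builds
 * it from the cluda source string, and GpuKernel_call later launches it
 * with a matching void* argument vector.  The sketch below shows that
 * registration/launch pattern for a trivial made-up kernel; the kernel
 * body mimics the style of the strings above but is purely illustrative.
 */
#if 0  /* illustrative sketch, not part of the library sources */
#include <gpuarray/kernel.h>
#include <gpuarray/error.h>

static const char *code_fill1 =
  "#include \"cluda.h\"\n"
  "KERNEL void fill1(float *dst, size_t n) {"
  "  size_t i = blockIdx.x * blockDim.x + threadIdx.x;"
  "  if (i < n) dst[i] = 1.0f;"
  "}\n";

/* Fill `buf` (n floats) with ones; argument order follows the
 * GpuKernel_init/GpuKernel_call calls used in this file. */
static int fill_ones(gpucontext *ctx, gpudata *buf, size_t n) {
  GpuKernel k;
  int types[2] = {GA_BUFFER, GA_SIZE};
  size_t ls = 256, gs = (n + 255) / 256;
  void *args[2];
  int err;

  err = GpuKernel_init(&k, ctx, 1, &code_fill1, NULL, "fill1",
                       2, types, 0, NULL);
  if (err != GA_NO_ERROR)
    return err;
  args[0] = buf;
  args[1] = &n;
  err = GpuKernel_call(&k, 1, &gs, &ls, 0, args);
  GpuKernel_clear(&k);
  return err;
}
#endif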
GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small); e3: GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small); e2: GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small); e1: cublasDestroy(handle->h); cuda_exit(ctx); free(handle); return e; } static void teardown(gpucontext *c) { cuda_context *ctx = (cuda_context *)c; blas_handle *handle = (blas_handle *)ctx->blas_handle; if (ctx->blas_handle == NULL) return; cuda_enter(ctx); cublasDestroy(handle->h); GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small); GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small); GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small); GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small); GpuKernel_clear(&handle->sgerBH_gen_small); GpuKernel_clear(&handle->dgerBH_gen_small); cuda_exit(ctx); free(ctx->blas_handle); ctx->blas_handle = NULL; } static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; gpudata *T; size_t t; cb_transpose transT; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; t = offA; offA = offB; offB = t; } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemm(h->h, convT(transA), convT(transB), M, N, K, &alpha, ((float *)A->ptr) + offA, lda, ((float *)B->ptr) + offB, ldb, &beta, ((float *)C->ptr) + offC, ldc)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; } static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; gpudata *T; size_t t; cb_transpose transT; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; t = offA; offA = offB; offB = t; } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); CUBLAS_EXIT_ON_ERROR(ctx, cublasDgemm(h->h, convT(transA), convT(transB), M, N, K, &alpha, ((double *)A->ptr) + offA, lda, ((double *)B->ptr) + offB, ldb, &beta, ((double *)C->ptr) + offC, ldc)); 
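/*
 * Every wrapper in this file starts with the LARGE_VAL() guard because
 * the cuBLAS v2 entry points take int dimensions and leading dimensions;
 * sizes (and products of sizes) that do not fit are rejected up front
 * with GA_XLARGE_ERROR instead of being silently truncated.  Standalone
 * sketch of the same check:
 */
#if 0  /* illustrative sketch, not part of the library sources */
#include <limits.h>
#include <stddef.h>
#include <stdio.h>

/* Reject any dimension or dimension product that cannot be represented
 * as a positive int, mirroring the LARGE_VAL() checks above. */
static int fits_in_blas_int(size_t m, size_t n, size_t k) {
  if (m >= INT_MAX || n >= INT_MAX || k >= INT_MAX)
    return 0;
  if (m * n >= INT_MAX || m * k >= INT_MAX || k * n >= INT_MAX)
    return 0;
  return 1;
}

int main(void) {
  printf("%d\n", fits_in_blas_int((size_t)1 << 20, (size_t)1 << 20, 64)); /* 0 */
  printf("%d\n", fits_in_blas_int(4096, 4096, 4096));                     /* 1 */
  return 0;
}
#endif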
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; } static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { /* This will use float32 for computation as it's the best we can * have right now. In the future when native float16 support will be * there we will switch to that. */ cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; gpudata *T; size_t t; cb_transpose transT; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); if (cublasSgemmEx == NULL && (cublasGemmEx == NULL || h->tensorCore == 0)) return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmEx|cublasGemmEx unavailable"); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; t = offA; offA = offB; offB = t; } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); if (cublasGemmEx != NULL && h->tensorCore) { CUBLAS_EXIT_ON_ERROR(ctx, cublasGemmEx(h->h, convT(transA), convT(transB), M, N, K, &alpha, ((uint16_t *)A->ptr) + offA, CUDA_R_16F, lda, ((uint16_t *)B->ptr) + offB, CUDA_R_16F, ldb, &beta, ((uint16_t *)C->ptr) + offC, CUDA_R_16F, ldc, CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP)); } else { CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemmEx(h->h, convT(transA), convT(transB), M, N, K, &alpha, ((uint16_t *)A->ptr) + offA, CUDA_R_16F, lda, ((uint16_t *)B->ptr) + offB, CUDA_R_16F, ldb, &beta, ((uint16_t *)C->ptr) + offC, CUDA_R_16F, ldc)); } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; } static int hgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount) { cuda_context *ctx; blas_handle *h; size_t t; ssize_t st; gpudata *T; cb_transpose transT; cublasStatus_t err; ga_half_t halpha, hbeta; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); ctx = A->ctx; if (cublasHgemmStridedBatched == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasHgemmStridedBatched not available in your version of cuBLAS"); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); h = (blas_handle *)ctx->blas_handle; cuda_enter(ctx); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; t = offA; offA = offB; offB = t; transT 
= transA; transA = transB; transB = transT; st = strideA; strideA = strideB; strideB = st; } halpha = ga_float2half(alpha); hbeta = ga_float2half(beta); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); err = cublasHgemmStridedBatched(h->h, convT(transA), convT(transB), M, N, K, (__half *)&halpha, ((__half *)A->ptr) + offA, lda, strideA, ((__half *)B->ptr) + offB, ldb, strideB, (__half *)&hbeta, ((__half *)C->ptr) + offC, ldc, strideC, batchCount); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); return error_cublas(ctx->err, "cublasHgemmStridedBatched", err); } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; } static int sgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount) { cuda_context *ctx; blas_handle *h; size_t t; ssize_t st; gpudata *T; cb_transpose transT; cublasStatus_t err; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); ctx = A->ctx; if (cublasSgemmStridedBatched == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmStridedBatched not available in your version of cuBLAS"); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); h = (blas_handle *)ctx->blas_handle; cuda_enter(ctx); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; t = offA; offA = offB; offB = t; transT = transA; transA = transB; transB = transT; st = strideA; strideA = strideB; strideB = st; } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); err = cublasSgemmStridedBatched(h->h, convT(transA), convT(transB), M, N, K, &alpha, ((float *)A->ptr) + offA, (int)lda, strideA, ((float *)B->ptr) + offB, (int)ldb, strideB, &beta, ((float *)C->ptr) + offC, (int)ldc, strideC, batchCount); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); return error_cublas(ctx->err, "cublasHgemmStridedBatched", err); } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; } static int dgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount) { cuda_context *ctx; blas_handle *h; size_t t; ssize_t st; gpudata *T; cb_transpose transT; cublasStatus_t err; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); ctx = A->ctx; if (cublasDgemmStridedBatched == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasDgemmStridedBatched not available in your version of cuBLAS"); if (LARGE_VAL(M) || LARGE_VAL(N) || 
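/*
 * hgemm() above picks between two float16 paths: cublasGemmEx with
 * CUBLAS_GEMM_DFALT_TENSOR_OP when the symbol was loaded and setup()
 * flagged tensor-core support (compute capability 7.0+ on CUDA 9+), and
 * cublasSgemmEx (fp16 storage, fp32 accumulation) otherwise.  The sketch
 * below isolates that decision; the flags stand in for the dynamically
 * loaded symbols and the tensorCore field.
 */
#if 0  /* illustrative sketch, not part of the library sources */
#include <stdio.h>

enum half_gemm_path { PATH_UNSUPPORTED, PATH_GEMM_EX_TENSOR, PATH_SGEMM_EX };

static enum half_gemm_path pick_half_gemm(int have_gemm_ex,
                                          int have_sgemm_ex,
                                          int tensor_core) {
  if (have_gemm_ex && tensor_core)
    return PATH_GEMM_EX_TENSOR;   /* fp16 in/out, tensor-op math */
  if (have_sgemm_ex)
    return PATH_SGEMM_EX;         /* fp16 in/out, fp32 accumulate */
  return PATH_UNSUPPORTED;        /* reported as GA_DEVSUP_ERROR above */
}

int main(void) {
  printf("%d\n", pick_half_gemm(1, 1, 1)); /* tensor-op path */
  printf("%d\n", pick_half_gemm(0, 1, 0)); /* SgemmEx fallback */
  return 0;
}
#endif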
LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); h = (blas_handle *)ctx->blas_handle; cuda_enter(ctx); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; t = offA; offA = offB; offB = t; transT = transA; transA = transB; transB = transT; st = strideA; strideA = strideB; strideB = st; } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); err = cublasDgemmStridedBatched(h->h, convT(transA), convT(transB), M, N, K, &alpha, ((double *)A->ptr) + offA, (int)lda, strideA, ((double *)B->ptr) + offB, (int)ldb, strideB, &beta, ((double *)C->ptr) + offC, (int)ldc, strideC, batchCount); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); return error_cublas(ctx->err, "cublasDgemmStridedBatched", err); } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; } static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { cuda_context *ctx; blas_handle *h; size_t *lt, t; gpudata **T; size_t i; const size_t threshold = 650; cb_transpose transT; ASSERT_BUF(A[0]); ctx = A[0]->ctx; if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); h = (blas_handle *)ctx->blas_handle; cuda_enter(ctx); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; lt = offA; offA = offB; offB = lt; } /* use parallel cublasSgemm calls rather than cublasSgemmBatched for * large products */ if (M * N * K > threshold * threshold * threshold) { for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(B[i]); ASSERT_BUF(C[i]); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL)); CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemm(h->h, convT(transA), convT(transB), M, N, K, &alpha, ((float*)A[i]->ptr) + offA[i], lda, ((float*)B[i]->ptr) + offB[i], ldb, &beta, ((float*)C[i]->ptr) + offC[i], ldc)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL)); } } else { float **T_l = alloca(sizeof(float *) * batchCount * 3); const float **A_l = (const float **)T_l; const float **B_l = (const float **)T_l + batchCount; float **C_l = T_l + (batchCount * 2); gpudata *Ta; CUdeviceptr Aa, Ba, Ca; cublasStatus_t err; for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(B[i]); ASSERT_BUF(C[i]); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, 
cuda_wait(C[i], CUDA_WAIT_ALL)); A_l[i] = ((float *)A[i]->ptr) + offA[i]; B_l[i] = ((float *)B[i]->ptr) + offB[i]; C_l[i] = ((float *)C[i]->ptr) + offC[i]; } Ta = gpudata_alloc((gpucontext *)ctx, sizeof(float *) * batchCount * 3, NULL, 0, NULL); if (Ta == NULL) { cuda_exit(ctx); return ctx->err->code; } Aa = *(CUdeviceptr *)Ta; Ba = Aa + (batchCount * sizeof(float *)); Ca = Aa + (batchCount * sizeof(float *) * 2); if (gpudata_write(Ta, 0, T_l, sizeof(float *) * batchCount * 3) != GA_NO_ERROR) { gpudata_release(Ta); cuda_exit(ctx); return ctx->err->code; } if (cuda_wait(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) { gpudata_release(Ta); cuda_exit(ctx); return ctx->err->code; } err = cublasSgemmBatched(h->h, convT(transA), convT(transB), M, N, K, &alpha, (const float **)Aa, lda, (const float **)Ba, ldb, &beta, (float **)Ca, ldc, batchCount); if (cuda_record(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) { gpudata_release(Ta); cuda_exit(ctx); return ctx->err->code; } gpudata_release(Ta); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); return error_cublas(ctx->err, "cublasSgemmBatched", err); } for (i = 0; i < batchCount; i++) { GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL)); } } cuda_exit(ctx); return GA_NO_ERROR; } static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { cuda_context *ctx; blas_handle *h; size_t *lt, t; gpudata **T; size_t i; const size_t threshold = 650; cb_transpose transT; ASSERT_BUF(A[0]); ctx = A[0]->ctx; if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); h = (blas_handle *)ctx->blas_handle; cuda_enter(ctx); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; lt = offA; offA = offB; offB = lt; } /* use parallel cublasSgemm calls rather than cublasSgemmBatched for * large products */ if (M * N * K > threshold * threshold * threshold) { for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(B[i]); ASSERT_BUF(C[i]); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL)); CUBLAS_EXIT_ON_ERROR(ctx, cublasDgemm(h->h, convT(transA), convT(transB), M, N, K, &alpha, (double*)A[i]->ptr + offA[i], lda, (double*)B[i]->ptr + offB[i], ldb, &beta, (double*)C[i]->ptr + offC[i], ldc)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL)); } } else { double **T_l = alloca(sizeof(double *) * batchCount * 3); const double **A_l = (const double **)T_l; const double **B_l = (const double **)T_l + batchCount; double **C_l = T_l + (batchCount * 2); gpudata *Ta; CUdeviceptr Aa, Ba, Ca; cublasStatus_t err; for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(B[i]); ASSERT_BUF(C[i]); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ)); 
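/*
 * sgemmBatch()/dgemmBatch() above choose between two strategies: when a
 * single member's product M*N*K exceeds roughly 650^3 they issue one
 * cublas*gemm call per member on the shared stream, otherwise they pack
 * device pointer arrays and use the *gemmBatched entry point.  Standalone
 * sketch of that size heuristic:
 */
#if 0  /* illustrative sketch, not part of the library sources */
#include <stddef.h>
#include <stdio.h>

/* Same threshold as in the code above. */
static int use_individual_gemms(size_t m, size_t n, size_t k) {
  const size_t threshold = 650;
  return m * n * k > threshold * threshold * threshold;
}

int main(void) {
  printf("%d\n", use_individual_gemms(1024, 1024, 1024)); /* 1: loop of cublasSgemm */
  printf("%d\n", use_individual_gemms(128, 128, 128));    /* 0: cublasSgemmBatched */
  return 0;
}
#endif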
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL)); A_l[i] = ((double *)A[i]->ptr) + offA[i]; B_l[i] = ((double *)B[i]->ptr) + offB[i]; C_l[i] = ((double *)C[i]->ptr) + offC[i]; } Ta = gpudata_alloc((gpucontext *)ctx, sizeof(double *) * batchCount * 3, NULL, 0, NULL); if (Ta == NULL) { cuda_exit(ctx); return ctx->err->code; } Aa = *(CUdeviceptr *)Ta; Ba = Aa + (batchCount * sizeof(double *)); Ca = Aa + (batchCount * sizeof(double *) * 2); if (gpudata_write(Ta, 0, T_l, sizeof(double *) * batchCount * 3) != GA_NO_ERROR) { gpudata_release(Ta); cuda_exit(ctx); return ctx->err->code; } if (cuda_wait(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) { gpudata_release(Ta); cuda_exit(ctx); return ctx->err->code; } err = cublasDgemmBatched(h->h, convT(transA), convT(transB), M, N, K, &alpha, (const double **)Aa, lda, (const double **)Ba, ldb, &beta, (double **)Ca, ldc, batchCount); if (cuda_record(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) { gpudata_release(Ta); cuda_exit(ctx); return ctx->err->code; } gpudata_release(Ta); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); return error_cublas(ctx->err, "cublasDgemmBatched", err); } for (i = 0; i < batchCount; i++) { GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL)); } } cuda_exit(ctx); return GA_NO_ERROR; } static int sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; cublasPointerMode_t pmode; ASSERT_BUF(X); ASSERT_BUF(Y); ASSERT_BUF(Z); if (LARGE_VAL(N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE)); // we should store dot result on device CUBLAS_EXIT_ON_ERROR(ctx, cublasGetPointerMode(h->h, &pmode)); CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE)); CUBLAS_EXIT_ON_ERROR(ctx, cublasSdot(h->h, N, ((float*)X->ptr) + offX, incX, ((float*)Y->ptr) + offY, incY, ((float*)Z->ptr) + offZ)); CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, pmode)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; } static int ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; cublasPointerMode_t pmode; ASSERT_BUF(X); ASSERT_BUF(Y); ASSERT_BUF(Z); if (LARGE_VAL(N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE)); // we should store dot result on device CUBLAS_EXIT_ON_ERROR(ctx, cublasGetPointerMode(h->h, &pmode)); CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE)); CUBLAS_EXIT_ON_ERROR(ctx, cublasDdot(h->h, N, ((double*)X->ptr) + offX, incX, ((double*)Y->ptr) + offY, 
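/*
 * sdot()/ddot() above switch the handle to CUBLAS_POINTER_MODE_DEVICE so
 * the scalar result lands directly in the output buffer Z on the device,
 * then restore the previous mode.  A hedged caller-side sketch follows:
 * it assumes the public gpublas_sdot wrapper and the gpudata_read()
 * read-back helper keep the argument order used elsewhere in libgpuarray,
 * so treat it as a sketch rather than reference usage.
 */
#if 0  /* illustrative sketch, not part of the library sources */
#include <gpuarray/buffer.h>
#include <gpuarray/buffer_blas.h>
#include <gpuarray/error.h>

/* Dot-product of two device vectors of n contiguous floats, copied back
 * to the host through a one-element device buffer. */
static int dot_to_host(gpucontext *ctx, gpudata *x, gpudata *y, size_t n,
                       float *out) {
  int err;
  gpudata *z = gpudata_alloc(ctx, sizeof(float), NULL, 0, NULL);
  if (z == NULL)
    return GA_MEMORY_ERROR;
  err = gpublas_sdot(n, x, 0, 1, y, 0, 1, z, 0);
  if (err == GA_NO_ERROR)
    err = gpudata_read(out, z, 0, sizeof(float));
  gpudata_release(z);
  return err;
}
#endif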
incY, ((double*)Z->ptr) + offZ)); CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, pmode)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; ASSERT_BUF(A); ASSERT_BUF(X); ASSERT_BUF(Y); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { t = N; N = M; M = t; if (transA == cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL)); CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemv(h->h, convT(transA), M, N, &alpha, ((float *)A->ptr) + offA, lda, ((float *)X->ptr) + offX, incX, &beta, ((float *)Y->ptr) + offY, incY)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; } static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; ASSERT_BUF(A); ASSERT_BUF(X); ASSERT_BUF(Y); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { t = N; N = M; M = t; if (transA == cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL)); CUBLAS_EXIT_ON_ERROR(ctx, cublasDgemv(h->h, convT(transA), M, N, &alpha, ((double *)A->ptr) + offA, lda, ((double *)X->ptr) + offX, incX, &beta, ((double *)Y->ptr) + offY, incY)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; } static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { /* Flags is there for possible future implementations where we might not use atomics or have some alternate implemntation. 
*/ cuda_context *ctx; size_t t, i; size_t ls[2], gs[2]; void *args[9]; gpudata *Aa, *xa, *ya; int err; ASSERT_BUF(A[0]); ctx = A[0]->ctx; if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags not set to 0"); if (alpha != 1.0 || beta != 1.0) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Only alpha = 1 and beta = 1 are supported for now"); if (M < 512) { ls[0] = 32; if (batchCount > 16) ls[1] = 16; else ls[1] = batchCount; } else { ls[0] = 512; ls[1] = 1; } gs[0] = (M + ls[0] - 1) / ls[0]; gs[1] = (batchCount + ls[1] - 1) / ls[1]; if (gs[0] * gs[1] / 65535) { gs[1] = (65535 / gs[0]); } if (order == cb_c) { t = N; N = M; M = t; if (transA == cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } cuda_enter(ctx); { float **T_l = alloca(sizeof(float *) * batchCount * 3); const float **A_l = (const float **)T_l; const float **x_l = (const float **)T_l + batchCount; float **y_l = T_l + (batchCount * 2); for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(x[i]); ASSERT_BUF(y[i]); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(x[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(y[i], CUDA_WAIT_ALL)); A_l[i] = (float *)(A[i]->ptr + offA[i]); x_l[i] = (float *)(x[i]->ptr + offX[i]); y_l[i] = (float *)(y[i]->ptr + offY[i]); } Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, A_l, GA_BUFFER_INIT); if (Aa == NULL) return ctx->err->code; xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, x_l, GA_BUFFER_INIT); if (xa == NULL) { cuda_ops.buffer_release(Aa); return ctx->err->code; } ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, y_l, GA_BUFFER_INIT); if (ya == NULL) { cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); return ctx->err->code; } } args[0] = Aa; args[1] = &lda; args[2] = xa; args[3] = &incX; args[4] = ya; args[5] = &incY; args[6] = &batchCount; args[7] = &M; args[8] = &N; if (transA == cb_no_trans) { err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, gs, ls, 0, args); } else { err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, gs, ls, 0, args); } cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); cuda_ops.buffer_release(ya); if (err != GA_NO_ERROR) { cuda_exit(ctx); return err; } for (i = 0; i < batchCount; i++) { GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(x[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_ALL)); } cuda_exit(ctx); return GA_NO_ERROR; } static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { cuda_context *ctx; size_t t, i; size_t ls[2], gs[2]; void *args[9]; gpudata *Aa, *xa, *ya; int err; ASSERT_BUF(A[0]); ctx = A[0]->ctx; if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags not set to 0"); if (alpha != 1.0 || beta != 1.0) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Only alpha = 1 and beta = 1 are supported for now"); if (M < 512) { ls[0] = 32; if (batchCount > 16) ls[1] = 16; else ls[1] = batchCount; } else { ls[0] = 512; ls[1] = 1; } gs[0] = (M + ls[0] - 1) / ls[0]; gs[1] = (batchCount + ls[1] - 1) / ls[1]; if (gs[0] * gs[1] / 65535) { gs[1] = (65535 / gs[0]); } if (order == cb_c) { t = N; N = M; M = t; if (transA 
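/*
 * The batched gemv helpers above only accept alpha == beta == 1 and use a
 * two-dimensional launch: rows on the x axis, batch members on the y
 * axis, with the block shrunk for small M and the grid clamped to the
 * 65535-block limit.  Standalone sketch of that geometry computation:
 */
#if 0  /* illustrative sketch, not part of the library sources */
#include <stddef.h>
#include <stdio.h>

/* Same block/grid selection as sgemvBatch()/dgemvBatch() above. */
static void gemv_batch_geometry(size_t m, size_t batch,
                                size_t ls[2], size_t gs[2]) {
  if (m < 512) {
    ls[0] = 32;
    ls[1] = batch > 16 ? 16 : batch;
  } else {
    ls[0] = 512;
    ls[1] = 1;
  }
  gs[0] = (m + ls[0] - 1) / ls[0];
  gs[1] = (batch + ls[1] - 1) / ls[1];
  if (gs[0] * gs[1] >= 65535)   /* clamp the batch axis, as above */
    gs[1] = 65535 / gs[0];
}

int main(void) {
  size_t ls[2], gs[2];
  gemv_batch_geometry(100000, 3000, ls, gs);
  printf("ls=(%zu,%zu) gs=(%zu,%zu)\n", ls[0], ls[1], gs[0], gs[1]);
  return 0;
}
#endif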
== cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } cuda_enter(ctx); { double **T_l = alloca(sizeof(double *) * batchCount * 3); const double **A_l = (const double **)T_l; const double **x_l = (const double **)T_l + batchCount; double **y_l = T_l + (batchCount * 2); for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(x[i]); ASSERT_BUF(y[i]); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(x[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(y[i], CUDA_WAIT_ALL)); A_l[i] = (double *)(A[i]->ptr + offA[i]); x_l[i] = (double *)(x[i]->ptr + offX[i]); y_l[i] = (double *)(y[i]->ptr + offY[i]); } Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, A_l, GA_BUFFER_INIT); if (Aa == NULL) return ctx->err->code; xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, x_l, GA_BUFFER_INIT); if (xa == NULL) { cuda_ops.buffer_release(Aa); return ctx->err->code; } ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, y_l, GA_BUFFER_INIT); if (ya == NULL) { cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); return ctx->err->code; } } args[0] = Aa; args[1] = &lda; args[2] = xa; args[3] = &incX; args[4] = ya; args[5] = &incY; args[6] = &batchCount; args[7] = &M; args[8] = &N; if (transA == cb_no_trans) { err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, gs, ls, 0, args); } else { err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, gs, ls, 0, args); } cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); cuda_ops.buffer_release(ya); if (err != GA_NO_ERROR) { cuda_exit(ctx); return err; } for (i = 0; i < batchCount; i++) { GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(x[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_ALL)); } cuda_exit(ctx); return GA_NO_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; gpudata *td; size_t t; ASSERT_BUF(X); ASSERT_BUF(Y); ASSERT_BUF(A); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { t = M; M = N; N = t; t = offX; offX = offY; offY = t; t = incX; incX = incY; incY = t; td = X; X = Y; Y = td; } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL)); CUBLAS_EXIT_ON_ERROR(ctx, cublasSger(h->h, M, N, &alpha, ((float *)X->ptr) + offX, incX, ((float *)Y->ptr) + offY, incY, ((float *)A->ptr) + offA, lda)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; } static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; gpudata *td; size_t t; ASSERT_BUF(X); ASSERT_BUF(Y); ASSERT_BUF(A); if 
(LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { t = M; M = N; N = t; t = offX; offX = offY; offY = t; t = incX; incX = incY; incY = t; td = X; X = Y; Y = td; } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL)); CUBLAS_EXIT_ON_ERROR(ctx, cublasDger(h->h, M, N, &alpha, ((double *)X->ptr) + offX, incX, ((double *)Y->ptr) + offY, incY, ((double *)A->ptr) + offA, lda)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; size_t t, *tp, i; size_t ls[3] = {M, N, 1}, gs[3] = {1, 1, batchCount}; void *args[10]; gpudata **T; gpudata *Aa, *xa, *ya; int err; ASSERT_BUF(x[0]); ctx = x[0]->ctx; if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); if (incX == 1) { if (ls[0] > 32) { gs[0] = (ls[0] + 31) / 32; ls[0] = 32; } if (ls[0] * ls[1] > 512) { gs[1] = (ls[1] + 15) / 16; ls[1] = 16; } } else { if (ls[1] > 32) { gs[1] = (ls[1] + 31) / 32; ls[1] = 32; } if (ls[0] * ls[1] > 512) { gs[0] = (ls[0] + 15) / 16; ls[0] = 16; } } if (gs[0] * gs[1] * gs[2] > 65535) { if (gs[0] * gs[1] > 65535) return error_set(ctx->err, GA_VALUE_ERROR, "Input too large"); gs[2] = (65535 / (gs[0] * gs[1])); } if (order == cb_c) { t = M; M = N; N = t; tp = offX; offX = offY; offY = tp; t = incX; incX = incY; incY = t; T = x; x = y; y = T; } cuda_enter(ctx); { float **T_l = alloca(sizeof(float *) * batchCount * 3); const float **A_l = (const float **)T_l; const float **x_l = (const float **)T_l + batchCount; float **y_l = T_l + (batchCount * 2); for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(x[i]); ASSERT_BUF(y[i]); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_ALL)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(x[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(y[i], CUDA_WAIT_READ)); A_l[i] = (float *)(A[i]->ptr + offA[i]); x_l[i] = (float *)(x[i]->ptr + offX[i]); y_l[i] = (float *)(y[i]->ptr + offY[i]); } Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, A_l, GA_BUFFER_INIT); if (Aa == NULL) return ctx->err->code; xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, x_l, GA_BUFFER_INIT); if (xa == NULL) { cuda_ops.buffer_release(Aa); return ctx->err->code; } ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, y_l, GA_BUFFER_INIT); if (ya == NULL) { cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); return ctx->err->code; } } args[0] = xa; args[1] = &incX; args[2] = ya; args[3] = &incY; args[4] = α args[5] = Aa; args[6] = &lda; args[7] = &batchCount; args[8] = &M; args[9] = &N; err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args); cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); cuda_ops.buffer_release(ya); if (err != GA_NO_ERROR) { cuda_exit(ctx); return err; } for (i = 0; i < batchCount; i++) { GA_CUDA_EXIT_ON_ERROR(ctx, 
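/*
 * sgerBatch()/dgerBatch() above launch one thread per (row, column) pair
 * with the batch on the third grid axis: the axis that walks unit-stride
 * data keeps the wide (32) block dimension, the block is capped at 512
 * threads, and the batch axis is clamped when the total block count
 * exceeds 65535 (the kernel then strides over the remaining batch
 * members).  Standalone sketch of the same sizing:
 */
#if 0  /* illustrative sketch, not part of the library sources */
#include <stddef.h>
#include <stdio.h>

static int ger_batch_geometry(size_t m, size_t n, size_t batch,
                              int unit_inc_x, size_t ls[3], size_t gs[3]) {
  ls[0] = m; ls[1] = n; ls[2] = 1;
  gs[0] = 1; gs[1] = 1; gs[2] = batch;
  if (unit_inc_x) {
    if (ls[0] > 32) { gs[0] = (ls[0] + 31) / 32; ls[0] = 32; }
    if (ls[0] * ls[1] > 512) { gs[1] = (ls[1] + 15) / 16; ls[1] = 16; }
  } else {
    if (ls[1] > 32) { gs[1] = (ls[1] + 31) / 32; ls[1] = 32; }
    if (ls[0] * ls[1] > 512) { gs[0] = (ls[0] + 15) / 16; ls[0] = 16; }
  }
  if (gs[0] * gs[1] * gs[2] > 65535) {
    if (gs[0] * gs[1] > 65535)
      return -1;                          /* GA_VALUE_ERROR above */
    gs[2] = 65535 / (gs[0] * gs[1]);      /* kernel loops over the rest */
  }
  return 0;
}

int main(void) {
  size_t ls[3], gs[3];
  if (ger_batch_geometry(1000, 200, 64, 1, ls, gs) == 0)
    printf("ls=(%zu,%zu,%zu) gs=(%zu,%zu,%zu)\n",
           ls[0], ls[1], ls[2], gs[0], gs[1], gs[2]);
  return 0;
}
#endif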
cuda_record(A[i], CUDA_WAIT_ALL)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(x[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_READ)); } cuda_exit(ctx); return GA_NO_ERROR; } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; size_t t, *tp, i; size_t ls[3] = {M, N, 1}, gs[3] = {1, 1, batchCount}; void *args[10]; gpudata **T; gpudata *Aa, *xa, *ya; int err; ASSERT_BUF(x[0]); ctx = x[0]->ctx; if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); if (incX == 1) { if (ls[0] > 32) { gs[0] = (ls[0] + 31) / 32; ls[0] = 32; } if (ls[0] * ls[1] > 512) { gs[1] = (ls[1] + 15) / 16; ls[1] = 16; } } else { if (ls[1] > 32) { gs[1] = (ls[1] + 31) / 32; ls[1] = 32; } if (ls[0] * ls[1] > 512) { gs[0] = (ls[0] + 15) / 16; ls[0] = 16; } } if (gs[0] * gs[1] * gs[2] > 65535) { if (gs[0] * gs[1] > 65535) return error_set(ctx->err, GA_VALUE_ERROR, "Input too large"); gs[2] = (65535 / (gs[0] * gs[1])); } if (order == cb_c) { t = M; M = N; N = t; tp = offX; offX = offY; offY = tp; t = incX; incX = incY; incY = t; T = x; x = y; y = T; } cuda_enter(ctx); { double **T_l = alloca(sizeof(double *) * batchCount * 3); const double **A_l = (const double **)T_l; const double **x_l = (const double **)T_l + batchCount; double **y_l = T_l + (batchCount * 2); for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(x[i]); ASSERT_BUF(y[i]); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_ALL)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(x[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(y[i], CUDA_WAIT_READ)); A_l[i] = (double *)(A[i]->ptr + offA[i]); x_l[i] = (double *)(x[i]->ptr + offX[i]); y_l[i] = (double *)(y[i]->ptr + offY[i]); } Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, A_l, GA_BUFFER_INIT); if (Aa == NULL) return ctx->err->code; xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, x_l, GA_BUFFER_INIT); if (xa == NULL) { cuda_ops.buffer_release(Aa); return ctx->err->code; } ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, y_l, GA_BUFFER_INIT); if (ya == NULL) { cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); return ctx->err->code; } } args[0] = xa; args[1] = &incX; args[2] = ya; args[3] = &incY; args[4] = α args[5] = Aa; args[6] = &lda; args[7] = &batchCount; args[8] = &M; args[9] = &N; err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args); cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); cuda_ops.buffer_release(ya); if (err != GA_NO_ERROR) { cuda_exit(ctx); return err; } for (i = 0; i < batchCount; i++) { GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_ALL)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(x[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_READ)); } cuda_exit(ctx); return GA_NO_ERROR; } gpuarray_blas_ops cublas_ops = { setup, teardown, NULL, /* hdot */ sdot, ddot, NULL, /* hgemv */ sgemv, dgemv, hgemm, sgemm, dgemm, NULL, /* hger */ sger, dger, NULL, /* hgemmBatch */ sgemmBatch, dgemmBatch, NULL, /* hgemvBatch */ sgemvBatch, dgemvBatch, NULL, /* hgerBatch */ sgerBatch, dgerBatch, hgemm3D, sgemm3D, dgemm3D }; libgpuarray-0.7.6/src/gpuarray_blas_opencl_clblas.c000066400000000000000000000307631326743622600225370ustar00rootroot00000000000000#include "private.h" #include 
"private_opencl.h" #include "loaders/libclblas.h" #include "gpuarray/buffer_blas.h" #include "gpuarray/error.h" extern const gpuarray_buffer_ops opencl_ops; static inline clblasOrder convO(cb_order order) { switch (order) { case cb_row: return clblasRowMajor; case cb_column: return clblasColumnMajor; default: return -1; } } static inline clblasTranspose convT(cb_transpose trans) { switch (trans) { case cb_no_trans: return clblasNoTrans; case cb_trans: return clblasTrans; case cb_conj_trans: return clblasConjTrans; default: return -1; } } static unsigned int refcnt = 0; static const char *estr(clblasStatus err) { if (err > -1024) return cl_error_string((cl_int)err); switch (err) { case clblasNotImplemented: return "Unimplemented feature"; case clblasNotInitialized: return "Library not initialized"; case clblasInvalidMatA: return "matrix A is not a valid memory object"; case clblasInvalidMatB: return "matrix B is not a valid memory object"; case clblasInvalidMatC: return "matrix C is not a valid memory object"; case clblasInvalidVecX: return "vector X is not a valid memory object"; case clblasInvalidVecY: return "vector Y is not a valid memory object"; case clblasInvalidDim: return "An input dimension (M, N, K) is invalid"; case clblasInvalidLeadDimA: return "leading dimension for A must not be less than the size of the first dimension"; case clblasInvalidLeadDimB: return "leading dimension for B must not be less than the size of the second dimension"; case clblasInvalidLeadDimC: return "leading dimension for C must not be less than the size of the third dimension"; case clblasInvalidIncX: return "increment for X must not be 0"; case clblasInvalidIncY: return "increment for Y must not be 0"; case clblasInsufficientMemMatA: return "memory object for matrix A is too small"; case clblasInsufficientMemMatB: return "memory object for matrix B is too small"; case clblasInsufficientMemMatC: return "memory object for matrix C is too small"; case clblasInsufficientMemVecX: return "memory object for vector X is too small"; case clblasInsufficientMemVecY: return "memory object for vector Y is too small"; default: return "Unknow error"; } } static inline int error_clblas(error *e, const char *msg, clblasStatus err) { return error_fmt(e, GA_BLAS_ERROR, "%s: %s", msg, estr(err)); } #define CLB_CHECK(e, cmd) do { \ clblasStatus err = (cmd); \ if (err != clblasSuccess) \ return error_clblas(e, #cmd, err); \ } while (0) static int setup(gpucontext *ctx) { if (refcnt == 0) { CLB_CHECK(ctx->err, clblasSetup()); } if (ctx->blas_handle == NULL) ctx->blas_handle = &refcnt; refcnt++; return GA_NO_ERROR; } static void teardown(gpucontext *ctx) { if (ctx->blas_handle != NULL) { ctx->blas_handle = NULL; refcnt--; } if (refcnt == 0) clblasTeardown(); } #define ARRAY_INIT(A) \ if (A->ev != NULL) \ evl[num_ev++] = A->ev #define ARRAY_FINI(A) \ if (A->ev != NULL) \ clReleaseEvent(A->ev); \ A->ev = ev; \ clRetainEvent(A->ev) static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { cl_ctx *ctx = A[0]->ctx; cl_event evl[3]; cl_event ev; size_t i; cl_uint num_ev = 0; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, beta, C[i]->buf, 
offC[i], ldc, 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); clReleaseEvent(ev); } return GA_NO_ERROR; } static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { cl_ctx *ctx = A[0]->ctx; cl_event evl[3]; cl_event ev; size_t i; cl_uint num_ev = 0; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); CLB_CHECK(ctx->err, clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); clReleaseEvent(ev); } return GA_NO_ERROR; } static int sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; clblasStatus err; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; gpudata *wbuf; wbuf = opencl_ops.buffer_alloc((gpucontext*)ctx, N*sizeof(float), NULL, GA_BUFFER_READ_WRITE); if (wbuf == NULL) return ctx->err->code; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(Z); // TODO: a thread-safe static buffer or allocator? err = clblasSdot( N, Z->buf, offZ, X->buf, offX, incX, Y->buf, offY, incY, wbuf->buf, 1, &ctx->q, num_ev, num_ev ? evl : NULL, &ev); opencl_ops.buffer_release(wbuf); if (err != clblasSuccess) return error_clblas(ctx->err, "clblasSdot", err); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(Z); clReleaseEvent(ev); return GA_NO_ERROR; } static int ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; clblasStatus err; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; gpudata *wbuf; wbuf = opencl_ops.buffer_alloc((gpucontext*)ctx, N*sizeof(double), NULL, GA_BUFFER_READ_WRITE); if (wbuf == NULL) return ctx->err->code; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(Z); err = clblasDdot( N, Z->buf, offZ, X->buf, offX, incX, Y->buf, offY, incY, wbuf->buf, 1, &ctx->q, num_ev, num_ev ? evl : NULL, &ev); opencl_ops.buffer_release(wbuf); if (err != clblasSuccess) return error_clblas(ctx->err, "clblasDdot", err); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(Z); clReleaseEvent(ev); return GA_NO_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(X); ARRAY_INIT(Y); CLB_CHECK(ctx->err, clblasSgemv(convO(order), convT(transA), M, N, alpha, A->buf, offA, lda, X->buf, offX, incX, beta, Y->buf, offY, incY, 1, &ctx->q, num_ev, num_ev == 0 ? 
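/*
 * Ordering in this backend is done with OpenCL events rather than a
 * stream: ARRAY_INIT gathers each buffer's last event into the wait list
 * passed to clBLAS, and ARRAY_FINI replaces it with the event returned by
 * the newly enqueued command (one retain per buffer, plus one release of
 * the enqueue's own reference afterwards).  Compact sketch of that
 * per-buffer hand-off; the struct and helper names are illustrative.
 */
#if 0  /* illustrative sketch, not part of the library sources */
#include <CL/cl.h>

/* `last` is the event of the most recent command that touched `buf`. */
struct tracked_buf { cl_mem buf; cl_event last; };

/* Append the buffer's pending event (if any) to a wait list. */
static cl_uint collect_wait(struct tracked_buf *b, cl_event *evl, cl_uint n) {
  if (b->last != NULL)
    evl[n++] = b->last;
  return n;
}

/* After enqueueing a command that produced `ev`, make it the buffer's new
 * "last" event, dropping the reference to the previous one. */
static void hand_off(struct tracked_buf *b, cl_event ev) {
  if (b->last != NULL)
    clReleaseEvent(b->last);
  b->last = ev;
  clRetainEvent(b->last);
}
#endif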
NULL : evl, &ev)); ARRAY_FINI(A); ARRAY_FINI(X); ARRAY_FINI(Y); clReleaseEvent(ev); return GA_NO_ERROR; } static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(X); ARRAY_INIT(Y); CLB_CHECK(ctx->err, clblasDgemv(convO(order), convT(transA), M, N, alpha, A->buf, offA, lda, X->buf, offX, incX, beta, Y->buf, offY, incY, 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A); ARRAY_FINI(X); ARRAY_FINI(Y); clReleaseEvent(ev); return GA_NO_ERROR; } static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(B); ARRAY_INIT(C); CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A->buf, offA, lda, B->buf, offB, ldb, beta, C->buf, offC, ldc, 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); ARRAY_FINI(C); clReleaseEvent(ev); return GA_NO_ERROR; } static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(B); ARRAY_INIT(C); CLB_CHECK(ctx->err, clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A->buf, offA, lda, B->buf, offB, ldb, beta, C->buf, offC, ldc, 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); ARRAY_FINI(C); clReleaseEvent(ev); return GA_NO_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event evl[3]; cl_event ev; cl_uint num_ev = 0; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(A); CLB_CHECK(ctx->err, clblasSger(convO(order), M, N, alpha, X->buf, offX, incX, Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(A); clReleaseEvent(ev); return GA_NO_ERROR; } static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event evl[3]; cl_event ev; cl_uint num_ev = 0; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(A); CLB_CHECK(ctx->err, clblasDger(convO(order), M, N, alpha, X->buf, offX, incX, Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, num_ev, num_ev == 0 ? 
NULL : evl, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(A); clReleaseEvent(ev); return GA_NO_ERROR; } gpuarray_blas_ops clblas_ops = { setup, teardown, NULL, /* hdot */ sdot, ddot, NULL, /* hgemv */ sgemv, dgemv, NULL, /* hgemm */ sgemm, dgemm, NULL, /* hger */ sger, dger, NULL, /* hgemmBatch */ sgemmBatch, dgemmBatch, NULL, /* hgemvBatch */ NULL, /* sgemvBatch */ NULL, /* dgemvBatch */ NULL, /* hgerBatch */ NULL, /* sgerBatch */ NULL, /* dgerBatch */ NULL, /* hgemm3D */ NULL, /* sgemm3D */ NULL, /* dgemm3D */ }; libgpuarray-0.7.6/src/gpuarray_blas_opencl_clblast.c000066400000000000000000000356321326743622600227230ustar00rootroot00000000000000#include "private.h" #include "private_opencl.h" #include "loaders/libclblast.h" #include "gpuarray/buffer_blas.h" #include "gpuarray/error.h" static inline Layout convO(cb_order order) { switch (order) { case cb_row: return kRowMajor; case cb_column: return kColMajor; default: return -1; } } static inline Transpose convT(cb_transpose trans) { switch (trans) { case cb_no_trans: return kNo; case cb_trans: return kYes; case cb_conj_trans: return kConjugate; default: return -1; } } static const char *estr(CLBlastStatusCode err) { if (err > -1024) return cl_error_string((cl_int)err); switch (err) { case CLBlastNotImplemented: return "Unimplemented feature"; case CLBlastInvalidMatrixA: return "matrix A is not a valid memory object"; case CLBlastInvalidMatrixB: return "matrix B is not a valid memory object"; case CLBlastInvalidMatrixC: return "matrix C is not a valid memory object"; case CLBlastInvalidVectorX: return "vector X is not a valid memory object"; case CLBlastInvalidVectorY: return "vector Y is not a valid memory object"; case CLBlastInvalidDimension: return "An input dimension (M, N, K) is invalid"; case CLBlastInvalidLeadDimA: return "leading dimension for A must not be less than the size of the first dimension"; case CLBlastInvalidLeadDimB: return "leading dimension for B must not be less than the size of the second dimension"; case CLBlastInvalidLeadDimC: return "leading dimension for C must not be less than the size of the third dimension"; case CLBlastInvalidIncrementX: return "increment for X must not be 0"; case CLBlastInvalidIncrementY: return "increment for Y must not be 0"; case CLBlastInsufficientMemoryA: return "memory object for matrix A is too small"; case CLBlastInsufficientMemoryB: return "memory object for matrix B is too small"; case CLBlastInsufficientMemoryC: return "memory object for matrix C is too small"; case CLBlastInsufficientMemoryX: return "memory object for vector X is too small"; case CLBlastInsufficientMemoryY: return "memory object for vector Y is too small"; case CLBlastInvalidLocalMemUsage: return "not enough local memory on the device"; case CLBlastNoHalfPrecision: return "float16 is not supported on this device"; case CLBlastNoDoublePrecision: return "float64 is not supported on this device"; case CLBlastInvalidVectorScalar: return "unit-sized vector is not a valid memory object"; case CLBlastInsufficientMemoryScalar: return "memory object for unit-sized vector is too small"; case CLBlastDatabaseError: return "device entry not in database"; case CLBlastUnknownError: return "Unspecified error"; case CLBlastUnexpectedError: return "Unexpected error"; default: return "Unknow error"; } } static inline int error_clblast(error *e, const char *msg, CLBlastStatusCode err) { return error_fmt(e, GA_BLAS_ERROR, "%s: %s", msg, estr(err)); } #define CLBT_CHECK(e, cmd) do { \ CLBlastStatusCode err = (cmd); \ if (err != 
kSuccess) \ return error_clblast(e, #cmd, err); \ } while (0) static int setup(gpucontext *ctx) { return GA_NO_ERROR; } static void teardown(gpucontext *ctx) { } #define ARRAY_INIT(A) \ if (A->ev != NULL) \ clWaitForEvents(1, &A->ev) #define ARRAY_FINI(A) \ if (A->ev != NULL) \ clReleaseEvent(A->ev); \ A->ev = ev; \ clRetainEvent(A->ev) static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { cl_ctx *ctx = A[0]->ctx; cl_event ev; size_t i; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); CLBT_CHECK(ctx->err, CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, float_to_half(alpha), A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, float_to_half(beta), C[i]->buf, offC[i], ldc, &ctx->q, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); clReleaseEvent(ev); } return GA_NO_ERROR; } static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { cl_ctx *ctx = A[0]->ctx; cl_event ev; size_t i; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); CLBT_CHECK(ctx->err, CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); clReleaseEvent(ev); } return GA_NO_ERROR; } static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { cl_ctx *ctx = A[0]->ctx; cl_event ev; size_t i; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); CLBT_CHECK(ctx->err, CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); clReleaseEvent(ev); } return GA_NO_ERROR; } static int hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; cl_event ev; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(Z); CLBT_CHECK(ctx->err, CLBlastHdot(N, Z->buf, offZ, X->buf, offX, incX, Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(Z); clReleaseEvent(ev); return GA_NO_ERROR; } static int sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; cl_event ev; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(Z); CLBT_CHECK(ctx->err, CLBlastSdot(N, Z->buf, offZ, X->buf, offX, incX, Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(Z); clReleaseEvent(ev); return GA_NO_ERROR; } static int ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; cl_event ev; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(Z); CLBT_CHECK(ctx->err, CLBlastDdot(N, Z->buf, offZ, X->buf, offX, incX, 
Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(Z); clReleaseEvent(ev); return GA_NO_ERROR; } static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(X); ARRAY_INIT(Y); CLBT_CHECK(ctx->err, CLBlastHgemv(convO(order), convT(transA), M, N, float_to_half(alpha), A->buf, offA, lda, X->buf, offX, incX, float_to_half(beta), Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(X); ARRAY_FINI(Y); clReleaseEvent(ev); return GA_NO_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(X); ARRAY_INIT(Y); CLBT_CHECK(ctx->err, CLBlastSgemv(convO(order), convT(transA), M, N, alpha, A->buf, offA, lda, X->buf, offX, incX, beta, Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(X); ARRAY_FINI(Y); clReleaseEvent(ev); return GA_NO_ERROR; } static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(X); ARRAY_INIT(Y); CLBT_CHECK(ctx->err, CLBlastDgemv(convO(order), convT(transA), M, N, alpha, A->buf, offA, lda, X->buf, offX, incX, beta, Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(X); ARRAY_FINI(Y); clReleaseEvent(ev); return GA_NO_ERROR; } static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(B); ARRAY_INIT(C); CLBT_CHECK(ctx->err, CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, float_to_half(alpha), A->buf, offA, lda, B->buf, offB, ldb, float_to_half(beta), C->buf, offC, ldc, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); ARRAY_FINI(C); clReleaseEvent(ev); return GA_NO_ERROR; } static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(B); ARRAY_INIT(C); CLBT_CHECK(ctx->err, CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A->buf, offA, lda, B->buf, offB, ldb, beta, C->buf, offC, ldc, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); ARRAY_FINI(C); clReleaseEvent(ev); return GA_NO_ERROR; } static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(B); ARRAY_INIT(C); CLBT_CHECK(ctx->err, CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A->buf, offA, lda, B->buf, offB, ldb, beta, C->buf, offC, ldc, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); ARRAY_FINI(C); clReleaseEvent(ev); return GA_NO_ERROR; } static int 
hger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(A); CLBT_CHECK(ctx->err, CLBlastHger(convO(order), M, N, float_to_half(alpha), X->buf, offX, incX, Y->buf, offY, incY, A->buf, offA, lda, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(A); clReleaseEvent(ev); return GA_NO_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(A); CLBT_CHECK(ctx->err, CLBlastSger(convO(order), M, N, alpha, X->buf, offX, incX, Y->buf, offY, incY, A->buf, offA, lda, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(A); clReleaseEvent(ev); return GA_NO_ERROR; } static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(A); CLBT_CHECK(ctx->err, CLBlastDger(convO(order), M, N, alpha, X->buf, offX, incX, Y->buf, offY, incY, A->buf, offA, lda, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(A); clReleaseEvent(ev); return GA_NO_ERROR; } gpuarray_blas_ops clblast_ops = { setup, teardown, hdot, sdot, ddot, hgemv, sgemv, dgemv, hgemm, sgemm, dgemm, hger, sger, dger, hgemmBatch, sgemmBatch, dgemmBatch, NULL, /* hgemvBatch */ NULL, /* sgemvBatch */ NULL, /* dgemvBatch */ NULL, /* hgerBatch */ NULL, /* sgerBatch */ NULL, /* dgerBatch */ NULL, /* hgemm3D */ NULL, /* sgemm3D */ NULL, /* dgemm3D */ }; libgpuarray-0.7.6/src/gpuarray_buffer.c000066400000000000000000000175021326743622600202030ustar00rootroot00000000000000#include #include #include #include "gpuarray/buffer.h" #include "gpuarray/buffer_collectives.h" #include "gpuarray/error.h" #include "util/error.h" #include "private.h" extern const gpuarray_buffer_ops cuda_ops; extern const gpuarray_buffer_ops opencl_ops; const gpuarray_buffer_ops *gpuarray_get_ops(const char *name) { if (strcmp("cuda", name) == 0) return &cuda_ops; if (strcmp("opencl", name) == 0) return &opencl_ops; return NULL; } #define FAIL(v, e) { if (ret) *ret = (e)->code; return v; } int gpu_get_platform_count(const char* name, unsigned int* platcount) { const gpuarray_buffer_ops* ops = gpuarray_get_ops(name); if (ops == NULL) { return error_set(global_err, GA_INVALID_ERROR, "Invalid platform"); } return ops->get_platform_count(platcount); } int gpu_get_device_count(const char* name, unsigned int platform, unsigned int* devcount) { const gpuarray_buffer_ops* ops = gpuarray_get_ops(name); if (ops == NULL) { return error_set(global_err, GA_INVALID_ERROR, "Invalid platform"); } return ops->get_device_count(platform, devcount); } int gpucontext_props_new(gpucontext_props **res) { gpucontext_props *r = calloc(1, sizeof(gpucontext_props)); if (r == NULL) return error_sys(global_err, "calloc"); r->dev = -1; r->sched = GA_CTX_SCHED_AUTO; r->flags = 0; r->kernel_cache_path = NULL; r->initial_cache_size = 0; r->max_cache_size = (size_t)-1; *res = r; return GA_NO_ERROR; } int gpucontext_props_cuda_dev(gpucontext_props *p, int devno) { p->dev = devno; return GA_NO_ERROR; } int gpucontext_props_opencl_dev(gpucontext_props *p, int platno, int devno) { p->dev = (platno << 16) | devno; return GA_NO_ERROR; } int 
gpucontext_props_sched(gpucontext_props *p, int sched) { switch (sched) { case GA_CTX_SCHED_MULTI: case GA_CTX_SCHED_AUTO: case GA_CTX_SCHED_SINGLE: p->sched = sched; break; default: return error_fmt(global_err, GA_INVALID_ERROR, "Invalid value for sched: %d", sched); } if (sched == GA_CTX_SCHED_MULTI) FLSET(p->flags, GA_CTX_MULTI_THREAD); else FLCLR(p->flags, GA_CTX_MULTI_THREAD); return GA_NO_ERROR; } int gpucontext_props_set_single_stream(gpucontext_props *p) { p->flags |= GA_CTX_SINGLE_STREAM; return GA_NO_ERROR; } int gpucontext_props_kernel_cache(gpucontext_props *p, const char *path) { p->kernel_cache_path = path; return GA_NO_ERROR; } int gpucontext_props_alloc_cache(gpucontext_props *p, size_t initial, size_t max) { if (initial > max) return error_set(global_err, GA_VALUE_ERROR, "Initial size can't be bigger than max size"); p->initial_cache_size = initial; p->max_cache_size = max; return GA_NO_ERROR; } void gpucontext_props_del(gpucontext_props *p) { free(p); } int gpucontext_init(gpucontext **res, const char *name, gpucontext_props *p) { const gpuarray_buffer_ops *ops = gpuarray_get_ops(name); gpucontext *r; if (ops == NULL) { gpucontext_props_del(p); return global_err->code; } if (p == NULL && gpucontext_props_new(&p) != GA_NO_ERROR) return global_err->code; r = ops->buffer_init(p); gpucontext_props_del(p); if (r == NULL) return global_err->code; r->ops = ops; r->extcopy_cache = NULL; *res = r; return GA_NO_ERROR; } void gpucontext_deref(gpucontext *ctx) { if (ctx->blas_handle != NULL) ctx->blas_ops->teardown(ctx); if (ctx->extcopy_cache != NULL) { cache_destroy(ctx->extcopy_cache); ctx->extcopy_cache = NULL; } ctx->ops->buffer_deinit(ctx); } int gpucontext_property(gpucontext *ctx, int prop_id, void *res) { return ctx->ops->property(ctx, NULL, NULL, prop_id, res); } const char *gpucontext_error(gpucontext *ctx, int err) { if (ctx == NULL) return global_err->msg; else return ctx->ops->ctx_error(ctx); } gpudata *gpudata_alloc(gpucontext *ctx, size_t sz, void *data, int flags, int *ret) { gpudata *res = ctx->ops->buffer_alloc(ctx, sz, data, flags); if (res == NULL && ret) *ret = ctx->err->code; return res; } void gpudata_retain(gpudata *b) { ((partial_gpudata *)b)->ctx->ops->buffer_retain(b); } void gpudata_release(gpudata *b) { if (b) ((partial_gpudata *)b)->ctx->ops->buffer_release(b); } int gpudata_share(gpudata *a, gpudata *b, int *ret) { int res = ((partial_gpudata *)a)->ctx->ops->buffer_share(a, b); if (res == -1 && ret) *ret = ((partial_gpudata *)a)->ctx->err->code; return res; } int gpudata_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz) { return ((partial_gpudata *)src)->ctx->ops->buffer_move(dst, dstoff, src, srcoff, sz); } int gpudata_transfer(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz) { gpucontext *src_ctx; gpucontext *dst_ctx; void *tmp; int res; src_ctx = ((partial_gpudata *)src)->ctx; dst_ctx = ((partial_gpudata *)dst)->ctx; if (src_ctx == dst_ctx) return src_ctx->ops->buffer_move(dst, dstoff, src, srcoff, sz); if (src_ctx->ops == dst_ctx->ops) { res = src_ctx->ops->buffer_transfer(dst, dstoff, src, srcoff, sz); if (res == GA_NO_ERROR) return res; } /* Fallback to host copy */ tmp = malloc(sz); if (tmp == NULL) { error_sys(src_ctx->err, "malloc"); return error_sys(dst_ctx->err, "malloc"); } res = src_ctx->ops->buffer_read(tmp, src, srcoff, sz); if (res != GA_NO_ERROR) { free(tmp); return res; } res = dst_ctx->ops->buffer_write(dst, dstoff, tmp, sz); free(tmp); return res; } int gpudata_read(void *dst, gpudata 
*src, size_t srcoff, size_t sz) { return ((partial_gpudata *)src)->ctx->ops->buffer_read(dst, src, srcoff, sz); } int gpudata_write(gpudata *dst, size_t dstoff, const void *src, size_t sz) { return ((partial_gpudata *)dst)->ctx->ops->buffer_write(dst, dstoff, src, sz); } int gpudata_memset(gpudata *dst, size_t dstoff, int data) { return ((partial_gpudata *)dst)->ctx->ops->buffer_memset(dst, dstoff, data); } int gpudata_sync(gpudata *b) { return ((partial_gpudata *)b)->ctx->ops->buffer_sync(b); } int gpudata_property(gpudata *b, int prop_id, void *res) { return ((partial_gpudata *)b)->ctx->ops->property(NULL, b, NULL, prop_id, res); } gpukernel *gpukernel_init(gpucontext *ctx, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int numargs, const int *typecodes, int flags, int *ret, char **err_str) { gpukernel *res = NULL; int err; err = ctx->ops->kernel_alloc(&res, ctx, count, strings, lengths, fname, numargs, typecodes, flags, err_str); if (err != GA_NO_ERROR && ret != NULL) *ret = ctx->err->code; return res; } void gpukernel_retain(gpukernel *k) { ((partial_gpukernel *)k)->ctx->ops->kernel_retain(k); } void gpukernel_release(gpukernel *k) { ((partial_gpukernel *)k)->ctx->ops->kernel_release(k); } int gpukernel_setarg(gpukernel *k, unsigned int i, void *a) { return ((partial_gpukernel *)k)->ctx->ops->kernel_setarg(k, i, a); } int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args) { return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, gs, ls, shared, args); } int gpukernel_property(gpukernel *k, int prop_id, void *res) { return ((partial_gpukernel *)k)->ctx->ops->property(NULL, NULL, k, prop_id, res); } gpucontext *gpudata_context(gpudata *b) { return ((partial_gpudata *)b)->ctx; } gpucontext *gpukernel_context(gpukernel *k) { return ((partial_gpukernel *)k)->ctx; } libgpuarray-0.7.6/src/gpuarray_buffer_blas.c000066400000000000000000000320731326743622600212040ustar00rootroot00000000000000#include "private.h" #include int gpublas_setup(gpucontext *ctx) { if (ctx->blas_ops == NULL) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Missing Blas library"); return ctx->blas_ops->setup(ctx); } void gpublas_teardown(gpucontext *ctx) { if (ctx->blas_ops != NULL) ctx->blas_ops->teardown(ctx); } const char *gpublas_error(gpucontext *ctx) { return ctx->err->msg; } #define BLAS_OP(buf, name, args) \ gpucontext *ctx = gpudata_context(buf); \ if (ctx->blas_ops->name) \ return ctx->blas_ops->name args; \ else \ return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by device or missing library: %s", #name) #define BLAS_OPF(buf, name, args) \ gpucontext *ctx = gpudata_context(buf); \ if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); \ if (ctx->blas_ops->name) \ return ctx->blas_ops->name args; \ else \ return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by device or missing library: %s", #name) int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { BLAS_OP(X, hdot, (N, X, offX, incX, Y, offY, incY, Z, offZ)); } int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { BLAS_OP(X, sdot, (N, X, offX, incX, Y, offY, incY, Z, offZ)); } int gpublas_ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { BLAS_OP(X, ddot, (N, X, 
offX, incX, Y, offY, incY, Z, offZ)); } int gpublas_hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { BLAS_OP(A, hgemv, (order, transA, M, N, alpha, A, offA, lda, X, offX, incX, beta, Y, offY, incY)); } int gpublas_sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { BLAS_OP(A, sgemv, (order, transA, M, N, alpha, A, offA, lda, X, offX, incX, beta, Y, offY, incY)); } int gpublas_dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { BLAS_OP(A, dgemv, (order, transA, M, N, alpha, A, offA, lda, X, offX, incX, beta, Y, offY, incY)); } int gpublas_hgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { BLAS_OP(A, hgemm, (order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc)); } int gpublas_sgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { BLAS_OP(A, sgemm, (order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc)); } int gpublas_dgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { BLAS_OP(A, dgemm, (order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc)); } int gpublas_hger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { BLAS_OP(X, hger, (order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda)); } int gpublas_sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { BLAS_OP(X, sger, (order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda)); } int gpublas_dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { BLAS_OP(X, dger, (order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda)); } #define BLAS_OPB(l, name, args) \ gpucontext *ctx; \ if (batchCount == 0) return GA_NO_ERROR; \ ctx = gpudata_context(l[0]); \ if (ctx->blas_ops->name) \ return ctx->blas_ops->name args; \ else \ return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by library in use: %s", #name) #define BLAS_OPBF(l, name, args) \ gpucontext *ctx; \ if (batchCount == 0) return GA_NO_ERROR; \ ctx = gpudata_context(l[0]); \ if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); \ if (ctx->blas_ops->name) \ return ctx->blas_ops->name args; \ else \ return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by library in use: %s", #name) int gpublas_hgemmBatch( cb_order order, 
cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount, int flags) { BLAS_OPBF(A, hgemmBatch, (order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, batchCount)); } int gpublas_sgemmBatch( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount, int flags) { BLAS_OPBF(A, sgemmBatch, (order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, batchCount)); } int gpublas_dgemmBatch( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount, int flags) { BLAS_OPBF(A, dgemmBatch, (order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, batchCount)); } int gpublas_hgemvBatch( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { BLAS_OPB(A, hgemvBatch, (order, transA, M, N, alpha, A, offA, lda, x, offX, incX, beta, y, offY, incY, batchCount, flags)); } int gpublas_sgemvBatch( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { BLAS_OPB(A, sgemvBatch, (order, transA, M, N, alpha, A, offA, lda, x, offX, incX, beta, y, offY, incY, batchCount, flags)); } int gpublas_dgemvBatch( cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { BLAS_OPB(A, dgemvBatch, (order, transA, M, N, alpha, A, offA, lda, x, offX, incX, beta, y, offY, incY, batchCount, flags)); } int gpublas_hgerBatch(cb_order order, size_t M, size_t N, float alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { BLAS_OPB(x, hgerBatch, (order, M, N, alpha, x, offX, incX, y, offY, incY, A, offA, lda, batchCount, flags)); } int gpublas_sgerBatch(cb_order order, size_t M, size_t N, float alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { BLAS_OPB(x, sgerBatch, (order, M, N, alpha, x, offX, incX, y, offY, incY, A, offA, lda, batchCount, flags)); } int gpublas_dgerBatch(cb_order order, size_t M, size_t N, double alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { BLAS_OPB(x, dgerBatch, (order, M, N, alpha, x, offX, incX, y, offY, incY, A, offA, lda, batchCount, flags)); } #define BLAS_OP3F(b, name, args) \ gpucontext *ctx; \ if (batchCount == 0) return GA_NO_ERROR; \ ctx = gpudata_context(b); \ if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is 
not 0"); \ if (ctx->blas_ops->name) \ return ctx->blas_ops->name args; \ else \ return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by library in use: %s", #name) int gpublas_hgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags) { BLAS_OP3F(A, hgemm3D, (order, transA, transB, M, N, K, alpha, A, offA, lda, strideA, B, offB, ldb, strideB, beta, C, offC, ldc, strideC, batchCount)); } int gpublas_sgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags) { BLAS_OP3F(A, sgemm3D, (order, transA, transB, M, N, K, alpha, A, offA, lda, strideA, B, offB, ldb, strideB, beta, C, offC, ldc, strideC, batchCount)); } int gpublas_dgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags) { BLAS_OP3F(A, dgemm3D, (order, transA, transB, M, N, K, alpha, A, offA, lda, strideA, B, offB, ldb, strideB, beta, C, offC, ldc, strideC, batchCount)); } libgpuarray-0.7.6/src/gpuarray_buffer_collectives.c000066400000000000000000000072321326743622600225760ustar00rootroot00000000000000#include "gpuarray/buffer.h" #include "gpuarray/buffer_collectives.h" #include "gpuarray/error.h" #include "private.h" int gpucomm_new(gpucomm** comm, gpucontext* ctx, gpucommCliqueId comm_id, int ndev, int rank) { if (ctx->comm_ops == NULL) { *comm = NULL; return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Collectives unavailable"); } return ctx->comm_ops->comm_new(comm, ctx, comm_id, ndev, rank); } void gpucomm_free(gpucomm* comm) { gpucontext* ctx; if (comm == NULL) return; ctx = gpucomm_context(comm); if (ctx->comm_ops != NULL) ctx->comm_ops->comm_free(comm); } const char* gpucomm_error(gpucontext* ctx) { return ctx->err->msg; } gpucontext* gpucomm_context(gpucomm* comm) { return ((partial_gpucomm*)comm)->ctx; } int gpucomm_gen_clique_id(gpucontext* ctx, gpucommCliqueId* comm_id) { if (ctx->comm_ops == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->generate_clique_id(ctx, comm_id); } int gpucomm_get_count(gpucomm* comm, int* gpucount) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->get_count(comm, gpucount); } int gpucomm_get_rank(gpucomm* comm, int* rank) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->get_rank(comm, rank); } int gpucomm_reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, int opcode, int root, gpucomm* comm) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->reduce(src, offsrc, dest, offdest, count, typecode, 
opcode, root, comm); } int gpucomm_all_reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm* comm) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->all_reduce(src, offsrc, dest, offdest, count, typecode, opcode, comm); } int gpucomm_reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm* comm) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->reduce_scatter(src, offsrc, dest, offdest, count, typecode, opcode, comm); } int gpucomm_broadcast(gpudata* array, size_t offset, size_t count, int typecode, int root, gpucomm* comm) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->broadcast(array, offset, count, typecode, root, comm); } int gpucomm_all_gather(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, gpucomm* comm) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->all_gather(src, offsrc, dest, offdest, count, typecode, comm); } libgpuarray-0.7.6/src/gpuarray_buffer_cuda.c000066400000000000000000001424401326743622600211770ustar00rootroot00000000000000#define _CRT_SECURE_NO_WARNINGS #include "private.h" #include "private_cuda.h" #include "loaders/libnvrtc.h" #include "loaders/libcublas.h" #include #include #include #include #include "util/strb.h" #include "util/xxhash.h" #include "gpuarray/buffer.h" #include "gpuarray/util.h" #include "gpuarray/error.h" #include "gpuarray/buffer_blas.h" #include "gpuarray/extension.h" #include "cluda_cuda.h.c" STATIC_ASSERT(DONTFREE == GPUARRAY_CUDA_CTX_NOFREE, cuda_nofree_eq); STATIC_ASSERT(CUDA_WAIT_READ == GPUARRAY_CUDA_WAIT_READ, cuda_wait_read_eq); STATIC_ASSERT(CUDA_WAIT_WRITE == GPUARRAY_CUDA_WAIT_WRITE, cuda_wait_write_eq); STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcmem_eq); /* Allocations will be made in blocks of at least this size */ #define BLOCK_SIZE (4 * 1024 * 1024) /* No returned allocations will be smaller than this size. Also, they * will be aligned to this size. * * Some libraries depend on this value and will crash if it's smaller. 
*/ #define FRAG_SIZE (64) extern gpuarray_blas_ops cublas_ops; extern gpuarray_comm_ops nccl_ops; const gpuarray_buffer_ops cuda_ops; static void cuda_freekernel(gpukernel *); static int cuda_property(gpucontext *, gpudata *, gpukernel *, int, void *); static int cuda_waits(gpudata *, int, CUstream); static int cuda_records(gpudata *, int, CUstream); static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags); static void cuda_free(gpudata *); static int detect_arch(const char *prefix, char *ret, error *e); static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); typedef struct _disk_key { uint8_t version; uint8_t debug; uint8_t major; uint8_t minor; uint32_t reserved; char bin_id[64]; strb src; } disk_key; typedef struct _kernel_key { const char *fname; strb src; } kernel_key; /* Size of the disk_key that we can memcopy to duplicate */ #define DISK_KEY_MM (sizeof(disk_key) - sizeof(strb)) static void disk_free(cache_key_t _k) { disk_key *k = (disk_key *)_k; strb_clear(&k->src); free(k); } static int strb_eq(strb *k1, strb *k2) { return (k1->l == k2->l && memcmp(k1->s, k2->s, k1->l) == 0); } static int kernel_eq(kernel_key *k1, kernel_key *k2) { return (strcmp(k1->fname, k2->fname) == 0 && strb_eq(&k1->src, &k2->src)); } static uint32_t kernel_hash(kernel_key *k) { XXH32_state_t state; XXH32_reset(&state, 42); XXH32_update(&state, k->fname, strlen(k->fname)); XXH32_update(&state, k->src.s, k->src.l); return XXH32_digest(&state); } static void kernel_free(kernel_key *k) { free((void *)k->fname); strb_clear(&k->src); free(k); } static int disk_eq(disk_key *k1, disk_key *k2) { return (memcmp(k1, k2, DISK_KEY_MM) == 0 && strb_eq(&k1->src, &k2->src)); } static int disk_hash(disk_key *k) { XXH32_state_t state; XXH32_reset(&state, 42); XXH32_update(&state, k, DISK_KEY_MM); XXH32_update(&state, k->src.s, k->src.l); return XXH32_digest(&state); } static int disk_write(strb *res, disk_key *k) { strb_appendn(res, (const char *)k, DISK_KEY_MM); strb_appendb(res, &k->src); return strb_error(res); } static disk_key *disk_read(const strb *b) { disk_key *k; if (b->l < DISK_KEY_MM) return NULL; k = calloc(1, sizeof(*k)); if (k == NULL) return NULL; memcpy(k, b->s, DISK_KEY_MM); if (k->version != 0) { free(k); return NULL; } if (strb_ensure(&k->src, b->l - DISK_KEY_MM) != 0) { strb_clear(&k->src); free(k); return NULL; } strb_appendn(&k->src, b->s + DISK_KEY_MM, b->l - DISK_KEY_MM); return k; } static int kernel_write(strb *res, strb *bin) { strb_appendb(res, bin); return strb_error(res); } static strb *kernel_read(const strb *b) { strb *res = strb_alloc(b->l); if (res != NULL) strb_appendb(res, b); return res; } static int setup_done = 0; static int major = -1; static int minor = -1; static int setup_lib(void) { CUresult err; int res, tmp; if (!setup_done) { res = load_libcuda(global_err); if (res != GA_NO_ERROR) return res; err = cuInit(0); if (err != CUDA_SUCCESS) return error_cuda(global_err, "cuInit", err); err = cuDriverGetVersion(&tmp); if (err != CUDA_SUCCESS) return error_set(global_err, GA_IMPL_ERROR, "cuDriverGetVersion failed"); major = tmp / 1000; minor = (tmp / 10) % 10; /* Let's try to load a nvrtc corresponding to detected CUDA version. */ res = load_libnvrtc(major, minor, global_err); if (res != GA_NO_ERROR) { /* Else, let's try to find a nvrtc corresponding to supported CUDA versions. 
*/ int versions[][2] = {{9, 1}, {9, 0}, {8, 0}, {7, 5}, {7, 0}}; int versions_length = sizeof(versions) / sizeof(versions[0]); int i = 0; /* Skip versions that are higher or equal to the driver version */ while (versions[i][0] > major || (versions[i][0] == major && versions[i][1] >= minor)) i++; do { major = versions[i][0]; minor = versions[i][1]; res = load_libnvrtc(major, minor, global_err); i++; } while (res != GA_NO_ERROR && i < versions_length); } if (res != GA_NO_ERROR) return res; setup_done = 1; } return GA_NO_ERROR; } static int cuda_get_platform_count(unsigned int* platcount) { *platcount = 1; // CUDA works on NVIDIA's GPUs return GA_NO_ERROR; } static int cuda_get_device_count(unsigned int platform, unsigned int* devcount) { CUresult err; int dv; // platform number gets ignored in CUDA implementation GA_CHECK(setup_lib()); err = cuDeviceGetCount(&dv); if (err != CUDA_SUCCESS) return error_cuda(global_err, "cuDeviceGetCount", err); *devcount = (unsigned int)dv; return GA_NO_ERROR; } cuda_context *cuda_make_ctx(CUcontext ctx, gpucontext_props *p) { cuda_context *res; cache *mem_cache; const char *cache_path; void *pp; CUdevice dev; CUresult err; int cc_major, cc_minor; int e; e = setup_lib(); if (e != GA_NO_ERROR) return NULL; err = cuCtxGetDevice(&dev); if (err != CUDA_SUCCESS) { error_cuda(global_err, "cuCtxGetDevice", err); return NULL; } e = get_cc(dev, &cc_major, &cc_minor, global_err); if (e != GA_NO_ERROR) return NULL; if ((major >= 9 && cc_major <= 2) || (major >= 7 && cc_major <= 1)) { error_set(global_err, GA_UNSUPPORTED_ERROR, "GPU is too old for CUDA version"); return NULL; } res = calloc(1, sizeof(*res)); if (res == NULL) { error_sys(global_err, "calloc"); return NULL; } res->ctx = ctx; res->ops = &cuda_ops; res->refcnt = 1; res->flags = p->flags; res->max_cache_size = p->max_cache_size; res->enter = 0; res->major = major; res->minor = minor; res->freeblocks = NULL; if (error_alloc(&res->err)) { error_set(global_err, GA_SYS_ERROR, "Could not create error context"); goto fail_errmsg; } if (detect_arch(ARCH_PREFIX, res->bin_id, global_err)) { goto fail_stream; } /* Don't add the nonblocking flags to help usage with other libraries that may do stuff on the NULL stream */ err = cuStreamCreate(&res->s, 0); if (err != CUDA_SUCCESS) { error_cuda(global_err, "cuStreamCreate", err); goto fail_stream; } if (ISSET(res->flags, GA_CTX_SINGLE_STREAM)) { res->mem_s = res->s; } else { /* Don't add the nonblocking flags to help usage with other libraries that may do stuff on the NULL stream */ err = cuStreamCreate(&res->mem_s, 0); if (err != CUDA_SUCCESS) { error_cuda(global_err, "cuStreamCreate", err); goto fail_mem_stream; } } res->kernel_cache = cache_twoq(64, 128, 64, 8, (cache_eq_fn)kernel_eq, (cache_hash_fn)kernel_hash, (cache_freek_fn)kernel_free, (cache_freev_fn)cuda_freekernel, global_err); if (res->kernel_cache == NULL) { error_cuda(global_err, "cuStreamCreate", err); goto fail_cache; } cache_path = p->kernel_cache_path; if (cache_path == NULL) cache_path = getenv("GPUARRAY_CACHE_PATH"); if (cache_path != NULL) { mem_cache = cache_lru(64, 8, (cache_eq_fn)disk_eq, (cache_hash_fn)disk_hash, (cache_freek_fn)disk_free, (cache_freev_fn)strb_free, global_err); if (mem_cache == NULL) { fprintf(stderr, "Error initializing mem cache for disk: %s\n", global_err->msg); goto fail_disk_cache; } res->disk_cache = cache_disk(cache_path, mem_cache, (kwrite_fn)disk_write, (vwrite_fn)kernel_write, (kread_fn)disk_read, (vread_fn)kernel_read, global_err); if (res->disk_cache == NULL) { 
fprintf(stderr, "Error initializing disk cache, disabling: %s\n", global_err->msg); cache_destroy(mem_cache); goto fail_disk_cache; } } else { fail_disk_cache: res->disk_cache = NULL; } err = cuMemAllocHost(&pp, 16); if (err != CUDA_SUCCESS) { error_cuda(global_err, "cuMemAllocHost", err); goto fail_errbuf; } memset(pp, 0, 16); /* Need to tag for new_gpudata */ TAG_CTX(res); res->errbuf = new_gpudata(res, (CUdeviceptr)pp, 16); if (res->errbuf == NULL) { /* Copy the error from the context since we are getting rid of it */ error_set(global_err, res->err->code, res->err->msg); goto fail_end; } res->errbuf->flags |= CUDA_MAPPED_PTR; /* Prime the cache */ if (p->initial_cache_size) { gpudata *tmp = cuda_alloc((gpucontext *)res, p->initial_cache_size, NULL, 0); if (tmp != NULL) cuda_free(tmp); } return res; fail_end: cuMemFreeHost(pp); fail_errbuf: if (res->disk_cache) cache_destroy(res->disk_cache); cache_destroy(res->kernel_cache); fail_cache: if (ISCLR(res->flags, GA_CTX_SINGLE_STREAM)) cuStreamDestroy(res->mem_s); fail_mem_stream: cuStreamDestroy(res->s); fail_stream: error_free(res->err); fail_errmsg: free(res); return NULL; } static void deallocate(gpudata *); static void cuda_free_ctx(cuda_context *ctx) { gpudata *next, *curr; CUdevice dev; ASSERT_CTX(ctx); ctx->refcnt--; if (ctx->refcnt == 0) { assert(ctx->enter == 0 && "Context was active when freed!"); if (ctx->blas_handle != NULL) { ctx->blas_ops->teardown((gpucontext *)ctx); } cuMemFreeHost((void *)ctx->errbuf->ptr); deallocate(ctx->errbuf); if (ISCLR(ctx->flags, GA_CTX_SINGLE_STREAM)) cuStreamDestroy(ctx->mem_s); cuStreamDestroy(ctx->s); /* Clear out the freelist */ for (curr = ctx->freeblocks; curr != NULL; curr = next) { next = curr->next; cuMemFree(curr->ptr); deallocate(curr); } cache_destroy(ctx->kernel_cache); if (ctx->disk_cache) cache_destroy(ctx->disk_cache); error_free(ctx->err); if (!(ctx->flags & DONTFREE)) { cuCtxPushCurrent(ctx->ctx); cuCtxGetDevice(&dev); cuCtxPopCurrent(NULL); cuDevicePrimaryCtxRelease(dev); } CLEAR(ctx); free(ctx); } } CUstream cuda_get_stream(cuda_context *ctx) { ASSERT_CTX(ctx); return ctx->s; } void cuda_enter(cuda_context *ctx) { ASSERT_CTX(ctx); if (!ctx->enter) cuCtxPushCurrent(ctx->ctx); ctx->enter++; } void cuda_exit(cuda_context *ctx) { ASSERT_CTX(ctx); assert(ctx->enter > 0); ctx->enter--; if (!ctx->enter) cuCtxPopCurrent(NULL); } static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size) { gpudata *res; CUresult err; int fl = CU_EVENT_DISABLE_TIMING; res = malloc(sizeof(*res)); if (res == NULL) { error_sys(ctx->err, "malloc"); return NULL; } res->refcnt = 0; res->sz = size; res->flags = 0; res->ls = NULL; cuda_enter(ctx); if (ctx->flags & GA_CTX_MULTI_THREAD) fl |= CU_EVENT_BLOCKING_SYNC; err = cuEventCreate(&res->rev, fl); if (err != CUDA_SUCCESS) { error_cuda(ctx->err, "cuEventCreate", err); cuda_exit(ctx); free(res); return NULL; } err = cuEventCreate(&res->wev, fl); if (err != CUDA_SUCCESS) { error_cuda(ctx->err, "cuEventCreate", err); cuEventDestroy(res->rev); cuda_exit(ctx); free(res); return NULL; } cuda_exit(ctx); res->ptr = ptr; res->next = NULL; res->ctx = ctx; TAG_BUF(res); return res; } gpudata *cuda_make_buf(cuda_context *ctx, CUdeviceptr p, size_t sz) { gpudata *res = new_gpudata(ctx, p, sz); if (res == NULL) return NULL; res->refcnt = 1; res->flags |= DONTFREE; res->ctx->refcnt++; return res; } size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; } #define CHKFAIL(e, n, v) \ if (err != CUDA_SUCCESS) { \ error_cuda(e, n, err); \ return v; \ } 
static cuda_context *do_init(CUdevice dev, gpucontext_props *p, error *e) { cuda_context *res; CUcontext ctx; CUresult err; unsigned int fl = 0; unsigned int cur_fl; int act; int i; switch (p->sched) { case GA_CTX_SCHED_AUTO: fl = CU_CTX_SCHED_AUTO; break; case GA_CTX_SCHED_SINGLE: fl = CU_CTX_SCHED_SPIN; break; case GA_CTX_SCHED_MULTI: fl = CU_CTX_SCHED_BLOCKING_SYNC; break; } err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); CHKFAIL(e, "cuDeviceGetAttribute", NULL); if (i != 1) { error_set(e, GA_UNSUPPORTED_ERROR, "device does not support unified addressing"); return NULL; } err = cuDevicePrimaryCtxGetState(dev, &cur_fl, &act); CHKFAIL(e, "cuDevicePrimaryCtxGetState", NULL); if (act == 1) { if ((cur_fl & fl) != fl) { error_set(e, GA_INVALID_ERROR, "device is already active and has unsupported flags"); return NULL; } } else { err = cuDevicePrimaryCtxSetFlags(dev, fl); CHKFAIL(e, "cuDevicePrimaryCtxSetFlags", NULL); } err = cuDevicePrimaryCtxRetain(&ctx, dev); CHKFAIL(e, "cuDevicePrimaryCtxRetain", NULL); err = cuCtxPushCurrent(ctx); CHKFAIL(e, "cuCtxPushCurrent", NULL); res = cuda_make_ctx(ctx, p); if (res == NULL) { cuDevicePrimaryCtxRelease(dev); if (e != global_err) error_set(e, global_err->code, global_err->msg); return NULL; } res->blas_handle = NULL; /* If we can't load cublas, then we have no blas */ if (!load_libcublas(major, minor, res->err)) { res->blas_ops = &cublas_ops; } else { res->blas_ops = NULL; } res->comm_ops = &nccl_ops; /* Don't leave the context on the thread stack */ cuCtxPopCurrent(NULL); return res; } static gpucontext *cuda_init(gpucontext_props *p) { CUdevice dev; cuda_context *res; CUresult err; int r; r = setup_lib(); if (r != GA_NO_ERROR) { return NULL; } if (p->dev == -1) { int i, c; err = cuDeviceGetCount(&c); CHKFAIL(global_err, "cuDeviceGetCount", NULL); for (i = 0; i < c; i++) { err = cuDeviceGet(&dev, i); CHKFAIL(global_err, "cuDeviceGet", NULL); res = do_init(dev, p, global_err); if (res != NULL) return (gpucontext *)res; } error_set(global_err, GA_NODEV_ERROR, "No cuda device available"); return NULL; } else { err = cuDeviceGet(&dev, p->dev); CHKFAIL(global_err, "cuDeviceGet", NULL); return (gpucontext *)do_init(dev, p, global_err); } } static void cuda_deinit(gpucontext *c) { cuda_free_ctx((cuda_context *)c); } /* * Find the block in the free list that is the best fit for the size * we want, which means the smallest that can still fit the size. */ static void find_best(cuda_context *ctx, gpudata **best, gpudata **prev, size_t size) { gpudata *temp, *tempPrev = NULL; *best = NULL; for (temp = ctx->freeblocks; temp; temp = temp->next) { if (temp->sz >= size && (!*best || temp->sz < (*best)->sz)) { *best = temp; *prev = tempPrev; } tempPrev = temp; } } static size_t largest_size(cuda_context *ctx) { gpudata *temp; size_t sz, dummy; cuda_enter(ctx); cuMemGetInfo(&sz, &dummy); cuda_exit(ctx); /* We guess that we can allocate at least a quarter of the free size in a single block. This might be wrong though. */ sz /= 4; for (temp = ctx->freeblocks; temp; temp = temp->next) { if (temp->sz > sz) sz = temp->sz; } return sz; } /* * Allocate a new block and place in on the freelist. Will allocate * the bigger of the requested size and BLOCK_SIZE to avoid allocating * multiple small blocks. 
*/ static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev, size_t size) { CUdeviceptr ptr; gpudata *next; CUresult err; *prev = NULL; if (ctx->max_cache_size != 0) { if (size < BLOCK_SIZE) size = BLOCK_SIZE; if (ctx->cache_size + size > ctx->max_cache_size) return error_set(ctx->err, GA_VALUE_ERROR, "Maximum cache size reached"); } cuda_enter(ctx); err = cuMemAlloc(&ptr, size); if (err != CUDA_SUCCESS) { cuda_exit(ctx); return error_cuda(ctx->err, "cuMemAlloc", err); } *res = new_gpudata(ctx, ptr, size); cuda_exit(ctx); if (*res == NULL) { cuMemFree(ptr); return ctx->err->code; } ctx->cache_size += size; (*res)->flags |= CUDA_HEAD_ALLOC; /* Now that the block is allocated, enter it in the freelist */ next = ctx->freeblocks; for (; next && next->ptr < (*res)->ptr; next = next->next) { *prev = next; } (*res)->next = next; if (*prev) (*prev)->next = *res; else ctx->freeblocks = *res; return GA_NO_ERROR; } /* * Extract the `curr` block from the freelist, possibly splitting it * if it's too big for the requested size. The remaining block will * stay on the freelist if there is a split. `prev` is only to * facilitate the extraction so we don't have to go through the list * again. */ static int extract(gpudata *curr, gpudata *prev, size_t size) { gpudata *next, *split; size_t remaining = curr->sz - size; if (remaining < FRAG_SIZE) { /* No need to split, the remaining block would be too small */ next = curr->next; } else { split = new_gpudata(curr->ctx, curr->ptr + size, remaining); if (split == NULL) return curr->ctx->err->code; /* Make sure the chain keeps going */ split->next = curr->next; curr->next = NULL; /* Make sure we don't start using the split buffer too soon */ cuda_records(split, CUDA_WAIT_ALL, curr->ls); next = split; curr->sz = size; } if (prev != NULL) prev->next = next; else curr->ctx->freeblocks = next; return GA_NO_ERROR; } static int cuda_write(gpudata *dst, size_t dstoff, const void *src, size_t sz); static inline size_t roundup(size_t s, size_t m) { return ((s + (m - 1)) / m) * m; } static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags) { gpudata *res = NULL, *prev = NULL; cuda_context *ctx = (cuda_context *)c; size_t asize; if (size == 0) size = 1; if ((flags & GA_BUFFER_INIT) && data == NULL) { error_set(ctx->err, GA_VALUE_ERROR, "Requested buffer initialisation but no data given"); return NULL; } if ((flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) == (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) { error_set(ctx->err, GA_VALUE_ERROR, "Invalid flags combinaison WRITE_ONLY and READ_ONLY"); return NULL; } /* TODO: figure out how to make this work */ if (flags & GA_BUFFER_HOST) { error_set(ctx->err, GA_DEVSUP_ERROR, "Host mapped allocations are not supported yet"); return NULL; } /* We don't want to manage really small allocations so we round up * to a multiple of FRAG_SIZE. This also ensures that if we split a * block, the next block starts properly aligned for any data type. 
*/ if (ctx->max_cache_size != 0) { asize = roundup(size, FRAG_SIZE); find_best(ctx, &res, &prev, asize); } else { asize = size; } if (res == NULL && allocate(ctx, &res, &prev, asize) != GA_NO_ERROR) return NULL; if (extract(res, prev, asize) != GA_NO_ERROR) return NULL; /* It's out of the freelist, so add a ref */ res->ctx->refcnt++; /* We consider this buffer allocated and ready to go */ res->refcnt = 1; if (flags & GA_BUFFER_INIT) { if (cuda_write(res, 0, data, size) != GA_NO_ERROR) { cuda_free(res); return NULL; } } return res; } int cuda_get_ipc_handle(gpudata *d, GpuArrayIpcMemHandle *h) { ASSERT_BUF(d); cuda_enter(d->ctx); CUDA_EXIT_ON_ERROR(d->ctx, cuIpcGetMemHandle((CUipcMemHandle *)h, d->ptr)); cuda_exit(d->ctx); return GA_NO_ERROR; } gpudata *cuda_open_ipc_handle(gpucontext *c, GpuArrayIpcMemHandle *h, size_t sz) { CUdeviceptr p; cuda_context *ctx = (cuda_context *)c; gpudata *d = NULL; CUresult err; cuda_enter(ctx); err = cuIpcOpenMemHandle(&p, *((CUipcMemHandle *)h), CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); if (err != CUDA_SUCCESS) { cuda_exit(ctx); error_cuda(ctx->err, "cuIpcOpenMemHandle", err); return NULL; } d = cuda_make_buf(ctx, p, sz); if (d != NULL) d->flags |= CUDA_IPC_MEMORY; return d; } static void cuda_retain(gpudata *d) { ASSERT_BUF(d); d->refcnt++; } static void deallocate(gpudata *d) { cuda_enter(d->ctx); cuEventDestroy(d->rev); cuEventDestroy(d->wev); cuda_exit(d->ctx); CLEAR(d); free(d); } static void cuda_free(gpudata *d) { /* We ignore errors on free */ ASSERT_BUF(d); d->refcnt--; if (d->refcnt == 0) { /* Keep a reference to the context since we deallocate the gpudata * object */ cuda_context *ctx = d->ctx; if (d->flags & DONTFREE) { /* This is the path for "external" buffers */ deallocate(d); } else if (d->flags & CUDA_IPC_MEMORY) { cuIpcCloseMemHandle(d->ptr); deallocate(d); } else if (ctx->max_cache_size == 0) { /* Just free the pointer */ cuMemFree(d->ptr); deallocate(d); } else { /* Find the position in the freelist. Freelist is kept in order of allocation address */ gpudata *next = d->ctx->freeblocks, *prev = NULL; for (; next && next->ptr < d->ptr; next = next->next) { prev = next; } next = prev != NULL ? prev->next : d->ctx->freeblocks; /* See if we can merge the block with the previous one */ if (!(d->flags & CUDA_HEAD_ALLOC) && prev != NULL && prev->ptr + prev->sz == d->ptr) { prev->sz = prev->sz + d->sz; cuda_waits(d, CUDA_WAIT_ALL, prev->ls); cuda_records(prev, CUDA_WAIT_ALL, prev->ls); deallocate(d); d = prev; } else if (prev != NULL) { prev->next = d; } else { d->ctx->freeblocks = d; } /* See if we can merge with next */ if (next && !(next->flags & CUDA_HEAD_ALLOC) && d->ptr + d->sz == next->ptr) { d->sz = d->sz + next->sz; d->next = next->next; cuda_waits(next, CUDA_WAIT_ALL, d->ls); cuda_records(d, CUDA_WAIT_ALL, d->ls); deallocate(next); } else { d->next = next; } } /* We keep this at the end since the freed buffer could be the * last reference to the context and therefore clearing the * reference could trigger the freeing if the whole context * including the freelist, which we manipulate. 
*/ cuda_free_ctx(ctx); } } static int cuda_share(gpudata *a, gpudata *b) { ASSERT_BUF(a); ASSERT_BUF(b); return (a->ctx == b->ctx && a->sz != 0 && b->sz != 0 && ((a->ptr <= b->ptr && a->ptr + a->sz > b->ptr) || (b->ptr <= a->ptr && b->ptr + b->sz > a->ptr))); } static int cuda_waits(gpudata *a, int flags, CUstream s) { ASSERT_BUF(a); /* Never skip the wait if CUDA_WAIT_FORCE */ if (ISCLR(flags, CUDA_WAIT_FORCE)) { if (ISSET(a->ctx->flags, GA_CTX_SINGLE_STREAM)) return GA_NO_ERROR; /* If the last stream to touch this buffer is the same, we don't * need to wait for anything. */ if (a->ls == s) return GA_NO_ERROR; } cuda_enter(a->ctx); /* We wait for writes that happened before since multiple reads at * the same time are fine */ if (ISSET(flags, CUDA_WAIT_READ) || ISSET(flags, CUDA_WAIT_WRITE)) CUDA_EXIT_ON_ERROR(a->ctx, cuStreamWaitEvent(s, a->wev, 0)); /* Make sure to not disturb previous reads */ if (ISSET(flags, CUDA_WAIT_WRITE)) CUDA_EXIT_ON_ERROR(a->ctx, cuStreamWaitEvent(s, a->rev, 0)); cuda_exit(a->ctx); return GA_NO_ERROR; } int cuda_wait(gpudata *a, int flags) { return cuda_waits(a, flags, a->ctx->s); } static int cuda_records(gpudata *a, int flags, CUstream s) { ASSERT_BUF(a); if (ISCLR(flags, CUDA_WAIT_FORCE) && ISSET(a->ctx->flags, GA_CTX_SINGLE_STREAM)) return GA_NO_ERROR; cuda_enter(a->ctx); if (ISSET(flags, CUDA_WAIT_READ)) CUDA_EXIT_ON_ERROR(a->ctx, cuEventRecord(a->rev, s)); if (ISSET(flags, CUDA_WAIT_WRITE)) CUDA_EXIT_ON_ERROR(a->ctx, cuEventRecord(a->wev, s)); cuda_exit(a->ctx); a->ls = s; return GA_NO_ERROR; } int cuda_record(gpudata *a, int flags) { return cuda_records(a, flags, a->ctx->s); } static int cuda_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz) { cuda_context *ctx = dst->ctx; int res = GA_NO_ERROR; ASSERT_BUF(dst); ASSERT_BUF(src); if (src->ctx != dst->ctx) return error_set(ctx->err, GA_VALUE_ERROR, "Cannot move between contexts"); if (sz == 0) return GA_NO_ERROR; if ((dst->sz - dstoff) < sz) return error_set(ctx->err, GA_VALUE_ERROR, "Destination is smaller than requested transfer size"); if ((src->sz - srcoff) < sz) return error_set(ctx->err, GA_VALUE_ERROR, "Source is smaller than requested transfer size"); cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dst, CUDA_WAIT_WRITE)); CUDA_EXIT_ON_ERROR(ctx, cuMemcpyDtoDAsync(dst->ptr + dstoff, src->ptr + srcoff, sz, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dst, CUDA_WAIT_WRITE)); cuda_exit(ctx); return res; } static int cuda_read(void *dst, gpudata *src, size_t srcoff, size_t sz) { cuda_context *ctx = src->ctx; ASSERT_BUF(src); if (sz == 0) return GA_NO_ERROR; if ((src->sz - srcoff) < sz) return error_set(ctx->err, GA_VALUE_ERROR, "source is smaller than the read size"); cuda_enter(ctx); if (src->flags & CUDA_MAPPED_PTR) { if (ISSET(ctx->flags, GA_CTX_SINGLE_STREAM)) CUDA_EXIT_ON_ERROR(ctx, cuStreamSynchronize(ctx->s)); else CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(src->wev)); memcpy(dst, (void *)(src->ptr + srcoff), sz); } else { GA_CUDA_EXIT_ON_ERROR(ctx, cuda_waits(src, CUDA_WAIT_READ, ctx->mem_s)); CUDA_EXIT_ON_ERROR(ctx, cuMemcpyDtoHAsync(dst, src->ptr + srcoff, sz, ctx->mem_s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_records(src, CUDA_WAIT_READ, ctx->mem_s)); } cuda_exit(ctx); return GA_NO_ERROR; } static int cuda_write(gpudata *dst, size_t dstoff, const void *src, size_t sz) { cuda_context *ctx = dst->ctx; ASSERT_BUF(dst); if (sz == 0) return 
GA_NO_ERROR; if ((dst->sz - dstoff) < sz) return error_set(ctx->err, GA_VALUE_ERROR, "Destination is smaller than the write size"); cuda_enter(ctx); if (dst->flags & CUDA_MAPPED_PTR) { if (ISSET(ctx->flags, GA_CTX_SINGLE_STREAM)) CUDA_EXIT_ON_ERROR(ctx, cuStreamSynchronize(ctx->s)); else CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(dst->rev)); memcpy((void *)(dst->ptr + dstoff), src, sz); } else { GA_CUDA_EXIT_ON_ERROR(ctx, cuda_waits(dst, CUDA_WAIT_WRITE, ctx->mem_s)); CUDA_EXIT_ON_ERROR(ctx, cuMemcpyHtoDAsync(dst->ptr + dstoff, src, sz, ctx->mem_s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_records(dst, CUDA_WAIT_WRITE, ctx->mem_s)); } cuda_exit(ctx); return GA_NO_ERROR; } static int cuda_memset(gpudata *dst, size_t dstoff, int data) { cuda_context *ctx = dst->ctx; ASSERT_BUF(dst); if ((dst->sz - dstoff) == 0) return GA_NO_ERROR; cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dst, CUDA_WAIT_WRITE)); CUDA_EXIT_ON_ERROR(ctx, cuMemsetD8Async(dst->ptr + dstoff, data, dst->sz - dstoff, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dst, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; } int get_cc(CUdevice dev, int *maj, int *min, error *e) { CUresult err; err = cuDeviceGetAttribute(maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev); if (err != CUDA_SUCCESS) return error_cuda(e, "cuDeviceGetAttribute", err); err = cuDeviceGetAttribute(min, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); if (err != CUDA_SUCCESS) return error_cuda(e, "cuDeviceGetAttribute", err); return GA_NO_ERROR; } static int detect_arch(const char *prefix, char *ret, error *e) { CUdevice dev; CUresult err; int major, minor; int res; size_t sz = strlen(prefix) + 3; err = cuCtxGetDevice(&dev); if (err != CUDA_SUCCESS) return error_cuda(e, "cuCtxGetDevice", err); GA_CHECK(get_cc(dev, &major, &minor, e)); res = snprintf(ret, sz, "%s%d%d", prefix, major, minor); if (res == -1) return error_sys(e, "snprintf"); if (res > (ssize_t)sz) return error_set(e, GA_UNSUPPORTED_ERROR, "detect_arch: arch id is too large"); return GA_NO_ERROR; } static inline int error_nvrtc(error *e, const char *msg, nvrtcResult err) { return error_fmt(e, GA_IMPL_ERROR, "%s: %s", msg, nvrtcGetErrorString(err)); } static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { nvrtcProgram prog; size_t buflen; const char *heads[1] = {"cluda.h"}; const char *hsrc[1]; const char *opts[] = { "-arch", "" #ifdef DEBUG , "-G", "-lineinfo" #endif }; nvrtcResult err; opts[1] = ctx->bin_id; hsrc[0] = cluda_cuda_h; err = nvrtcCreateProgram(&prog, src->s, NULL, 1, hsrc, heads); if (err != NVRTC_SUCCESS) return error_nvrtc(ctx->err, "nvrtcCreateProgram", err); err = nvrtcCompileProgram(prog, sizeof(opts)/sizeof(char *), opts); /* Get the log before handling the error */ if (nvrtcGetProgramLogSize(prog, &buflen) == NVRTC_SUCCESS) { strb_appends(log, "NVRTC compile log::\n"); if (strb_ensure(log, buflen) == 0) if (nvrtcGetProgramLog(prog, log->s+log->l) == NVRTC_SUCCESS) log->l += buflen - 1; // Remove the final NUL strb_appendc(log, '\n'); } if (err != NVRTC_SUCCESS) { nvrtcDestroyProgram(&prog); #ifdef DEBUG strb_dump(src, stderr); strb_dump(log, stderr); #endif return error_nvrtc(ctx->err, "nvrtcCompileProgram", err); } err = nvrtcGetPTXSize(prog, &buflen); if (err != NVRTC_SUCCESS) { nvrtcDestroyProgram(&prog); return error_nvrtc(ctx->err, "nvrtcGetPTXSize", err); } if (strb_ensure(ptx, buflen) == 0) { err = nvrtcGetPTX(prog, ptx->s+ptx->l); if (err != NVRTC_SUCCESS) { nvrtcDestroyProgram(&prog); return error_nvrtc(ctx->err, "nvrtcGetPTX", 
err); } ptx->l += buflen; } return GA_NO_ERROR; } static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { char info_log[2048] = ""; char error_log[2048] = ""; void *out; size_t out_size; CUlinkState st; CUjit_option cujit_opts[] = { CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER, CU_JIT_LOG_VERBOSE, CU_JIT_GENERATE_DEBUG_INFO, CU_JIT_GENERATE_LINE_INFO, }; void *cujit_opt_vals[] = { (void *)sizeof(info_log), info_log, (void *)sizeof(error_log), error_log, #ifdef DEBUG (void *)1, (void *)1, (void *)1 #else (void *)0, (void *)0, (void *)0 #endif }; CUresult err; int res = GA_NO_ERROR; err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]), cujit_opts, cujit_opt_vals, &st); if (err != CUDA_SUCCESS) return error_cuda(ctx->err, "cuLinkCreate", err); err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l, "kernel code", 0, NULL, NULL); if (err != CUDA_SUCCESS) { res = error_cuda(ctx->err, "cuLinkAddData", err); goto out; } err = cuLinkComplete(st, &out, &out_size); if (err != CUDA_SUCCESS) { res = error_cuda(ctx->err, "cuLinkComplete", err); goto out; } strb_appendn(bin, out, out_size); out: cuLinkDestroy(st); strb_appends(log, "Link info log::\n"); strb_appends(log, info_log); strb_appends(log, "\nLink error log::\n"); strb_appends(log, error_log); strb_appendc(log, '\n'); return res; } static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { strb ptx = STRB_STATIC_INIT; strb *cbin; disk_key k; disk_key *pk; memset(&k, 0, sizeof(k)); k.version = 0; #ifdef DEBUG k.debug = 1; #endif k.major = ctx->major; k.minor = ctx->minor; memcpy(k.bin_id, ctx->bin_id, 64); memcpy(&k.src, src, sizeof(strb)); // Look up the binary in the disk cache if (ctx->disk_cache) { cbin = cache_get(ctx->disk_cache, &k); if (cbin != NULL) { strb_appendb(bin, cbin); return GA_NO_ERROR; } } GA_CHECK(call_compiler(ctx, src, &ptx, log)); GA_CHECK(make_bin(ctx, &ptx, bin, log)); strb_clear(&ptx); if (ctx->disk_cache) { pk = calloc(sizeof(disk_key), 1); if (pk == NULL) { error_sys(ctx->err, "calloc"); fprintf(stderr, "Error adding kernel to disk cache: %s\n", ctx->err->msg); return GA_NO_ERROR; } memcpy(pk, &k, DISK_KEY_MM); strb_appendb(&pk->src, src); if (strb_error(&pk->src)) { error_sys(ctx->err, "strb_appendb"); fprintf(stderr, "Error adding kernel to disk cache %s\n", ctx->err->msg); disk_free((cache_key_t)pk); return GA_NO_ERROR; } cbin = strb_alloc(bin->l); if (cbin == NULL) { error_sys(ctx->err, "strb_alloc"); fprintf(stderr, "Error adding kernel to disk cache: %s\n", ctx->err->msg); disk_free((cache_key_t)pk); return GA_NO_ERROR; } strb_appendb(cbin, bin); if (strb_error(cbin)) { error_sys(ctx->err, "strb_appendb"); fprintf(stderr, "Error adding kernel to disk cache %s\n", ctx->err->msg); disk_free((cache_key_t)pk); strb_free(cbin); return GA_NO_ERROR; } if (cache_add(ctx->disk_cache, pk, cbin)) { // TODO use better error messages fprintf(stderr, "Error adding kernel to disk cache\n"); } } return GA_NO_ERROR; } static void _cuda_freekernel(gpukernel *k) { k->refcnt--; if (k->refcnt == 0) { if (k->ctx != NULL) { cuda_enter(k->ctx); cuModuleUnload(k->m); cuda_exit(k->ctx); cuda_free_ctx(k->ctx); } CLEAR(k); free(k->args); free(k->bin); free(k->types); free(k); } } static int cuda_newkernel(gpukernel **k, gpucontext *c, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int argcount, const int *types, int flags, char **err_str) { cuda_context *ctx = (cuda_context 
*)c; strb src = STRB_STATIC_INIT; strb bin = STRB_STATIC_INIT; strb log = STRB_STATIC_INIT; gpukernel *res; kernel_key k_key; kernel_key *p_key; CUdevice dev; CUresult err; unsigned int i; int major, minor; if (count == 0) return error_set(ctx->err, GA_VALUE_ERROR, "String count is 0"); if (flags & GA_USE_OPENCL) return error_set(ctx->err, GA_DEVSUP_ERROR, "OpenCL kernel not supported on cuda devices"); cuda_enter(ctx); err = cuCtxGetDevice(&dev); if (err != CUDA_SUCCESS) { cuda_exit(ctx); return error_cuda(ctx->err, "cuCtxGetDevice", err); } if (get_cc(dev, &major, &minor, ctx->err) != GA_NO_ERROR) return ctx->err->code; // GA_USE_SMALL will always work // GA_USE_HALF should always work if (flags & GA_USE_DOUBLE) { if (major < 1 || (major == 1 && minor < 3)) { cuda_exit(ctx); return error_set(ctx->err, GA_DEVSUP_ERROR, "Requested double support and current device doesn't support them"); } } if (flags & GA_USE_COMPLEX) { // just for now since it is most likely broken cuda_exit(ctx); return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Complex support is not there yet."); } if (lengths == NULL) { for (i = 0; i < count; i++) strb_appends(&src, strings[i]); } else { for (i = 0; i < count; i++) { if (lengths[i] == 0) strb_appends(&src, strings[i]); else strb_appendn(&src, strings[i], lengths[i]); } } strb_append0(&src); if (strb_error(&src)) { strb_clear(&src); cuda_exit(ctx); return error_sys(ctx->err, "strb"); } k_key.fname = fname; k_key.src = src; res = (gpukernel *)cache_get(ctx->kernel_cache, &k_key); if (res != NULL) { res->refcnt++; strb_clear(&src); *k = res; return GA_NO_ERROR; } if (compile(ctx, &src, &bin, &log) != GA_NO_ERROR) { if (err_str != NULL) { strb debug_msg = STRB_STATIC_INIT; strb_appends(&debug_msg, "CUDA kernel compile failure ::\n"); src.l--; gpukernel_source_with_line_numbers(1, (const char **)&src.s, &src.l, &debug_msg); strb_appends(&debug_msg, "\nCompile log:\n"); strb_appendb(&debug_msg, &log); *err_str = strb_cstr(&debug_msg); } strb_clear(&src); strb_clear(&bin); strb_clear(&log); cuda_exit(ctx); return ctx->err->code; } strb_clear(&log); if (strb_error(&bin)) { strb_clear(&src); strb_clear(&bin); cuda_exit(ctx); return error_sys(ctx->err, "strb"); } res = calloc(1, sizeof(*res)); if (res == NULL) { strb_clear(&src); strb_clear(&bin); cuda_exit(ctx); return error_sys(ctx->err, "calloc"); } /* Don't clear bin after this */ res->bin_sz = bin.l; res->bin = bin.s; res->refcnt = 1; res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); return error_sys(ctx->err, "calloc"); } memcpy(res->types, types, argcount*sizeof(int)); res->args = calloc(argcount, sizeof(void *)); if (res->args == NULL) { _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); return error_sys(ctx->err, "calloc"); } err = cuModuleLoadData(&res->m, bin.s); if (err != CUDA_SUCCESS) { error_cuda(ctx->err, "cuModuleLoadData", err); _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); return error_cuda(ctx->err, "cuModuleLoadData", err); } err = cuModuleGetFunction(&res->k, res->m, fname); if (err != CUDA_SUCCESS) { _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); return error_cuda(ctx->err, "cuModuleGetFunction", err); } res->ctx = ctx; ctx->refcnt++; cuda_exit(ctx); TAG_KER(res); p_key = memdup(&k_key, sizeof(kernel_key)); if (p_key != NULL) { p_key->fname = strdup(fname); if (p_key->fname != NULL) { /* One of the refs is for the cache */ res->refcnt++; /* If this fails, it will free the key 
and remove a ref from the kernel. */ cache_add(ctx->kernel_cache, p_key, res); } else { free(p_key); strb_clear(&src); } } else { strb_clear(&src); } *k = res; return GA_NO_ERROR; } static void cuda_retainkernel(gpukernel *k) { ASSERT_KER(k); k->refcnt++; } static void cuda_freekernel(gpukernel *k) { ASSERT_KER(k); _cuda_freekernel(k); } static int cuda_kernelsetarg(gpukernel *k, unsigned int i, void *arg) { ASSERT_KER(k); if (i >= k->argcount) return error_set(k->ctx->err, GA_VALUE_ERROR, "index is beyond the last argument"); k->args[i] = arg; return GA_NO_ERROR; } static int cuda_callkernel(gpukernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args) { cuda_context *ctx = k->ctx; unsigned int i; ASSERT_KER(k); cuda_enter(ctx); if (args == NULL) args = k->args; for (i = 0; i < k->argcount; i++) { if (k->types[i] == GA_BUFFER) { /* We don't have any better info for now */ GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait((gpudata *)args[i], CUDA_WAIT_ALL)); } } switch (n) { case 1: CUDA_EXIT_ON_ERROR(ctx, cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1, shared, ctx->s, args, NULL)); break; case 2: CUDA_EXIT_ON_ERROR(ctx, cuLaunchKernel(k->k, gs[0], gs[1], 1, ls[0], ls[1], 1, shared, ctx->s, args, NULL)); break; case 3: CUDA_EXIT_ON_ERROR(ctx, cuLaunchKernel(k->k, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2], shared, ctx->s, args, NULL)); break; default: cuda_exit(ctx); return error_set(ctx->err, GA_VALUE_ERROR, "Call with more than 3 dimensions"); } for (i = 0; i < k->argcount; i++) { if (k->types[i] == GA_BUFFER) { /* We don't have any better info for now */ GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record((gpudata *)args[i], CUDA_WAIT_ALL)); } } cuda_exit(ctx); return GA_NO_ERROR; } static int cuda_sync(gpudata *b) { cuda_context *ctx = (cuda_context *)b->ctx; int err = GA_NO_ERROR; ASSERT_BUF(b); cuda_enter(ctx); if (ctx->flags & GA_CTX_SINGLE_STREAM) { CUDA_EXIT_ON_ERROR(ctx, cuStreamSynchronize(ctx->s)); } else { CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(b->wev)); CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(b->rev)); } cuda_exit(ctx); return err; } static int cuda_transfer(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz) { ASSERT_BUF(src); ASSERT_BUF(dst); /* The forced synchronization are there because they are required for proper inter-device correctness. 
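     The sequence below (a descriptive note, derived from the calls that
     follow) records the source's pending reads on its own transfer stream,
     makes the destination's transfer stream wait on that event, performs the
     peer copy, and then mirrors the events back so the source context also
     waits for the copy to finish before its buffer is reused.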
*/ cuda_enter(dst->ctx); /* Make sure we have a rev for the source */ GA_CUDA_EXIT_ON_ERROR(dst->ctx, cuda_records(src, CUDA_WAIT_READ|CUDA_WAIT_FORCE, src->ctx->mem_s)); /* Make the destination stream wait for it */ GA_CUDA_EXIT_ON_ERROR(dst->ctx, cuda_waits(src, CUDA_WAIT_READ|CUDA_WAIT_FORCE, dst->ctx->mem_s)); /* Also wait on the destination buffer */ GA_CUDA_EXIT_ON_ERROR(dst->ctx, cuda_waits(dst, CUDA_WAIT_WRITE, dst->ctx->mem_s)); CUDA_EXIT_ON_ERROR(dst->ctx, cuMemcpyPeerAsync(dst->ptr+dstoff, dst->ctx->ctx, src->ptr+srcoff, src->ctx->ctx, sz, dst->ctx->mem_s)); /* This records the event in dst->wev */ GA_CUDA_EXIT_ON_ERROR(dst->ctx, cuda_records(dst, CUDA_WAIT_WRITE|CUDA_WAIT_FORCE, dst->ctx->mem_s)); /* This makes the source stream wait on the wev of dst */ GA_CUDA_EXIT_ON_ERROR(dst->ctx, cuda_waits(dst, CUDA_WAIT_WRITE|CUDA_WAIT_FORCE, src->ctx->mem_s)); /* This records the event on src->rev */ GA_CUDA_EXIT_ON_ERROR(dst->ctx, cuda_records(src, CUDA_WAIT_READ, src->ctx->mem_s)); cuda_exit(dst->ctx); return GA_NO_ERROR; } static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, void *res) { cuda_context *ctx = NULL; if (c != NULL) { ctx = (cuda_context *)c; ASSERT_CTX(ctx); } else if (buf != NULL) { ASSERT_BUF(buf); ctx = buf->ctx; } else if (k != NULL) { ASSERT_KER(k); ctx = k->ctx; } if (prop_id < GA_BUFFER_PROP_START) { if (ctx == NULL) return error_set(global_err, GA_VALUE_ERROR, "Attempting to get a context property with no context"); } else if (prop_id < GA_KERNEL_PROP_START) { if (buf == NULL) return error_set(ctx ? ctx->err : global_err, GA_VALUE_ERROR, "Attempting to get a buffer property with no buffer"); } else { if (k == NULL) return error_set(ctx ? ctx->err : global_err, GA_VALUE_ERROR, "Attempting to get a kernel property with no kernel"); } #define GETPROP(prop, type) do { \ cuda_enter(ctx); \ CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id)); \ CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetAttribute(&i, (prop), id)); \ cuda_exit(ctx); \ *((type *)res) = i; \ } while(0) switch (prop_id) { CUdevice id; int i; size_t sz; case GA_CTX_PROP_DEVNAME: cuda_enter(ctx); CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id)); CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetName((char *)res, 256, id)); cuda_exit(ctx); return GA_NO_ERROR; case GA_CTX_PROP_UNIQUE_ID: cuda_enter(ctx); CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id)); CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetPCIBusId((char *)res, 13, id)); cuda_exit(ctx); return GA_NO_ERROR; case GA_CTX_PROP_LARGEST_MEMBLOCK: *((size_t *)res) = largest_size(ctx); return GA_NO_ERROR; case GA_CTX_PROP_LMEMSIZE: GETPROP(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, size_t); return GA_NO_ERROR; case GA_CTX_PROP_NUMPROCS: GETPROP(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, unsigned int); return GA_NO_ERROR; case GA_CTX_PROP_BIN_ID: *((const char **)res) = ctx->bin_id; return GA_NO_ERROR; case GA_CTX_PROP_ERRBUF: *((gpudata **)res) = ctx->errbuf; return GA_NO_ERROR; case GA_CTX_PROP_TOTAL_GMEM: cuda_enter(ctx); CUDA_EXIT_ON_ERROR(ctx, cuMemGetInfo(&sz, (size_t *)res)); cuda_exit(ctx); return GA_NO_ERROR; case GA_CTX_PROP_FREE_GMEM: cuda_enter(ctx); CUDA_EXIT_ON_ERROR(ctx, cuMemGetInfo((size_t *)res, &sz)); cuda_exit(ctx); return GA_NO_ERROR; case GA_CTX_PROP_NATIVE_FLOAT16: /* We claim that nobody supports this for now */ *((int *)res) = 0; return CUDA_SUCCESS; case GA_CTX_PROP_MAXGSIZE0: GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, size_t); return GA_NO_ERROR; case GA_CTX_PROP_MAXGSIZE1: GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, size_t); return GA_NO_ERROR; 
case GA_CTX_PROP_MAXGSIZE2: GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, size_t); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE0: GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, size_t); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE1: GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, size_t); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE2: GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, size_t); return GA_NO_ERROR; case GA_BUFFER_PROP_REFCNT: *((unsigned int *)res) = buf->refcnt; return GA_NO_ERROR; case GA_BUFFER_PROP_SIZE: *((size_t *)res) = buf->sz; return GA_NO_ERROR; case GA_BUFFER_PROP_CTX: case GA_KERNEL_PROP_CTX: *((gpucontext **)res) = (gpucontext *)ctx; return GA_NO_ERROR; case GA_KERNEL_PROP_MAXLSIZE: cuda_enter(ctx); CUDA_EXIT_ON_ERROR(ctx, cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, k->k)); cuda_exit(ctx); *((size_t *)res) = i; return GA_NO_ERROR; case GA_KERNEL_PROP_PREFLSIZE: cuda_enter(ctx); CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id)); CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id)); cuda_exit(ctx); *((size_t *)res) = i; return GA_NO_ERROR; case GA_KERNEL_PROP_NUMARGS: *((unsigned int *)res) = k->argcount; return GA_NO_ERROR; case GA_KERNEL_PROP_TYPES: *((const int **)res) = k->types; return GA_NO_ERROR; default: return error_fmt(ctx->err, GA_INVALID_ERROR, "Invalid property: %d", prop_id); } } static const char *cuda_error(gpucontext *c) { cuda_context *ctx = (cuda_context *)c; const char *errstr = NULL; if (ctx == NULL) return global_err->msg; else return ctx->err->msg; return errstr; } const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count, cuda_get_device_count, cuda_init, cuda_deinit, cuda_alloc, cuda_retain, cuda_free, cuda_share, cuda_move, cuda_read, cuda_write, cuda_memset, cuda_newkernel, cuda_retainkernel, cuda_freekernel, cuda_kernelsetarg, cuda_callkernel, cuda_sync, cuda_transfer, cuda_property, cuda_error}; libgpuarray-0.7.6/src/gpuarray_buffer_opencl.c000066400000000000000000001155221326743622600215440ustar00rootroot00000000000000#define _CRT_SECURE_NO_WARNINGS #include "private.h" #include "private_opencl.h" #include "gpuarray/buffer.h" #include "gpuarray/util.h" #include "gpuarray/error.h" #include "gpuarray/buffer_blas.h" #include #include #include #include #include "loaders/libclblas.h" #include "loaders/libclblast.h" #include "cluda_opencl.h.c" #define _unused(x) ((void)x) #define SSIZE_MIN (-(SSIZE_MAX-1)) extern gpuarray_blas_ops clblas_ops; extern gpuarray_blas_ops clblast_ops; const gpuarray_buffer_ops opencl_ops; static int cl_property(gpucontext *c, gpudata *b, gpukernel *k, int p, void *r); static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags); static void cl_release(gpudata *b); static void cl_free_ctx(cl_ctx *ctx); static int cl_newkernel(gpukernel **k, gpucontext *ctx, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int argcount, const int *types, int flags, char **err_str); static const char CL_CONTEXT_PREAMBLE[] = "-D __GA_WARP_SIZE=%lu"; // to be filled by cl_make_ctx() static int setup_done = 0; static int setup_lib(error *e) { if (setup_done) return GA_NO_ERROR; GA_CHECK(load_libopencl(e)); setup_done = 1; return GA_NO_ERROR; } static int cl_get_platform_count(unsigned int* platcount) { cl_uint nump; GA_CHECK(setup_lib(global_err)); CL_CHECK(global_err, clGetPlatformIDs(0, NULL, &nump)); *platcount = (unsigned int)nump; return GA_NO_ERROR; } static int cl_get_device_count(unsigned int platform, unsigned int* devcount) { 
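  /* Resolve the platform index to a cl_platform_id, then count how many
     devices of any type that platform exposes. */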
cl_platform_id *ps; cl_platform_id p; cl_uint numd; cl_int err; unsigned int platcount = 0; /* This will load the library if needed */ GA_CHECK(cl_get_platform_count(&platcount)); ps = calloc(sizeof(*ps), platcount); if (ps == NULL) return error_sys(global_err, "calloc"); err = clGetPlatformIDs(platcount, ps, NULL); if (err != CL_SUCCESS) { free(ps); return error_cl(global_err, "clGetPlatformIDs", err); } p = ps[platform]; err = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &numd); free(ps); if (err != CL_SUCCESS) return error_cl(global_err, "clGetDeviceIds", err); *devcount = (unsigned int)numd; return GA_NO_ERROR; } static cl_device_id get_dev(cl_context ctx, error *e) { size_t sz; cl_device_id res; cl_device_id *ids; cl_int err; CL_CHECKN(e, clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL, &sz)); ids = malloc(sz); if (ids == NULL) { error_sys(e, "malloc"); return NULL; } err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sz, ids, NULL); if (err != CL_SUCCESS) { free(ids); error_cl(e, "clContextGetInfo", err); return NULL; } res = ids[0]; free(ids); return res; } cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p) { cl_ctx *res; cl_device_id id; cl_command_queue_properties qprop; char vendor[32]; char driver_version[64]; char *device_version = NULL; size_t device_version_size = 0; cl_uint vendor_id; cl_int err; size_t len; int64_t v = 0; int e = 0; size_t warp_size; int ret; const char dummy_kern[] = "__kernel void kdummy(__global float *f) { f[0] = 0; }\n"; strb context_preamble = STRB_STATIC_INIT; const char *rlk[1]; gpukernel *m; e = setup_lib(global_err); if (e != GA_NO_ERROR) return NULL; id = get_dev(ctx, global_err); if (id == NULL) return NULL; /* Query device version string size */ CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VERSION, 0, NULL, &device_version_size)); if (device_version_size > 1024) { error_set(global_err, GA_UNSUPPORTED_ERROR, "device version buffer too large"); return NULL; } device_version = alloca(device_version_size); CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VERSION, device_version_size, device_version, NULL)); if (device_version[7] == '1' && device_version[9] < '2') { error_set(global_err, GA_UNSUPPORTED_ERROR, "We only support OpenCL 1.2 and up"); return NULL; } CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_QUEUE_PROPERTIES, sizeof(qprop), &qprop, NULL)); CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL)); CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL)); CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DRIVER_VERSION, sizeof(driver_version), driver_version, NULL)); res = malloc(sizeof(*res)); if (res == NULL) { error_sys(global_err, "malloc"); return NULL; } res->ctx = ctx; res->ops = &opencl_ops; if (error_alloc(&res->err)) { error_set(global_err, GA_SYS_ERROR, "Could not create error context"); free(res); return NULL; } res->refcnt = 1; res->exts = NULL; res->blas_handle = NULL; res->options = NULL; res->q = clCreateCommandQueue( ctx, id, ISSET(p->flags, GA_CTX_SINGLE_STREAM) ? 
0 : qprop&CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); if (res->q == NULL) { error_cl(global_err, "clCreateCommandQueue", err); error_free(res->err); free(res); return NULL; } /* Can't overflow (source is 32 + 16 + 12 and buffer is 64) */ len = strlcpy(res->bin_id, vendor, sizeof(res->bin_id)); snprintf(res->bin_id + len, sizeof(res->bin_id) - len, " %#x ", vendor_id); strlcat(res->bin_id, driver_version, sizeof(res->bin_id)); clRetainContext(res->ctx); TAG_CTX(res); res->errbuf = cl_alloc((gpucontext *)res, 8, &v, GA_BUFFER_INIT); if (res->errbuf == NULL) goto fail; res->refcnt--; /* Prevent ref loop */ /* Create per-context OpenCL preamble */ // Create a dummy kernel and check GA_KERNEL_PROP_PREFLSIZE rlk[0] = dummy_kern; len = sizeof(dummy_kern); // this dummy kernel does not require a CLUDA preamble if (cl_newkernel(&m, (gpucontext *)res, 1, rlk, &len, "kdummy", 0, NULL, 0, NULL) != GA_NO_ERROR) goto fail; ret = cl_property((gpucontext *)res, NULL, m, GA_KERNEL_PROP_PREFLSIZE, &warp_size); if (ret != GA_NO_ERROR) goto fail; // Write the preferred workgroup multiple as GA_WARP_SIZE in preamble strb_appendf(&context_preamble, CL_CONTEXT_PREAMBLE, (unsigned long)warp_size); res->options = strb_cstr(&context_preamble); if (res->options == NULL) goto fail; res->blas_handle = NULL; if (load_libclblas(res->err) == GA_NO_ERROR) { res->blas_ops = &clblas_ops; } else if (load_libclblast(res->err) == GA_NO_ERROR) { res->blas_ops = &clblast_ops; } else { res->blas_ops = NULL; } res->comm_ops = NULL; return res; fail: error_set(global_err, res->err->code, res->err->msg); cl_free_ctx(res); return NULL; } cl_command_queue cl_get_stream(gpucontext *ctx) { ASSERT_CTX((cl_ctx *)ctx); return ((cl_ctx *)ctx)->q; } static void cl_free_ctx(cl_ctx *ctx) { ASSERT_CTX(ctx); assert(ctx->refcnt != 0); ctx->refcnt--; if (ctx->refcnt == 0) { if (ctx->errbuf != NULL) { ctx->refcnt = 2; /* Avoid recursive release */ cl_release(ctx->errbuf); } clReleaseCommandQueue(ctx->q); clReleaseContext(ctx->ctx); if (ctx->options != NULL) free(ctx->options); error_free(ctx->err); CLEAR(ctx); free(ctx); } } gpudata *cl_make_buf(gpucontext *c, cl_mem buf) { cl_ctx *ctx = (cl_ctx *)c; gpudata *res; cl_context buf_ctx; cl_int err; ASSERT_CTX(ctx); CL_CHECKN(ctx->err, clGetMemObjectInfo(buf, CL_MEM_CONTEXT, sizeof(buf_ctx), &buf_ctx, NULL)); if (buf_ctx != ctx->ctx) { error_set(ctx->err, GA_VALUE_ERROR, "Requested context doesn't macth object context"); return NULL; } res = malloc(sizeof(*res)); if (res == NULL) { error_sys(ctx->err, "malloc"); return NULL; } res->buf = buf; res->ev = NULL; res->refcnt = 1; err = clRetainMemObject(buf); if (err != CL_SUCCESS) { free(res); error_cl(ctx->err, "clRetainMemObject", err); return NULL; } res->ctx = ctx; res->ctx->refcnt++; TAG_BUF(res); return res; } cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; } #define PRAGMA "#pragma OPENCL EXTENSION " #define ENABLE " : enable\n" #define CL_SMALL "cl_khr_byte_addressable_store" #define CL_DOUBLE "cl_khr_fp64" #define CL_HALF "cl_khr_fp16" static void cl_releasekernel(gpukernel *k); static int cl_callkernel(gpukernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); const char *cl_error_string(cl_int err) { switch (err) { case CL_SUCCESS: return "Success!"; case CL_DEVICE_NOT_FOUND: return "Device not found."; case CL_DEVICE_NOT_AVAILABLE: return "Device not available"; case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available"; case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object 
allocation failure"; case CL_OUT_OF_RESOURCES: return "Out of resources"; case CL_OUT_OF_HOST_MEMORY: return "Out of host memory"; case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available"; case CL_MEM_COPY_OVERLAP: return "Memory copy overlap"; case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch"; case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported"; case CL_BUILD_PROGRAM_FAILURE: return "Program build failure"; case CL_MAP_FAILURE: return "Map failure"; case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "Buffer offset improperly aligned"; case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "Event in wait list has an error status"; case CL_INVALID_VALUE: return "Invalid value"; case CL_INVALID_DEVICE_TYPE: return "Invalid device type"; case CL_INVALID_PLATFORM: return "Invalid platform"; case CL_INVALID_DEVICE: return "Invalid device"; case CL_INVALID_CONTEXT: return "Invalid context"; case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties"; case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue"; case CL_INVALID_HOST_PTR: return "Invalid host pointer"; case CL_INVALID_MEM_OBJECT: return "Invalid memory object"; case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:return "Invalid image format descriptor"; case CL_INVALID_IMAGE_SIZE: return "Invalid image size"; case CL_INVALID_SAMPLER: return "Invalid sampler"; case CL_INVALID_BINARY: return "Invalid binary"; case CL_INVALID_BUILD_OPTIONS: return "Invalid build options"; case CL_INVALID_PROGRAM: return "Invalid program"; case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable"; case CL_INVALID_KERNEL_NAME: return "Invalid kernel name"; case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition"; case CL_INVALID_KERNEL: return "Invalid kernel"; case CL_INVALID_ARG_INDEX: return "Invalid argument index"; case CL_INVALID_ARG_VALUE: return "Invalid argument value"; case CL_INVALID_ARG_SIZE: return "Invalid argument size"; case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments"; case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension"; case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size"; case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size"; case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset"; case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list"; case CL_INVALID_EVENT: return "Invalid event"; case CL_INVALID_OPERATION: return "Invalid operation"; case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object"; case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size"; case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level"; case CL_INVALID_GLOBAL_WORK_SIZE: return "Invalid global work size"; case CL_INVALID_PROPERTY: return "Invalid property"; default: return "Unknown error"; } } static int check_ext(cl_ctx *ctx, const char *name) { cl_device_id dev; if (ctx->exts == NULL) { dev = get_dev(ctx->ctx, ctx->err); if (dev == NULL) return ctx->err->code; CL_GET_PROP(ctx->err, clGetDeviceInfo, dev, CL_DEVICE_EXTENSIONS, ctx->exts); } if (strstr(ctx->exts, name) == NULL) return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Unsupported extension %s", name); else return GA_NO_ERROR; } static void #ifdef _MSC_VER __stdcall #endif errcb(const char *errinfo, const void *pi, size_t cb, void *u) { fprintf(stderr, "%s\n", errinfo); } static gpucontext *cl_init(gpucontext_props *pp) { cl_device_id *ds; cl_device_id d; cl_platform_id *ps; cl_platform_id p; cl_uint nump, numd; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 
0, 0, }; cl_context ctx; cl_ctx *res; cl_int err; int platno; int devno; int e; platno = pp->dev >> 16; devno = pp->dev & 0xFFFF; e = setup_lib(global_err); if (e != GA_NO_ERROR) return NULL; CL_CHECKN(global_err, clGetPlatformIDs(0, NULL, &nump)); if ((unsigned int)platno >= nump || platno < 0) { error_set(global_err, GA_VALUE_ERROR, "Platform ID out of range"); return NULL; } ps = calloc(sizeof(*ps), nump); if (ps == NULL) { error_sys(global_err, "calloc"); return NULL; } err = clGetPlatformIDs(nump, ps, NULL); /* We may get garbage on failure here but it won't matter as we will not use it */ p = ps[platno]; free(ps); if (err != CL_SUCCESS) { error_cl(global_err, "clGetPlatformIDs", err); return NULL; } CL_CHECKN(global_err, clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &numd)); if ((unsigned int)devno >= numd || devno < 0) { error_set(global_err, GA_VALUE_ERROR, "Device ID out of range"); return NULL; } ds = calloc(sizeof(*ds), numd); if (ds == NULL) { error_sys(global_err, "calloc"); return NULL; } err = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, numd, ds, NULL); d = ds[devno]; free(ds); if (err != CL_SUCCESS) { error_cl(global_err, "clGetDeviceIDs", err); return NULL; } props[1] = (cl_context_properties)p; ctx = clCreateContext(props, 1, &d, errcb, NULL, &err); if (ctx == NULL) { error_cl(global_err, "clCreateContext", err); return NULL; } res = cl_make_ctx(ctx, pp); clReleaseContext(ctx); return (gpucontext *)res; } static void cl_deinit(gpucontext *c) { ASSERT_CTX((cl_ctx *)c); cl_free_ctx((cl_ctx *)c); } static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags) { cl_ctx *ctx = (cl_ctx *)c; gpudata *res; void *hostp = NULL; cl_int err; cl_mem_flags clflags = CL_MEM_READ_WRITE; ASSERT_CTX(ctx); if (flags & GA_BUFFER_INIT) { if (data == NULL) { error_set(ctx->err, GA_VALUE_ERROR, "Requested initialization, but no data provided"); return NULL; } hostp = data; clflags |= CL_MEM_COPY_HOST_PTR; } if (flags & GA_BUFFER_HOST) { clflags |= CL_MEM_ALLOC_HOST_PTR; } if (flags & GA_BUFFER_READ_ONLY) { if (flags & GA_BUFFER_WRITE_ONLY) { error_set(ctx->err, GA_VALUE_ERROR, "Invalid combinaison: READ_ONLY and WRITE_ONLY"); return NULL; } clflags &= ~CL_MEM_READ_WRITE; clflags |= CL_MEM_READ_ONLY; } if (flags & GA_BUFFER_WRITE_ONLY) { clflags &= ~CL_MEM_READ_WRITE; clflags |= CL_MEM_WRITE_ONLY; } res = malloc(sizeof(*res)); if (res == NULL) { error_sys(ctx->err, "malloc"); return NULL; } res->refcnt = 1; if (size == 0) { /* OpenCL doesn't like a zero-sized buffer */ size = 1; } res->buf = clCreateBuffer(ctx->ctx, clflags, size, hostp, &err); res->ev = NULL; if (err != CL_SUCCESS) { free(res); error_cl(ctx->err, "clCreateBuffer", err); return NULL; } res->ctx = ctx; ctx->refcnt++; TAG_BUF(res); return res; } static void cl_retain(gpudata *b) { ASSERT_BUF(b); b->refcnt++; } static void cl_release(gpudata *b) { ASSERT_BUF(b); b->refcnt--; if (b->refcnt == 0) { CLEAR(b); clReleaseMemObject(b->buf); if (b->ev != NULL) clReleaseEvent(b->ev); cl_free_ctx(b->ctx); free(b); } } static int cl_share(gpudata *a, gpudata *b) { cl_ctx *ctx; cl_mem aa, bb; cl_int err; ASSERT_BUF(a); ASSERT_BUF(b); if (a->buf == b->buf) return 1; if (a->ctx != b->ctx) return 0; ctx = a->ctx; ASSERT_CTX(ctx); err = clGetMemObjectInfo(a->buf, CL_MEM_ASSOCIATED_MEMOBJECT, sizeof(aa), &aa, NULL); if (err != CL_SUCCESS) { error_cl(ctx->err, "clGetMemObjectInfo", err); return -1; } err = clGetMemObjectInfo(b->buf, CL_MEM_ASSOCIATED_MEMOBJECT, sizeof(bb), &bb, NULL); if (err != CL_SUCCESS) { error_cl(ctx->err, 
"clGetMemObjectInfo", err); return -1; } if (aa == NULL) aa = a->buf; if (bb == NULL) bb = b->buf; if (aa == bb) return 1; return 0; } static int cl_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz) { cl_ctx *ctx; cl_event ev; cl_event evw[2]; cl_event *evl = NULL; cl_uint num_ev = 0; ASSERT_BUF(dst); ASSERT_BUF(src); if (dst->ctx != src->ctx) { error_set(src->ctx->err, GA_VALUE_ERROR, "Differing contexts for source and destination"); return error_set(dst->ctx->err, src->ctx->err->code, src->ctx->err->msg); } ctx = dst->ctx; ASSERT_CTX(ctx); if (sz == 0) return GA_NO_ERROR; if (src->ev != NULL) evw[num_ev++] = src->ev; if (dst->ev != NULL && src != dst) evw[num_ev++] = dst->ev; if (num_ev > 0) evl = evw; CL_CHECK(ctx->err, clEnqueueCopyBuffer(ctx->q, src->buf, dst->buf, srcoff, dstoff, sz, num_ev, evl, &ev)); if (src->ev != NULL) clReleaseEvent(src->ev); if (dst->ev != NULL && src != dst) clReleaseEvent(dst->ev); src->ev = ev; dst->ev = ev; clRetainEvent(ev); return GA_NO_ERROR; } static int cl_read(void *dst, gpudata *src, size_t srcoff, size_t sz) { cl_ctx *ctx = src->ctx; cl_event ev[1]; cl_event *evl = NULL; cl_uint num_ev = 0; ASSERT_BUF(src); ASSERT_CTX(ctx); if (sz == 0) return GA_NO_ERROR; if (src->ev != NULL) { ev[0] = src->ev; evl = ev; num_ev = 1; } CL_CHECK(ctx->err, clEnqueueReadBuffer(ctx->q, src->buf, CL_TRUE, srcoff, sz, dst, num_ev, evl, NULL)); if (src->ev != NULL) clReleaseEvent(src->ev); src->ev = NULL; return GA_NO_ERROR; } static int cl_write(gpudata *dst, size_t dstoff, const void *src, size_t sz) { cl_ctx *ctx = dst->ctx; cl_event ev[1]; cl_event *evl = NULL; cl_uint num_ev = 0; ASSERT_BUF(dst); ASSERT_CTX(ctx); if (sz == 0) return GA_NO_ERROR; if (dst->ev != NULL) { ev[0] = dst->ev; evl = ev; num_ev = 1; } CL_CHECK(ctx->err, clEnqueueWriteBuffer(ctx->q, dst->buf, CL_TRUE, dstoff, sz, src, num_ev, evl, NULL)); if (dst->ev != NULL) clReleaseEvent(dst->ev); dst->ev = NULL; return GA_NO_ERROR; } static int cl_memset(gpudata *dst, size_t offset, int data) { char local_kern[256]; cl_ctx *ctx = dst->ctx; const char *rlk[1]; void *args[1]; size_t sz, bytes, n, ls, gs; gpukernel *m; cl_mem_flags fl; int type; int r, res; unsigned char val = (unsigned char)data; cl_uint pattern = (cl_uint)val & (cl_uint)val >> 8 & \ (cl_uint)val >> 16 & (cl_uint)val >> 24; ASSERT_BUF(dst); ASSERT_CTX(ctx); CL_CHECK(ctx->err, clGetMemObjectInfo(dst->buf, CL_MEM_FLAGS, sizeof(fl), &fl, NULL)); if (fl & CL_MEM_READ_ONLY) return error_set(ctx->err, GA_READONLY_ERROR, "destination is read only"); CL_CHECK(ctx->err, clGetMemObjectInfo(dst->buf, CL_MEM_SIZE, sizeof(bytes), &bytes, NULL)); bytes -= offset; if (bytes == 0) return GA_NO_ERROR; if ((bytes % 16) == 0) { n = bytes/16; r = snprintf(local_kern, sizeof(local_kern), "__kernel void kmemset(__global uint4 *mem) {" "unsigned int i; __global char *tmp = (__global char *)mem;" "tmp += %" SPREFIX "u; mem = (__global uint4 *)tmp;" "for (i = get_global_id(0); i < %" SPREFIX "u; " "i += get_global_size(0)) {mem[i] = (uint4)(%u,%u,%u,%u); }}", offset, n, pattern, pattern, pattern, pattern); } else if ((bytes % 8) == 0) { n = bytes/8; r = snprintf(local_kern, sizeof(local_kern), "__kernel void kmemset(__global uint2 *mem) {" "unsigned int i; __global char *tmp = (__global char *)mem;" "tmp += %" SPREFIX "u; mem = (__global uint2 *)tmp;" "for (i = get_global_id(0); i < %" SPREFIX "u;" "i += get_global_size(0)) {mem[i] = (uint2)(%u,%u); }}", offset, n, pattern, pattern); } else if ((bytes % 4) == 0) { n = bytes/4; r = 
snprintf(local_kern, sizeof(local_kern), "__kernel void kmemset(__global unsigned int *mem) {" "unsigned int i; __global char *tmp = (__global char *)mem;" "tmp += %" SPREFIX "u; mem = (__global unsigned int *)tmp;" "for (i = get_global_id(0); i < %" SPREFIX "u;" "i += get_global_size(0)) {mem[i] = %u; }}", offset, n, pattern); } else { GA_CHECK(check_ext(ctx, CL_SMALL)); n = bytes; r = snprintf(local_kern, sizeof(local_kern), "__kernel void kmemset(__global unsigned char *mem) {" "unsigned int i; mem += %" SPREFIX "u;" "for (i = get_global_id(0); i < %" SPREFIX "u;" "i += get_global_size(0)) {mem[i] = %u; }}", offset, n, val); } /* If this assert fires, increase the size of local_kern above. */ assert(r <= (int)sizeof(local_kern)); _unused(r); sz = strlen(local_kern); rlk[0] = local_kern; type = GA_BUFFER; r = cl_newkernel(&m, (gpucontext *)ctx, 1, rlk, &sz, "kmemset", 1, &type, 0, NULL); if (r != GA_NO_ERROR) return r; /* Cheap kernel scheduling */ res = cl_property(NULL, NULL, m, GA_KERNEL_PROP_MAXLSIZE, &ls); if (res != GA_NO_ERROR) goto fail; gs = ((n-1) / ls) + 1; args[0] = dst; res = cl_callkernel(m, 1, &gs, &ls, 0, args); fail: cl_releasekernel(m); return res; } static int cl_check_extensions(const char **preamble, unsigned int *count, int flags, cl_ctx *ctx) { if (flags & GA_USE_SMALL) { GA_CHECK(check_ext(ctx, CL_SMALL)); preamble[*count] = PRAGMA CL_SMALL ENABLE; (*count)++; } if (flags & GA_USE_DOUBLE) { GA_CHECK(check_ext(ctx, CL_DOUBLE)); preamble[*count] = PRAGMA CL_DOUBLE ENABLE; (*count)++; } if (flags & GA_USE_COMPLEX) { return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Complex are not supported yet"); } if (flags & GA_USE_CUDA) { return error_set(ctx->err, GA_DEVSUP_ERROR, "Cuda kernels not supported on opencl devices"); } return GA_NO_ERROR; } static int cl_newkernel(gpukernel **k, gpucontext *c, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int argcount, const int *types, int flags, char **err_str) { cl_ctx *ctx = (cl_ctx *)c; gpukernel *res; cl_device_id dev; cl_program p; cl_program cluda; cl_program tmp; // Sync this table size with the number of flags that can add stuff // at the beginning const char *preamble[5]; const char *cluda_src[1]; const char *headers[1] = {"cluda.h"}; size_t *newl = NULL; const char **news = NULL; cl_int err; unsigned int n = 0; strb debug_msg = STRB_STATIC_INIT; size_t log_size; ASSERT_CTX(ctx); if (count == 0) return error_set(ctx->err, GA_VALUE_ERROR, "Empty kernel source list"); dev = get_dev(ctx->ctx, ctx->err); if (dev == NULL) return ctx->err->code; if (cl_check_extensions(preamble, &n, flags, ctx)) return ctx->err->code; if (n != 0) { news = calloc(count+n, sizeof(const char *)); if (news == NULL) return error_sys(ctx->err, "calloc"); memcpy(news, preamble, n*sizeof(const char *)); memcpy(news+n, strings, count*sizeof(const char *)); if (lengths == NULL) { newl = NULL; } else { newl = calloc(count+n, sizeof(size_t)); if (newl == NULL) { free(news); return error_sys(ctx->err, "calloc"); } memcpy(newl+n, lengths, count*sizeof(size_t)); } } else { news = strings; newl = (size_t *)lengths; } cluda_src[0] = cluda_opencl_h; cluda = clCreateProgramWithSource(ctx->ctx, 1, cluda_src, NULL, &err); if (err != CL_SUCCESS) { if (n != 0) { free(news); free(newl); } return error_cl(ctx->err, "clCreateProgramWithSource (header)", err); } p = clCreateProgramWithSource(ctx->ctx, count+n, news, newl, &err); if (err != CL_SUCCESS) { if (n != 0) { free(news); free(newl); clReleaseProgram(cluda); } return 
error_cl(ctx->err, "clCreateProgramWithSource (kernel)", err); } err = clCompileProgram(p, 0, NULL, ctx->options, 1, &cluda, headers, NULL, NULL); if (err != CL_SUCCESS) goto compile_error; tmp = clLinkProgram(ctx->ctx, 0, NULL, NULL, 1, &p, NULL, NULL, &err); if (tmp != NULL) { clReleaseProgram(p); p = tmp; tmp = NULL; } compile_error: if (err != CL_SUCCESS) { if ((err == CL_COMPILE_PROGRAM_FAILURE || err == CL_LINK_PROGRAM_FAILURE) && err_str != NULL) { *err_str = NULL; // Fallback, in case there's an error // We're substituting debug_msg for a string with this first line: strb_appends(&debug_msg, "Program build failure ::\n"); // Determine the size of the log clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); if (strb_ensure(&debug_msg, log_size)!=-1 && log_size>=1) { // Checks strb has enough space // Get the log directly into the debug_msg clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size, debug_msg.s+debug_msg.l, NULL); debug_msg.l += (log_size-1); // Back off to before final '\0' } gpukernel_source_with_line_numbers(count+n, news, newl, &debug_msg); strb_append0(&debug_msg); // Make sure a final '\0' is present if (!strb_error(&debug_msg)) { // Make sure the strb is in a valid state *err_str = memdup(debug_msg.s, debug_msg.l); // If there's a memory alloc error, fall-through : announcing a compile error is more important } strb_clear(&debug_msg); // *err_str will be free()d by the caller (see docs in kernel.h) } clReleaseProgram(p); if (n != 0) { free(news); free(newl); } return error_cl(ctx->err, "clBuildProgram", err); } if (n != 0) { free(news); free(newl); } res = malloc(sizeof(*res)); if (res == NULL) return error_sys(ctx->err, "malloc"); res->refcnt = 1; res->ev = NULL; res->argcount = argcount; res->k = clCreateKernel(p, fname, &err); res->types = NULL; /* This avoids a crash in cl_releasekernel */ res->evr = NULL; /* This avoids a crash in cl_releasekernel */ res->ctx = ctx; ctx->refcnt++; clReleaseProgram(p); TAG_KER(res); if (err != CL_SUCCESS) { cl_releasekernel(res); return error_cl(ctx->err, "clCreateKernel", err); } res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { cl_releasekernel(res); return error_sys(ctx->err, "calloc"); } memcpy(res->types, types, argcount * sizeof(int)); res->evr = calloc(argcount, sizeof(cl_event *)); if (res->evr == NULL) { cl_releasekernel(res); return error_sys(ctx->err, "calloc"); } *k = res; return GA_NO_ERROR; } static void cl_retainkernel(gpukernel *k) { ASSERT_KER(k); k->refcnt++; } static void cl_releasekernel(gpukernel *k) { ASSERT_KER(k); k->refcnt--; if (k->refcnt == 0) { CLEAR(k); if (k->ev != NULL) clReleaseEvent(k->ev); if (k->k) clReleaseKernel(k->k); cl_free_ctx(k->ctx); free(k->types); free(k->evr); free(k); } } static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) { cl_ctx *ctx = k->ctx; gpudata *btmp; cl_ulong temp; cl_long stemp; switch (k->types[i]) { case GA_BUFFER: btmp = (gpudata *)a; CL_CHECK(ctx->err, clSetKernelArg(k->k, i, sizeof(cl_mem), &btmp->buf)); k->evr[i] = &btmp->ev; break; case GA_SIZE: temp = *((size_t *)a); CL_CHECK(ctx->err, clSetKernelArg(k->k, i, gpuarray_get_elsize(GA_ULONG), &temp)); k->evr[i] = NULL; break; case GA_SSIZE: stemp = *((ssize_t *)a); CL_CHECK(ctx->err, clSetKernelArg(k->k, i, gpuarray_get_elsize(GA_LONG), &stemp)); k->evr[i] = NULL; break; default: CL_CHECK(ctx->err, clSetKernelArg(k->k, i, gpuarray_get_elsize(k->types[i]), a)); k->evr[i] = NULL; } return GA_NO_ERROR; } static int cl_callkernel(gpukernel *k, unsigned int n, 
const size_t *gs, const size_t *ls, size_t shared, void **args) { cl_ctx *ctx = k->ctx; size_t _gs[3]; cl_event ev; cl_event *evw; cl_device_id dev; cl_uint num_ev; cl_uint i; cl_int err; ASSERT_KER(k); ASSERT_CTX(ctx); if (n > 3) return error_set(ctx->err, GA_VALUE_ERROR, "Call with more than 3 dimensions"); dev = get_dev(ctx->ctx, ctx->err); if (dev == NULL) return ctx->err->code; if (args != NULL) { for (i = 0; i < k->argcount; i++) { GA_CHECK(cl_setkernelarg(k, i, args[i])); } } if (shared != 0) { // the shared memory pointer must be the last argument CL_CHECK(ctx->err, clSetKernelArg(k->k, k->argcount, shared, NULL)); } evw = calloc(sizeof(cl_event), k->argcount); if (evw == NULL) { return error_sys(ctx->err, "calloc"); } num_ev = 0; for (i = 0; i < k->argcount; i++) { if (k->evr[i] != NULL && *k->evr[i] != NULL) { evw[num_ev++] = *k->evr[i]; } } if (num_ev == 0) { free(evw); evw = NULL; } switch (n) { case 3: _gs[2] = gs[2] * ls[2]; case 2: _gs[1] = gs[1] * ls[1]; case 1: _gs[0] = gs[0] * ls[0]; } err = clEnqueueNDRangeKernel(ctx->q, k->k, n, NULL, _gs, ls, num_ev, evw, &ev); free(evw); if (err != CL_SUCCESS) return error_cl(ctx->err, "clEnqueueNDRangeKernel", err); for (i = 0; i < k->argcount; i++) { if (k->types[i] == GA_BUFFER) { if (*k->evr[i] != NULL) clReleaseEvent(*k->evr[i]); *k->evr[i] = ev; clRetainEvent(ev); } } if (k->ev != NULL) clReleaseEvent(k->ev); k->ev = ev; return GA_NO_ERROR; } static int cl_sync(gpudata *b) { cl_ctx *ctx = (cl_ctx *)b->ctx; ASSERT_BUF(b); ASSERT_CTX(ctx); if (b->ev != NULL) { CL_CHECK(ctx->err, clWaitForEvents(1, &b->ev)); clReleaseEvent(b->ev); b->ev = NULL; } return GA_NO_ERROR; } static int cl_transfer(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz) { ASSERT_BUF(dst); ASSERT_BUF(src); return error_set(dst->ctx->err, GA_UNSUPPORTED_ERROR, "Operation not supported"); } static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, void *res) { cl_ctx *ctx = NULL; if (c != NULL) { ctx = (cl_ctx *)c; ASSERT_CTX(ctx); } else if (buf != NULL) { ASSERT_BUF(buf); ctx = buf->ctx; } else if (k != NULL) { ASSERT_KER(k); ctx = k->ctx; } if (prop_id < GA_BUFFER_PROP_START) { if (ctx == NULL) return error_set(global_err, GA_VALUE_ERROR, "Requesting context property with no context"); } else if (prop_id < GA_KERNEL_PROP_START) { if (buf == NULL) return error_set(ctx ? ctx->err : global_err, GA_VALUE_ERROR, "Requesting buffer property with no buffer"); } else { if (k == NULL) return error_set(ctx ? 
ctx->err : global_err, GA_VALUE_ERROR, "Requesting kernel property with no kernel"); } switch (prop_id) { size_t sz; size_t *psz; cl_device_id id; cl_uint ui; case GA_CTX_PROP_DEVNAME: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_NAME, 256, (char *)res, NULL)); return GA_NO_ERROR; case GA_CTX_PROP_UNIQUE_ID: return error_set(ctx->err, GA_DEVSUP_ERROR, "Can't get unique ID on OpenCL"); case GA_CTX_PROP_LMEMSIZE: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; case GA_CTX_PROP_NUMPROCS: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(ui), &ui, NULL)); *((unsigned int *)res) = ui; return GA_NO_ERROR; case GA_CTX_PROP_BIN_ID: *((const char **)res) = ctx->bin_id; return GA_NO_ERROR; case GA_CTX_PROP_ERRBUF: *((gpudata **)res) = ctx->errbuf; return GA_NO_ERROR; case GA_CTX_PROP_TOTAL_GMEM: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; case GA_CTX_PROP_FREE_GMEM: /* There is no way to query free memory so we just return the largest block size */ case GA_CTX_PROP_LARGEST_MEMBLOCK: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; case GA_CTX_PROP_NATIVE_FLOAT16: *((int *)res) = 0; return GA_NO_ERROR; case GA_CTX_PROP_MAXGSIZE0: /* It might be bigger than that, but it's not readily available information. */ *((size_t *)res) = (1>>31) - 1; return GA_NO_ERROR; case GA_CTX_PROP_MAXGSIZE1: /* It might be bigger than that, but it's not readily available information. */ *((size_t *)res) = (1>>31) - 1; return GA_NO_ERROR; case GA_CTX_PROP_MAXGSIZE2: /* It might be bigger than that, but it's not readily available information. 
*/ *((size_t *)res) = (1>>31) - 1; return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE0: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES, psz); *((size_t *)res) = psz[0]; free(psz); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE1: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES, psz); *((size_t *)res) = psz[1]; free(psz); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE2: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES, psz); *((size_t *)res) = psz[2]; free(psz); return GA_NO_ERROR; case GA_BUFFER_PROP_REFCNT: *((unsigned int *)res) = buf->refcnt; return GA_NO_ERROR; case GA_BUFFER_PROP_SIZE: CL_CHECK(ctx->err, clGetMemObjectInfo(buf->buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; /* GA_BUFFER_PROP_CTX is not ordered to simplify code */ case GA_BUFFER_PROP_CTX: case GA_KERNEL_PROP_CTX: *((gpucontext **)res) = (gpucontext *)ctx; return GA_NO_ERROR; case GA_KERNEL_PROP_MAXLSIZE: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_CHECK(ctx->err, clGetKernelWorkGroupInfo(k->k, id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; case GA_KERNEL_PROP_PREFLSIZE: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_CHECK(ctx->err, clGetKernelWorkGroupInfo(k->k, id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; case GA_KERNEL_PROP_NUMARGS: *((unsigned int *)res) = k->argcount; return GA_NO_ERROR; case GA_KERNEL_PROP_TYPES: *((const int **)res) = k->types; return GA_NO_ERROR; default: return error_fmt(ctx->err, GA_INVALID_ERROR, "Invalid property: %d", prop_id); } } static const char *cl_error(gpucontext *c) { cl_ctx *ctx = (cl_ctx *)c; if (ctx == NULL){ return global_err->msg; } else { ASSERT_CTX(ctx); return ctx->err->msg; } } const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count, cl_get_device_count, cl_init, cl_deinit, cl_alloc, cl_retain, cl_release, cl_share, cl_move, cl_read, cl_write, cl_memset, cl_newkernel, cl_retainkernel, cl_releasekernel, cl_setkernelarg, cl_callkernel, cl_sync, cl_transfer, cl_property, cl_error}; libgpuarray-0.7.6/src/gpuarray_collectives_cuda_nccl.c000066400000000000000000000352141326743622600232410ustar00rootroot00000000000000#include #include #include #include "loaders/libnccl.h" #include "gpuarray/buffer_collectives.h" #include "gpuarray/config.h" #include "gpuarray/error.h" #include "gpuarray/util.h" #include "private.h" #include "private_cuda.h" static inline int error_nccl(error *e, const char *msg, ncclResult_t err) { return error_fmt(e, GA_COMM_ERROR, "%s: %s", msg, ncclGetErrorString(err)); } /** * Execute `cmd` and return appropriate code. Save a describing error message in * context. */ #define NCCL_CHKFAIL(ctx, cmd) \ do { \ ncclResult_t err = (cmd); \ if (err != ncclSuccess) { \ return error_nccl((ctx)->err, #cmd, err); \ } \ return GA_NO_ERROR; \ } while (0) /** * Execute `cmd` and check for failure. Save a describing error message in * context. Exit from context and return \ref GA_COMM_ERROR if nccl does not * succeed. 
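 *
 * Minimal usage sketch (illustrative only; assumes the caller has already
 * entered the context with cuda_enter() so that the cuda_exit() performed
 * on failure is balanced, and that `ndev` is a local int):
 *
 * \code
 *   NCCL_EXIT_ON_ERROR(ctx, ncclCommCount(comm->c, &ndev));
 * \endcode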
*/ #define NCCL_EXIT_ON_ERROR(ctx, cmd) \ do { \ ncclResult_t err = (cmd); \ if (err != ncclSuccess) { \ cuda_exit((ctx)); \ return error_nccl((ctx)->err, #cmd, err); \ } \ } while (0) //!< Link wrapped cuda core operations extern const gpuarray_buffer_ops cuda_ops; /** * Definition of struct _gpucomm * * \note This must be the only "module" which manages the definition's contents. */ struct _gpucomm { cuda_context* ctx; // Start after the context ncclComm_t c; #ifdef DEBUG char tag[8]; #endif }; static int setup_done = 0; static int setup_lib(error *e) { if (setup_done) return GA_NO_ERROR; GA_CHECK(load_libnccl(e)); setup_done = 1; return GA_NO_ERROR; } /** * \brief Helper function to dereference a `comm`'s context and free memory */ static void comm_clear(gpucomm *comm) { gpucontext_deref((gpucontext *)comm->ctx); CLEAR(comm); free(comm); } /** * \brief NCCL implementation of \ref gpucomm_new. */ static int comm_new(gpucomm **comm_ptr, gpucontext *ctx, gpucommCliqueId comm_id, int ndev, int rank) { gpucomm *comm; ncclResult_t err; ASSERT_CTX(ctx); GA_CHECK(setup_lib(ctx->err)); comm = calloc(1, sizeof(*comm)); // Allocate memory if (comm == NULL) { *comm_ptr = NULL; // Set to NULL if failed return error_sys(ctx->err, "calloc"); } comm->ctx = (cuda_context *)ctx; // convert to underlying cuda context // So that context would not be destroyed before communicator comm->ctx->refcnt++; cuda_enter(comm->ctx); // Use device err = ncclCommInitRank(&comm->c, ndev, *((ncclUniqueId *)&comm_id), rank); cuda_exit(comm->ctx); TAG_COMM(comm); if (err != ncclSuccess) { *comm_ptr = NULL; // Set to NULL if failed comm_clear(comm); return error_nccl(ctx->err, "ncclCommInitRank", err); } *comm_ptr = comm; return GA_NO_ERROR; } /** * \brief NCCL implementation of \ref gpucomm_free. */ static void comm_free(gpucomm *comm) { ASSERT_COMM(comm); cuda_enter(comm->ctx); ncclCommDestroy(comm->c); cuda_exit(comm->ctx); comm_clear(comm); } /** * \brief NCCL implementation of \ref gpucomm_gen_clique_id. */ static int generate_clique_id(gpucontext *c, gpucommCliqueId *comm_id) { ASSERT_CTX(c); GA_CHECK(setup_lib(c->err)); NCCL_CHKFAIL(c, ncclGetUniqueId((ncclUniqueId *)comm_id)); } /** * \brief NCCL implementation of \ref gpucomm_get_count. */ static int get_count(const gpucomm *comm, int *gpucount) { ASSERT_COMM(comm); NCCL_CHKFAIL(comm->ctx, ncclCommCount(comm->c, gpucount)); } /** * \brief NCCL implementation of \ref gpucomm_get_rank. */ static int get_rank(const gpucomm *comm, int *rank) { ASSERT_COMM(comm); NCCL_CHKFAIL(comm->ctx, ncclCommUserRank(comm->c, rank)); } /** * \brief Helper function to try to convert \ref enum gpucomm_reduce_ops to * \ref * ncclRedOp_t. * * If invalid, return `ncclNumOps`. */ static inline ncclRedOp_t convert_reduce_op(int opcode) { switch (opcode) { case GA_SUM: return ncclSum; case GA_PROD: return ncclProd; case GA_MAX: return ncclMax; case GA_MIN: return ncclMin; } return ncclNumOps; } /** * \brief Helper function to try to convert \ref enum GPUARRAY_TYPES to \ref * ncclDataType_t. * * If invalid, return `ncclNumTypes`. */ static inline ncclDataType_t convert_data_type(int typecode) { switch (typecode) { case GA_BYTE: return ncclChar; case GA_INT: return ncclInt; case GA_FLOAT: return ncclFloat; case GA_DOUBLE: return ncclDouble; case GA_LONG: return ncclInt64; case GA_ULONG: return ncclUint64; case GA_HALF: return ncclHalf; } return ncclNumTypes; } /** * \brief Helper function to check for restrictions on `gpudata` to be used in * nccl * collective operations. 
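 *
 * In particular this verifies that: `count` fits in an int (current nccl
 * limitation), `src` (and `dest`, when given) belong to the same context as
 * `comm`, the typecode and opcode map to valid nccl enums, and the region
 * starting at each offset is large enough to hold `count` elements.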
*/ static inline int check_restrictions(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm, ncclDataType_t *datatype, ncclRedOp_t *op) { size_t op_size; // Check if count is larger than INT_MAX // TODO remove whenif nccl adapts to size_t if (count > INT_MAX) return error_set(comm->ctx->err, GA_XLARGE_ERROR, "Count too large for int"); // src, dest and comm must refer to the same context if (src->ctx != comm->ctx) return error_set(comm->ctx->err, GA_VALUE_ERROR, "source and comm context differ"); if (dest != NULL && dest->ctx != comm->ctx) return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ"); // typecode must correspond to a valid ncclDataType_t if (datatype != NULL) { *datatype = convert_data_type(typecode); if (*datatype == ncclNumTypes) return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type"); } // opcode must correspond to a valid ncclRedOp_t if (op != NULL) { *op = convert_reduce_op(opcode); if (*op == ncclNumOps) return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op"); } // offsets must not be larger than gpudata's size itself // (else out of alloc-ed mem scope) assert(!(offsrc > src->sz)); assert(!(dest != NULL && offdest > dest->sz)); // size to operate upon must be able to fit inside the gpudata (incl offsets) op_size = count * gpuarray_get_elsize(typecode); if ((src->sz - offsrc) < op_size) return error_set(comm->ctx->err, GA_VALUE_ERROR, "source too small for operation"); if (dest != NULL && (dest->sz - offdest) < op_size) return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation"); return GA_NO_ERROR; } /** * \brief NCCL implementation of \ref gpucomm_reduce. */ static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, int root, gpucomm *comm) { // need dummy init so that compiler shuts up ncclRedOp_t op = ncclNumOps; ncclDataType_t datatype = ncclNumTypes; gpudata *dst = NULL; int rank = 0; cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); GA_CHECK(get_rank(comm, &rank)); if (rank == root) { dst = dest; ASSERT_BUF(dest); } GA_CHECK(check_restrictions(src, offsrc, dst, offdest, count, typecode, opcode, comm, &datatype, &op)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ)); // sync: wait till a read/write has finished (out of concurrent kernels) if (rank == root) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency if (rank == root) NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void *)(src->ptr + offsrc), (void *)(dest->ptr + offdest), count, datatype, op, root, comm->c, ctx->s)); else NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void *)(src->ptr + offsrc), NULL, count, datatype, op, root, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); if (rank == root) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; } /** * \brief NCCL implementation of \ref gpucomm_all_reduce. 
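 *
 * Illustrative caller-side sketch (through the public wrapper declared in
 * gpuarray/buffer_collectives.h; argument order assumed from that header):
 * summing `n` floats from `src` into `dest` on every rank:
 *
 * \code
 *   gpucomm_all_reduce(src, 0, dest, 0, n, GA_FLOAT, GA_SUM, comm);
 * \endcode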
*/ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { // need dummy init so that compiler shuts up ncclRedOp_t op = ncclNumOps; ncclDataType_t datatype = ncclNumTypes; cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); ASSERT_BUF(dest); GA_CHECK(check_restrictions(src, offsrc, dest, offdest, count, typecode, opcode, comm, &datatype, &op)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ)); // sync: wait till a read/write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void *)(src->ptr + offsrc), (void *)(dest->ptr + offdest), count, datatype, op, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; } /** * \brief NCCL implementation of \ref gpucomm_reduce_scatter. */ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { // need dummy init so that compiler shuts up ncclRedOp_t op = ncclNumOps; ncclDataType_t datatype = ncclNumTypes; int ndev = 0; size_t resc_size; cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); ASSERT_BUF(dest); GA_CHECK(get_count(comm, &ndev)); GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count * ndev, typecode, opcode, comm, &datatype, &op)); if (dest->ctx != comm->ctx) return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ"); resc_size = count * gpuarray_get_elsize(typecode); if ((dest->sz - offdest) < resc_size) return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation"); assert(!(offdest > dest->sz)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ)); // sync: wait till a read/write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void *)(src->ptr + offsrc), (void *)(dest->ptr + offdest), count, datatype, op, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; } /** * \brief NCCL implementation of \ref gpucomm_broadcast. 
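 *
 * Note (reflecting the code below): only the root rank keeps its data, so it
 * synchronizes on a read event for `array`, while every other rank is about
 * to be overwritten and therefore waits on and records a write event instead.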
*/ static int broadcast(gpudata *array, size_t offset, size_t count, int typecode, int root, gpucomm *comm) { // need dummy init so that compiler shuts up ncclDataType_t datatype = ncclNumTypes; int rank = 0; cuda_context *ctx; ASSERT_BUF(array); ASSERT_COMM(comm); GA_CHECK(check_restrictions(array, offset, NULL, 0, count, typecode, 0, comm, &datatype, NULL)); GA_CHECK(get_rank(comm, &rank)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) if (rank == root) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_READ)); else GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void *)(array->ptr + offset), count, datatype, root, comm->c, ctx->s)); if (rank == root) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_READ)); else GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; } /** * \brief NCCL implementation of \ref gpucomm_all_gather. */ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, gpucomm *comm) { // need dummy init so that compiler shuts up ncclDataType_t datatype = ncclNumTypes; int ndev = 0; size_t resc_size; cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); ASSERT_BUF(dest); GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count, typecode, 0, comm, &datatype, NULL)); if (dest->ctx != comm->ctx) return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ"); GA_CHECK(get_count(comm, &ndev)); resc_size = ndev * count * gpuarray_get_elsize(typecode); if ((dest->sz - offdest) < resc_size) return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation"); assert(!(offdest > dest->sz)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ)); // sync: wait till a read/write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR( ctx, ncclAllGather((void *)(src->ptr + offsrc), (void *)(dest->ptr + offdest), count, datatype, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; } /** * Instance of `gpuarray_comm_ops` which contains NCCL implementations. To be * linked in \ref gpuarray_buffer_cuda.c, in order to fill a /ref gpucontext's * comm_ops. 
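 *
 * The initializer below is positional, so the callbacks must follow the field
 * order of gpuarray_comm_ops; in order they are: comm_new, comm_free,
 * generate_clique_id, get_count, get_rank, reduce, all_reduce,
 * reduce_scatter, broadcast and all_gather.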
*/ gpuarray_comm_ops nccl_ops = { comm_new, comm_free, generate_clique_id, get_count, get_rank, reduce, all_reduce, reduce_scatter, broadcast, all_gather}; libgpuarray-0.7.6/src/gpuarray_elemwise.c000066400000000000000000000557261326743622600205560ustar00rootroot00000000000000#include #include #include #include #include #include #include "private.h" #include "util/strb.h" struct _GpuElemwise { const char *expr; /* Expression code (to be able to build kernels on-demand) */ const char *preamble; /* Preamble code */ gpuelemwise_arg *args; /* Argument descriptors */ GpuKernel k_contig; /* Contiguous kernel */ GpuKernel *k_basic; /* Normal basic kernels */ GpuKernel *k_basic_32; /* 32-bit address basic kernels */ size_t *dims; /* Preallocated shape buffer for dimension collapsing */ ssize_t **strides; /* Preallocated strides buffer for dimension collapsing */ unsigned int nd; /* Current maximum number of dimensions allocated */ unsigned int n; /* Number of arguments */ unsigned int narray; /* Number of array arguments */ int flags; /* Flags for the operation (none at the moment */ }; #define GEN_ADDR32 0x1 #define GEN_CONVERT_F16 0x2 /* This makes sure we have the same value for those flags since we use some shortcuts */ STATIC_ASSERT(GEN_CONVERT_F16 == GE_CONVERT_F16, same_flags_value_elem1); #define is_array(a) (ISCLR((a).flags, GE_SCALAR)) #define is_output(a) (ISSET((a).flags, GE_WRITE)) static inline int k_initialized(GpuKernel *k) { return k->k != NULL; } static inline const char *ctype(int typecode) { return gpuarray_get_type(typecode)->cluda_name; } /* dst has to be zero-initialized on entry */ static int copy_arg(gpuelemwise_arg *dst, gpuelemwise_arg *src) { dst->name = strdup(src->name); if (dst->name == NULL) return -1; dst->typecode = src->typecode; dst->flags = src->flags; return 0; } static void clear_arg(gpuelemwise_arg *a) { free((void *)a->name); a->name = NULL; } static gpuelemwise_arg *copy_args(unsigned int n, gpuelemwise_arg *a) { gpuelemwise_arg *res = calloc(n, sizeof(gpuelemwise_arg)); unsigned int i; if (res == NULL) return NULL; for (i = 0; i < n; i++) if (copy_arg(&res[i], &a[i]) != 0) goto bail; return res; bail: for (; i > 0; i--) { clear_arg(&res[i]); } return NULL; } static void free_args(unsigned int n, gpuelemwise_arg *args) { unsigned int i; if (args != NULL) for (i = 0; i < n; i++) clear_arg(&args[i]); free(args); } #define MUL_NO_OVERFLOW ((size_t)1 << (sizeof(size_t) * 4)) static int reallocaz(void **p, size_t elsz, size_t old, size_t new) { char *res; assert(old <= new); if ((new >= MUL_NO_OVERFLOW || elsz >= MUL_NO_OVERFLOW) && new > 0 && SIZE_MAX / new < elsz) { return 1; } res = realloc(*p, elsz*new); if (res == NULL) return 1; memset(res + (elsz*old), 0, elsz*(new-old)); *p = (void *)res; return 0; } static int ge_grow(GpuElemwise *ge, unsigned int nd) { unsigned int i; assert(nd > ge->nd); if (reallocaz((void **)&ge->k_basic, sizeof(GpuKernel), ge->nd, nd) || reallocaz((void **)&ge->k_basic_32, sizeof(GpuKernel), ge->nd, nd) || reallocaz((void **)&ge->dims, sizeof(size_t), ge->nd, nd)) return 1; for (i = 0; i < ge->narray; i++) { if (reallocaz((void **)&ge->strides[i], sizeof(ssize_t), ge->nd, nd)) return 1; } ge->nd = nd; return 0; } static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, const char *preamble, const char *expr, unsigned int nd, /* Number of dims */ unsigned int n, /* Length of args */ gpuelemwise_arg *args, int gen_flags) { strb sb = STRB_STATIC_INIT; unsigned int i, _i, j; int *ktypes; char *size = 
"ga_size", *ssize = "ga_ssize"; unsigned int p; int flags = 0; int res; if (ISSET(gen_flags, GEN_ADDR32)) { size = "ga_uint"; ssize = "ga_int"; } flags |= gpuarray_type_flagsa(n, args); p = 1 + nd; for (j = 0; j < n; j++) { p += ISSET(args[j].flags, GE_SCALAR) ? 1 : (2 + nd); } ktypes = calloc(p, sizeof(int)); if (ktypes == NULL) return error_sys(ctx->err, "calloc"); p = 0; strb_appends(&sb, "#include \"cluda.h\"\n"); if (preamble) strb_appends(&sb, preamble); strb_appends(&sb, "\nKERNEL void elem(const ga_size n, "); ktypes[p++] = GA_SIZE; for (i = 0; i < nd; i++) { strb_appendf(&sb, "const ga_size dim%u, ", i); ktypes[p++] = GA_SIZE; } for (j = 0; j < n; j++) { if (is_array(args[j])) { strb_appendf(&sb, "GLOBAL_MEM %s *%s_data, const ga_size %s_offset%s", ctype(args[j].typecode), args[j].name, args[j].name, nd == 0 ? "" : ", "); ktypes[p++] = GA_BUFFER; ktypes[p++] = GA_SIZE; for (i = 0; i < nd; i++) { strb_appendf(&sb, "const ga_ssize %s_str_%u%s", args[j].name, i, (i == (nd - 1)) ? "": ", "); ktypes[p++] = GA_SSIZE; } } else { strb_appendf(&sb, "%s %s", ctype(args[j].typecode), args[j].name); ktypes[p++] = args[j].typecode; } if (j != (n - 1)) strb_appends(&sb, ", "); } strb_appendf(&sb, ") {\n" "const %s idx = LDIM_0 * GID_0 + LID_0;\n" "const %s numThreads = LDIM_0 * GDIM_0;\n" "%s i;\n", size, size, size); strb_appends(&sb, "for(i = idx; i < n; i += numThreads) {\n"); if (nd > 0) strb_appendf(&sb, "%s ii = i;\n%s pos;\n", size, size); for (j = 0; j < n; j++) { if (is_array(args[j])) strb_appendf(&sb, "%s %s_p = %s_offset;\n", size, args[j].name, args[j].name); } for (_i = nd; _i > 0; _i--) { i = _i - 1; if (i > 0) strb_appendf(&sb, "pos = ii %% (%s)dim%u;\nii = ii / (%s)dim%u;\n", size, i, size, i); else strb_appends(&sb, "pos = ii;\n"); for (j = 0; j < n; j++) { if (is_array(args[j])) strb_appendf(&sb, "%s_p += pos * (%s)%s_str_%u;\n", args[j].name, ssize, args[j].name, i); } } for (j = 0; j < n; j++) { if (is_array(args[j])) { strb_appendf(&sb, "%s %s;", ctype(ISSET(gen_flags, GEN_CONVERT_F16) && args[j].typecode == GA_HALF ? 
GA_FLOAT : args[j].typecode), args[j].name); if (ISSET(args[j].flags, GE_READ)) { if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { strb_appendf(&sb, "%s = ga_half2float(*(GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p));\n", args[j].name, args[j].name, args[j].name); } else { strb_appendf(&sb, "%s = *(GLOBAL_MEM %s *)(((GLOBAL_MEM char *)%s_data) + %s_p);\n", args[j].name, ctype(args[j].typecode), args[j].name, args[j].name); } } } } strb_appends(&sb, expr); strb_appends(&sb, ";\n"); for (j = 0; j < n; j++) { if (is_array(args[j]) && ISSET(args[j].flags, GE_WRITE)) { if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { strb_appendf(&sb, "*(GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p) = ga_float2half(%s);\n", args[j].name, args[j].name, args[j].name); } else { strb_appendf(&sb, "*(GLOBAL_MEM %s *)(((GLOBAL_MEM char *)%s_data) + %s_p) = %s;\n", ctype(args[j].typecode), args[j].name, args[j].name, args[j].name); } } } strb_appends(&sb, "}\n}\n"); if (strb_error(&sb)) { res = GA_MEMORY_ERROR; goto bail; } res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "elem", p, ktypes, flags, err_str); bail: free(ktypes); strb_clear(&sb); return res; } static ssize_t **strides_array(unsigned int num, unsigned int nd) { ssize_t **res = calloc(num, sizeof(ssize_t *)); unsigned int i; if (res == NULL) return NULL; for (i = 0; i < num; i++) { res[i] = calloc(nd, sizeof(ssize_t)); if (res[i] == NULL) goto bail; } return res; bail: for (i = 0; i < num; i++) free(res[i]); free(res); return NULL; } static int check_basic(GpuElemwise *ge, void **args, int flags, size_t *_n, unsigned int *_nd, size_t **_dims, ssize_t ***_strides, int *_call32) { size_t n; gpucontext *ctx = GpuKernel_context(&ge->k_contig); GpuArray *a = NULL, *v; unsigned int i, j, p, num_arrays = 0, nd = 0, nnd; int call32 = 1; unsigned int nd_i = 0; size_t v_dim_j = 0; /* Go through the list and grab some info */ for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { nd_i = ((GpuArray *)args[i])->nd; if (num_arrays == 0) nd = nd_i; else if (nd_i != nd) { if (flags & GE_PADSHAPE) nd = nd_i > nd ? nd_i : nd; else return error_fmt(ctx->err, GA_VALUE_ERROR, "Arg %u has differing nd = %u", i, nd_i); } ++num_arrays; if (a == NULL && is_output(ge->args[i])) a = (GpuArray *)args[i]; } } if (a == NULL) return error_set(ctx->err, GA_VALUE_ERROR, "No output arrays"); /* Check if we need to grow the internal buffers */ if (nd > ge->nd) { nnd = ge->nd * 2; while (nd > nnd) nnd *= 2; if (ge_grow(ge, nnd)) return error_sys(ctx->err, "ge_grow"); } /* Now we know that all array arguments have at most nd dimensions and that the expected output size is the size of a */ /* And copy their initial values in */ memcpy(ge->dims, a->dimensions, nd*sizeof(size_t)); p = 0; for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { /* Left-pad strides with zero on implicitly broadcasted dimensions */ memset(ge->strides[p], 0, nd*sizeof(ssize_t)); nd_i = ((GpuArray *)args[i])->nd; memcpy((char *)(ge->strides[p]) + (nd - nd_i)*sizeof(ssize_t), ((GpuArray *)args[i])->strides, nd_i*sizeof(ssize_t)); p++; } } /* Check that all arrays are the same size (or broadcast-compatible if GE_BROADCAST), adjust strides of broadcastable dimensions and check if we can use the 32 bit address version. Basically for each dimension go over all the arguments and make sure that the dimension size matches. 
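
     Illustrative example: with GE_BROADCAST set, an output of shape (3, 4)
     and an input of shape (1, 4) are accepted; the input's stride for the
     size-1 dimension is forced to 0 below, so every output row reads the
     same input row.  Output arguments are never broadcast.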
*/ n = 1; for (j = 0; j < nd; j++) { p = 0; for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { v = (GpuArray *)args[i]; nd_i = v->nd; /* Pad shape with 1 if needed for implicitly broadcasted dimensions and shift if needed */ if (j < nd - nd_i) v_dim_j = 1; else v_dim_j = v->dimensions[j - (nd - nd_i)]; if (ge->dims[j] != v_dim_j) { /* We can't broadcast outputs */ if (ISCLR(flags, GE_BROADCAST) || is_output(ge->args[i]) || v_dim_j != 1) { return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u for input %u (expected %" SPREFIX "u got %" SPREFIX "u)", j, i, ge->dims[j], v_dim_j); } } /* If the dimension is 1 set the strides to 0 regardless since it won't change anything in the non-broadcast case. */ if (v_dim_j == 1) { ge->strides[p][j] = 0; } call32 &= v->offset < ADDR32_MAX; call32 &= (SADDR32_MIN < ge->strides[p][j] && ge->strides[p][j] < SADDR32_MAX); p++; } /* is_array() */ } /* for each arg */ /* We have the final value in dims[j] */ n *= ge->dims[j]; } /* for each dim */ call32 &= n < ADDR32_MAX; if (ISCLR(flags, GE_NOCOLLAPSE) && nd > 1) { gpuarray_elemwise_collapse(num_arrays, &nd, ge->dims, ge->strides); } *_n = n; *_nd = nd; *_dims = ge->dims; *_strides = ge->strides; *_call32 = call32; return GA_NO_ERROR; } static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd, size_t *dims, ssize_t **strs, int call32) { GpuKernel *k; size_t ls = 0, gs = 0; unsigned int p = 0, i, j, l; int err; if (nd == 0) return error_set(GpuKernel_context(&ge->k_contig)->err, GA_VALUE_ERROR, "nd == 0"); if (call32) k = &ge->k_basic_32[nd-1]; else k = &ge->k_basic[nd-1]; if (!k_initialized(k)) { err = gen_elemwise_basic_kernel(k, GpuKernel_context(&ge->k_contig), NULL, ge->preamble, ge->expr, nd, ge->n, ge->args, ((call32 ? GEN_ADDR32 : 0) | (ge->flags & GE_CONVERT_F16))); if (err != GA_NO_ERROR) return err; } err = GpuKernel_setarg(k, p++, &n); if (err != GA_NO_ERROR) goto error_call_basic; for (i = 0; i < nd; i++) { err = GpuKernel_setarg(k, p++, &dims[i]); if (err != GA_NO_ERROR) goto error_call_basic; } /* l is the number of arrays to date */ l = 0; for (j = 0; j < ge->n; j++) { if (is_array(ge->args[j])) { GpuArray *v = (GpuArray *)args[j]; err = GpuKernel_setarg(k, p++, v->data); if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k, p++, &v->offset); if (err != GA_NO_ERROR) goto error_call_basic; for (i = 0; i < nd; i++) { err = GpuKernel_setarg(k, p++, &strs[l][i]); if (err != GA_NO_ERROR) goto error_call_basic; } l++; } else { err = GpuKernel_setarg(k, p++, args[j]); if (err != GA_NO_ERROR) goto error_call_basic; } } err = GpuKernel_sched(k, n, &gs, &ls); if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL); error_call_basic: return err; } static int gen_elemwise_contig_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, const char *preamble, const char *expr, unsigned int n, gpuelemwise_arg *args, int gen_flags) { strb sb = STRB_STATIC_INIT; int *ktypes = NULL; unsigned int p; unsigned int j; int flags = 0; int res; flags |= gpuarray_type_flagsa(n, args); p = 1; for (j = 0; j < n; j++) p += ISSET(args[j].flags, GE_SCALAR) ? 
1 : 2; ktypes = calloc(p, sizeof(int)); if (ktypes == NULL) { res = error_sys(ctx->err, "calloc"); goto bail; } p = 0; strb_appends(&sb, "#include \"cluda.h\"\n"); if (preamble) strb_appends(&sb, preamble); strb_appends(&sb, "\nKERNEL void elem(const ga_size n, "); ktypes[p++] = GA_SIZE; for (j = 0; j < n; j++) { if (is_array(args[j])) { strb_appendf(&sb, "GLOBAL_MEM %s *%s_p, const ga_size %s_offset", ctype(args[j].typecode), args[j].name, args[j].name); ktypes[p++] = GA_BUFFER; ktypes[p++] = GA_SIZE; } else { strb_appendf(&sb, "%s %s", ctype(args[j].typecode), args[j].name); ktypes[p++] = args[j].typecode; } if (j != (n - 1)) strb_appends(&sb, ", "); } strb_appends(&sb, ") {\n" "const ga_size idx = LDIM_0 * GID_0 + LID_0;\n" "const ga_size numThreads = LDIM_0 * GDIM_0;\n" "ga_size i;\n" "GLOBAL_MEM char *tmp;\n\n"); for (j = 0; j < n; j++) { if (is_array(args[j])) { strb_appendf(&sb, "tmp = (GLOBAL_MEM char *)%s_p;" "tmp += %s_offset; %s_p = (GLOBAL_MEM %s *)tmp;", args[j].name, args[j].name, args[j].name, ctype(args[j].typecode)); } } strb_appends(&sb, "for (i = idx; i < n; i += numThreads) {\n"); for (j = 0; j < n; j++) { if (is_array(args[j])) { strb_appendf(&sb, "%s %s;\n", ctype(ISSET(gen_flags, GEN_CONVERT_F16) && args[j].typecode == GA_HALF ? GA_FLOAT : args[j].typecode), args[j].name); if (ISSET(args[j].flags, GE_READ)) { if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { strb_appendf(&sb, "%s = ga_half2float(%s_p[i]);\n", args[j].name, args[j].name); } else { strb_appendf(&sb, "%s = %s_p[i];\n", args[j].name, args[j].name); } } } } strb_appends(&sb, expr); strb_appends(&sb, ";\n"); for (j = 0; j < n; j++) { if (is_array(args[j])) { if (ISSET(args[j].flags, GE_WRITE)) { if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { strb_appendf(&sb, "%s_p[i] = ga_float2half(%s);\n", args[j].name, args[j].name); } else { strb_appendf(&sb, "%s_p[i] = %s;\n", args[j].name, args[j].name); } } } } strb_appends(&sb, "}\n}\n"); if (strb_error(&sb)) { res = error_set(ctx->err, GA_MISC_ERROR, "Formatting error creating kernel source"); goto bail; } res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "elem", p, ktypes, flags, err_str); bail: strb_clear(&sb); free(ktypes); return res; } static int check_contig(GpuElemwise *ge, void **args, size_t *_n, int *contig) { GpuArray *a = NULL, *v; size_t n = 1; unsigned int i, j; int c_contig = 1, f_contig = 1; for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { v = (GpuArray *)args[i]; if (a == NULL) { a = v; for (j = 0; j < a->nd; j++) n *= a->dimensions[j]; } c_contig &= GpuArray_IS_C_CONTIGUOUS(v); f_contig &= GpuArray_IS_F_CONTIGUOUS(v); if (a != v) { if (a->nd != v->nd) return -1; /* We don't check the value of the error code */ for (j = 0; j < a->nd; j++) { if (v->dimensions[j] != a->dimensions[j]) return -1; /* We don't check the value of the error code */ } } } } *contig = f_contig || c_contig; *_n = n; return GA_NO_ERROR; } static int call_contig(GpuElemwise *ge, void **args, size_t n) { GpuArray *a; size_t ls = 0, gs = 0; unsigned int i, p; int err; p = 0; err = GpuKernel_setarg(&ge->k_contig, p++, &n); if (err != GA_NO_ERROR) return err; for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { a = (GpuArray *)args[i]; err = GpuKernel_setarg(&ge->k_contig, p++, a->data); if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(&ge->k_contig, p++, &a->offset); if (err != GA_NO_ERROR) return err; } else { err = GpuKernel_setarg(&ge->k_contig, p++, args[i]); if (err != GA_NO_ERROR) return 
err; } } err = GpuKernel_sched(&ge->k_contig, n, &gs, &ls); if (err != GA_NO_ERROR) return err; return GpuKernel_call(&ge->k_contig, 1, &gs, &ls, 0, NULL); } GpuElemwise *GpuElemwise_new(gpucontext *ctx, const char *preamble, const char *expr, unsigned int n, gpuelemwise_arg *args, unsigned int nd, int flags) { GpuElemwise *res; #ifdef DEBUG char *errstr = NULL; #endif unsigned int i; int ret; res = calloc(1, sizeof(*res)); if (res == NULL) { error_sys(ctx->err, "calloc"); return NULL; } res->flags = flags; res->nd = 8; res->n = n; res->expr = strdup(expr); if (res->expr == NULL) { error_sys(ctx->err, "strdup"); goto fail; } if (preamble != NULL) { res->preamble = strdup(preamble); if (res->preamble == NULL) { error_sys(ctx->err, "strdup"); goto fail; } } res->args = copy_args(n, args); if (res->args == NULL) { error_sys(ctx->err, "copy_args"); goto fail; } /* Count the arrays in the arguements */ res->narray = 0; for (i = 0; i < res->n; i++) if (is_array(res->args[i])) res->narray++; while (res->nd < nd) res->nd *= 2; res->dims = calloc(res->nd, sizeof(size_t)); if (res->dims == NULL) { error_sys(ctx->err, "calloc"); goto fail; } res->strides = strides_array(res->narray, res->nd); if (res->strides == NULL) { error_sys(ctx->err, "strides_array"); goto fail; } res->k_basic = calloc(res->nd, sizeof(GpuKernel)); if (res->k_basic == NULL) { error_sys(ctx->err, "calloc"); goto fail; } res->k_basic_32 = calloc(res->nd, sizeof(GpuKernel)); if (res->k_basic_32 == NULL) { error_sys(ctx->err, "calloc"); goto fail; } ret = gen_elemwise_contig_kernel(&res->k_contig, ctx, #ifdef DEBUG &errstr, #else NULL, #endif res->preamble, res->expr, res->n, res->args, (res->flags & GE_CONVERT_F16)); if (ret != GA_NO_ERROR) { #ifdef DEBUG if (errstr != NULL) fprintf(stderr, "%s\n", errstr); free(errstr); #endif goto fail; } if (ISCLR(flags, GE_NOADDR64)) { for (i = 0; i < nd; i++) { ret = gen_elemwise_basic_kernel(&res->k_basic[i], ctx, #ifdef DEBUG &errstr, #else NULL, #endif res->preamble, res->expr, i+1, res->n, res->args, (res->flags & GE_CONVERT_F16)); if (ret != GA_NO_ERROR) { #ifdef DEBUG if (errstr != NULL) fprintf(stderr, "%s\n", errstr); free(errstr); #endif goto fail; } } } for (i = 0; i < nd; i++) { ret = gen_elemwise_basic_kernel(&res->k_basic_32[i], ctx, #ifdef DEBUG &errstr, #else NULL, #endif res->preamble, res->expr, i+1, res->n, res->args, GEN_ADDR32 | (res->flags & GE_CONVERT_F16)); if (ret != GA_NO_ERROR) { #ifdef DEBUG if (errstr != NULL) fprintf(stderr, "%s\n", errstr); free(errstr); #endif goto fail; } } return res; fail: GpuElemwise_free(res); return NULL; } void GpuElemwise_free(GpuElemwise *ge) { unsigned int i; if (ge->k_basic_32 != NULL) for (i = 0; i < ge->nd; i++) { if (k_initialized(&ge->k_basic_32[i])) GpuKernel_clear(&ge->k_basic_32[i]); } if (ge->k_basic != NULL) for (i = 0; i < ge->nd; i++) { if (k_initialized(&ge->k_basic[i])) GpuKernel_clear(&ge->k_basic[i]); } if (ge->strides != NULL) for (i = 0; i < ge->narray; i++) { free(ge->strides[i]); } if (k_initialized(&ge->k_contig)) GpuKernel_clear(&ge->k_contig); free(ge->k_basic_32); free(ge->k_basic); free_args(ge->n, ge->args); free((void *)ge->preamble); free((void *)ge->expr); free(ge->dims); free(ge->strides); free(ge); } int GpuElemwise_call(GpuElemwise *ge, void **args, int flags) { size_t n = 0; size_t *dims = NULL; ssize_t **strides = NULL; unsigned int nd = 0; int contig = 0; int call32 = 0; int err; err = check_contig(ge, args, &n, &contig); if (err == GA_NO_ERROR && contig) { if (n == 0) return GA_NO_ERROR; return 
call_contig(ge, args, n); } err = check_basic(ge, args, flags, &n, &nd, &dims, &strides, &call32); if (err == GA_NO_ERROR) { if (n == 0) return GA_NO_ERROR; return call_basic(ge, args, n, nd, dims, strides, call32); } return err; } libgpuarray-0.7.6/src/gpuarray_error.c000066400000000000000000000025651326743622600200660ustar00rootroot00000000000000#define _CRT_SECURE_NO_WARNINGS #include "gpuarray/error.h" #include #include const char *gpuarray_error_str(int err) { switch (err) { case GA_NO_ERROR: return "No error"; case GA_MEMORY_ERROR: return "Out of memory"; case GA_VALUE_ERROR: return "Value invalid or out of range"; case GA_IMPL_ERROR: return "Unknown device error"; case GA_INVALID_ERROR: return "Invalid value or operation"; case GA_UNSUPPORTED_ERROR: return "Unsupported operation"; case GA_SYS_ERROR: return strerror(errno); case GA_RUN_ERROR: return "Could not execute helper program"; case GA_DEVSUP_ERROR: return "Device does not support operation"; case GA_READONLY_ERROR: return "Buffer is read-only"; case GA_WRITEONLY_ERROR: return "Buffer is write-only"; case GA_BLAS_ERROR: return "Error in BLAS call"; case GA_UNALIGNED_ERROR: return "Unaligned array"; case GA_COPY_ERROR: return "Copy is needed but disallowed by parameters"; case GA_NODEV_ERROR: return "No devices are available"; case GA_MISC_ERROR: return "Undeterminate error"; case GA_COMM_ERROR: return "Error in collectives call"; case GA_XLARGE_ERROR: return "Input size too large for operation"; case GA_LOAD_ERROR: return "Error loading library"; default: return "Unknown GA error"; } } libgpuarray-0.7.6/src/gpuarray_extension.c000066400000000000000000000025061326743622600207440ustar00rootroot00000000000000#include #include "gpuarray/extension.h" typedef struct _ext { const char *name; void *val; } ext; extern void cuda_enter(void); extern void cuda_exit(void); extern void *cuda_make_ctx(void); extern void *cuda_get_stream(void); extern void *cuda_make_buf(void); extern void *cuda_get_sz(void); extern void *cuda_wait(void); extern void *cuda_record(void); extern void *cuda_get_ipc_handle(void); extern void *cuda_open_ipc_handle(void); extern void *cl_make_ctx(void); extern void *cl_get_stream(void); extern void *cl_make_buf(void); extern void *cl_get_buf(void); static ext ext_list[] = { {"cuda_enter", cuda_enter}, {"cuda_exit", cuda_exit}, {"cuda_make_ctx", cuda_make_ctx}, {"cuda_get_stream", cuda_get_stream}, {"cuda_make_buf", cuda_make_buf}, {"cuda_get_sz", cuda_get_sz}, {"cuda_wait", cuda_wait}, {"cuda_record", cuda_record}, {"cuda_get_ipc_handle", cuda_get_ipc_handle}, {"cuda_open_ipc_handle", cuda_open_ipc_handle}, {"cl_make_ctx", cl_make_ctx}, {"cl_get_stream", cl_get_stream}, {"cl_make_buf", cl_make_buf}, {"cl_get_buf", cl_get_buf}, }; #define N_EXT (sizeof(ext_list)/sizeof(ext_list[0])) void *gpuarray_get_extension(const char *name) { unsigned int i; for (i = 0; i < N_EXT; i++) { if (strcmp(name, ext_list[i].name) == 0) return ext_list[i].val; } return NULL; } libgpuarray-0.7.6/src/gpuarray_kernel.c000066400000000000000000000051021326743622600202030ustar00rootroot00000000000000#include "gpuarray/kernel.h" #include "gpuarray/error.h" #include "gpuarray/types.h" #include "util/error.h" #include "private.h" #include int GpuKernel_init(GpuKernel *k, gpucontext *ctx, unsigned int count, const char **strs, const size_t *lens, const char *name, unsigned int argcount, const int *types, int flags, char **err_str) { int res = GA_NO_ERROR; k->args = calloc(argcount, sizeof(void *)); if (k->args == NULL) return error_sys(ctx->err, 
"calloc"); k->k = gpukernel_init(ctx, count, strs, lens, name, argcount, types, flags, &res, err_str); if (res != GA_NO_ERROR) GpuKernel_clear(k); return res; } void GpuKernel_clear(GpuKernel *k) { if (k->k) gpukernel_release(k->k); free(k->args); k->k = NULL; k->args = NULL; } gpucontext *GpuKernel_context(GpuKernel *k) { return gpukernel_context(k->k); } int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls) { size_t min_l; size_t max_l; size_t target_l; size_t max_g; size_t target_g; unsigned int numprocs; int err; int want_ls = 0; err = gpukernel_property(k->k, GA_KERNEL_PROP_MAXLSIZE, &max_l); if (err != GA_NO_ERROR) return err; err = gpukernel_property(k->k, GA_KERNEL_PROP_PREFLSIZE, &min_l); if (err != GA_NO_ERROR) return err; err = gpukernel_property(k->k, GA_CTX_PROP_NUMPROCS, &numprocs); if (err != GA_NO_ERROR) return err; err = gpukernel_property(k->k, GA_CTX_PROP_MAXGSIZE0, &max_g); if (err != GA_NO_ERROR) return err; /* Do something about these hardcoded values */ target_g = numprocs * 32; if (target_g > max_g) target_g = max_g; target_l = 512; if (target_l > max_l) target_l = max_l; if (*ls == 0) { want_ls = 1; *ls = min_l; } if (*gs == 0) { *gs = ((n-1) / *ls) + 1; if (*gs > target_g) *gs = target_g; } if (want_ls && n > (*ls * *gs)) { /* The division and multiplication by min_l is to ensure we end up * with a multiple of min_l */ *ls = ((n / min_l) / *gs) * min_l; if (*ls > target_l) *ls = target_l; } return GA_NO_ERROR; } int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *a) { return gpukernel_setarg(k->k, i, a); } int GpuKernel_call(GpuKernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args) { return gpukernel_call(k->k, n, gs, ls, shared, args); } const char *GpuKernel_error(const GpuKernel *k, int err) { return gpucontext_error(gpukernel_context(k->k), err); } libgpuarray-0.7.6/src/gpuarray_mkstemp.c000066400000000000000000000010721326743622600204050ustar00rootroot00000000000000#define _CRT_SECURE_NO_WARNINGS #include #include #include #include #ifdef _MSC_VER #include #define open _open #define mktemp _mktemp #else #define O_BINARY 0 #endif int mkstemp(char *path) { char *tmp; int res; int tries = 3; do { tmp = mktemp(path); if (tmp == NULL) return -1; res = open(path, O_CREAT|O_EXCL|O_RDWR|O_BINARY, S_IREAD|S_IWRITE); if (res != -1 || errno != EEXIST) return res; } while (--tries); errno = EEXIST; return -1; } libgpuarray-0.7.6/src/gpuarray_reduction.c000066400000000000000000000663631326743622600207370ustar00rootroot00000000000000/* Includes */ #ifdef _MSC_VER #define _CRT_SECURE_NO_WARNINGS #endif #include #include #include #include "gpuarray/config.h" #include #include #include #include "private.h" #include "gpuarray/array.h" #include "gpuarray/error.h" #include "gpuarray/kernel.h" #include "gpuarray/util.h" #include "util/strb.h" #include "util/integerfactoring.h" /* Datatypes */ struct maxandargmax_ctx{ /* Function Arguments. */ GpuArray* dstMax; GpuArray* dstArgmax; const GpuArray* src; int reduxLen; const int* reduxList; /* General. */ int ret; int* axisList; gpucontext* gpuCtx; /* Source code Generator. 
*/ const char* dstMaxType; const char* dstArgmaxType; int ndd; int ndr; int nds; int ndh; strb s; char* sourceCode; GpuKernel kernel; /* Scheduler */ int hwAxisList[3]; size_t blockSize [3]; size_t gridSize [3]; size_t chunkSize [3]; /* Invoker */ gpudata* srcStepsGD; gpudata* srcSizeGD; gpudata* chunkSizeGD; gpudata* dstMaxStepsGD; gpudata* dstArgmaxStepsGD; }; typedef struct maxandargmax_ctx maxandargmax_ctx; /* Function prototypes */ static int axisInSet (int v, const int* set, size_t setLen, size_t* where); static void appendIdxes (strb* s, const char* prologue, const char* prefix, int startIdx, int endIdx, const char* suffix, const char* epilogue); static int maxandargmaxCheckargs (maxandargmax_ctx* ctx); static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx); static int maxandargmaxGenSource (maxandargmax_ctx* ctx); static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx); static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx); static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx); static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx); static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx); static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx); static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx); static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx); static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx); static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx); static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx); static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx); static int maxandargmaxCompile (maxandargmax_ctx* ctx); static int maxandargmaxSchedule (maxandargmax_ctx* ctx); static int maxandargmaxInvoke (maxandargmax_ctx* ctx); static int maxandargmaxCleanup (maxandargmax_ctx* ctx); /* Function implementation */ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, GpuArray* dstArgmax, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList){ maxandargmax_ctx ctxSTACK = {0}; maxandargmax_ctx *ctx = &ctxSTACK; ctxSTACK.dstMax = dstMax; ctxSTACK.dstArgmax = dstArgmax; ctxSTACK.src = src; ctxSTACK.reduxLen = (int)reduxLen; ctxSTACK.reduxList = (const int*)reduxList; if(maxandargmaxCheckargs (ctx) == GA_NO_ERROR && maxandargmaxSelectHwAxes(ctx) == GA_NO_ERROR && maxandargmaxGenSource (ctx) == GA_NO_ERROR && maxandargmaxCompile (ctx) == GA_NO_ERROR && maxandargmaxSchedule (ctx) == GA_NO_ERROR && maxandargmaxInvoke (ctx) == GA_NO_ERROR){ return maxandargmaxCleanup(ctx); }else{ return maxandargmaxCleanup(ctx); } } /** * @brief Check whether axis numbered v is already in the given set of axes. * * @param [in] v * @param [in] set * @param [in] setLen * @param [out] where * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. */ static int axisInSet (int v, const int* set, size_t setLen, size_t* where){ size_t i; for(i=0;iret = GA_NO_ERROR; ctx->axisList = NULL; ctx->gpuCtx = NULL; ctx->dstMaxType = ctx->dstArgmaxType = NULL; ctx->ndh = 0; ctx->sourceCode = NULL; ctx->hwAxisList[0] = ctx->hwAxisList[1] = ctx->hwAxisList[2] = 0; ctx->blockSize [0] = ctx->blockSize [1] = ctx->blockSize [2] = 1; ctx->gridSize [0] = ctx->gridSize [1] = ctx->gridSize [2] = 1; ctx->chunkSize [0] = ctx->chunkSize [1] = ctx->chunkSize [2] = 1; ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = ctx->dstMaxStepsGD = ctx->dstArgmaxStepsGD = NULL; /* Insane src or reduxLen? 
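       ("insane" here means NULL output/input pointers, a zero-dimensional
       source, or a reduction axis count outside the range 1..src->nd)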
*/ if(!ctx->dstMax || !ctx->dstArgmax || !ctx->src || ctx->src->nd == 0 || ctx->reduxLen == 0 || ctx->reduxLen > (int)ctx->src->nd){ return ctx->ret=GA_INVALID_ERROR; } /* Insane or duplicate list entry? */ for(i=0;ireduxLen;i++){ if(ctx->reduxList[i] < 0 || ctx->reduxList[i] >= (int)ctx->src->nd || axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ return ctx->ret=GA_INVALID_ERROR; } } /* Unknown type? */ ctx->dstMaxType = gpuarray_get_type(ctx->src->typecode)->cluda_name; ctx->dstArgmaxType = gpuarray_get_type(GA_SSIZE) ->cluda_name; if(!ctx->dstMaxType || !ctx->dstArgmaxType){ return ctx->ret=GA_INVALID_ERROR; } /* GPU context non-existent? */ ctx->gpuCtx = GpuArray_context(ctx->src); if(!ctx->gpuCtx){ return ctx->ret=GA_INVALID_ERROR; } /** * We initialize some more parts of the context, using the guarantees * we now have about the sanity of the arguments. */ ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; return ctx->ret; } /** * @brief Select which axes (up to 3) will be assigned to hardware * dimensions. */ static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ int i, j, maxI = 0; size_t maxV; ctx->ndh = ctx->ndd<3 ? ctx->ndd : 3; /** * The ctx->hwAxisLen largest axes are selected and assigned in * descending order to X, Y, Z. */ for(i=0;indh;i++){ maxV = 0; for(j=0;jnds;j++){ if(!axisInSet(j, ctx->hwAxisList, i, 0) && !axisInSet(j, ctx->reduxList, ctx->ndr, 0) && ctx->src->dimensions[j] >= maxV){ maxV = ctx->src->dimensions[j]; maxI = j; } } ctx->hwAxisList[i] = maxI; } return ctx->ret=GA_NO_ERROR; } /** * @brief Generate the kernel code for MaxAndArgmax. * * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. */ static int maxandargmaxGenSource (maxandargmax_ctx* ctx){ /* Compute internal axis remapping. */ ctx->axisList = malloc(ctx->nds * sizeof(unsigned)); if(!ctx->axisList){ return ctx->ret=GA_MEMORY_ERROR; } maxandargmaxComputeAxisList(ctx); /* Generate kernel proper. */ strb_ensure(&ctx->s, 5*1024); maxandargmaxAppendKernel(ctx); free(ctx->axisList); ctx->axisList = NULL; ctx->sourceCode = strb_cstr(&ctx->s); if(!ctx->sourceCode){ return ctx->ret=GA_MEMORY_ERROR; } /* Return it. */ return ctx->ret=GA_NO_ERROR; } static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx){ strb_appends (&ctx->s, "#include \"cluda.h\"\n"); maxandargmaxAppendTypedefs (ctx); maxandargmaxAppendPrototype (ctx); strb_appends (&ctx->s, "{\n"); maxandargmaxAppendOffsets (ctx); maxandargmaxAppendIndexDeclarations(ctx); maxandargmaxAppendRangeCalculations(ctx); maxandargmaxAppendLoops (ctx); strb_appends (&ctx->s, "}\n"); } static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "/* Typedefs */\n"); strb_appendf(&ctx->s, "typedef %s T;/* The type of the array being processed. */\n", ctx->dstMaxType); strb_appendf(&ctx->s, "typedef %s X;/* Index type: signed 32/64-bit. 
*/\n", ctx->dstArgmaxType); strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); } static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "KERNEL void maxandargmax(const GLOBAL_MEM T* src,\n"); strb_appends(&ctx->s, " const X srcOff,\n"); strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSize,\n"); strb_appends(&ctx->s, " const GLOBAL_MEM X* chunkSize,\n"); strb_appends(&ctx->s, " GLOBAL_MEM T* dstMax,\n"); strb_appends(&ctx->s, " const X dstMaxOff,\n"); strb_appends(&ctx->s, " const GLOBAL_MEM X* dstMaxSteps,\n"); strb_appends(&ctx->s, " GLOBAL_MEM X* dstArgmax,\n"); strb_appends(&ctx->s, " const X dstArgmaxOff,\n"); strb_appends(&ctx->s, " const GLOBAL_MEM X* dstArgmaxSteps)"); } static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t/* Add offsets */\n"); strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); strb_appends(&ctx->s, "\tdstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n"); strb_appends(&ctx->s, "\tdstArgmax = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArgmax + dstArgmaxOff);\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ int i; strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n"); strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); if(ctx->ndh>0){ strb_appends(&ctx->s, "\tX "); for(i=0;indh;i++){ strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", i, i, (i==ctx->ndh-1) ? ";\n" : ", "); } } strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");} if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ size_t hwDim; int i; /* Use internal remapping when computing the ranges for this thread. */ strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); for(i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]); } for(i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); } for(i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); } for(i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dAStep = dstArgmaxSteps[%d];\n", i, i); } for(i=ctx->nds-1;i>=ctx->ndd;i--){ /** * If this is the last index, it's the first cumulative dimension * product we generate, and thus we initialize to 1. 
*/ if(i == ctx->nds-1){ strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i); }else{ strb_appendf(&ctx->s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1); } } for(i=0;inds;i++){ /** * Up to 3 dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i); } } for(i=0;inds;i++){ /** * Up to 3 dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); } } strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t/**\n"); strb_appends(&ctx->s, "\t * FREE LOOPS.\n"); strb_appends(&ctx->s, "\t */\n"); strb_appends(&ctx->s, "\t\n"); maxandargmaxAppendLoopMacroDefs (ctx); maxandargmaxAppendLoopOuter (ctx); maxandargmaxAppendLoopMacroUndefs(ctx); } static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ int i; /** * FOROVER Macro */ strb_appends(&ctx->s, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); /** * ESCAPE Macro */ strb_appends(&ctx->s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); /** * SRCINDEXER Macro */ appendIdxes (&ctx->s, "#define SRCINDEXER(", "i", 0, ctx->nds, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + "); for(i=0;inds;i++){ strb_appendf(&ctx->s, "i%d*i%dSStep + \\\n ", i, i); } strb_appends(&ctx->s, "0))\n"); /** * RDXINDEXER Macro */ appendIdxes (&ctx->s, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ") ("); for(i=ctx->ndd;inds;i++){ strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i); } strb_appends(&ctx->s, "0)\n"); /** * DSTMINDEXER Macro */ appendIdxes (&ctx->s, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + "); for(i=0;indd;i++){ strb_appendf(&ctx->s, "i%d*i%dMStep + \\\n ", i, i); } strb_appends(&ctx->s, "0))\n"); /** * DSTAINDEXER Macro */ appendIdxes (&ctx->s, "#define DSTAINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + "); for(i=0;indd;i++){ strb_appendf(&ctx->s, "i%d*i%dAStep + \\\n ", i, i); } strb_appends(&ctx->s, "0))\n"); } static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ int i; /** * Outer Loop Header Generation */ for(i=0;indd;i++){ strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } /** * Inner Loop Generation */ maxandargmaxAppendLoopInner(ctx); /** * Outer Loop Trailer Generation */ for(i=0;indd;i++){ strb_appends(&ctx->s, "\t}\n"); } } static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx){ int i; /** * Inner Loop Prologue */ strb_appends(&ctx->s, "\t/**\n"); strb_appends(&ctx->s, "\t * Reduction initialization.\n"); strb_appends(&ctx->s, "\t */\n"); strb_appends(&ctx->s, "\t\n"); appendIdxes (&ctx->s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", ""); if(ctx->ndd && ctx->ndr){strb_appends(&ctx->s, ",");} appendIdxes (&ctx->s, "", "i", ctx->ndd, ctx->nds, "Start", ");\n"); appendIdxes (&ctx->s, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t/**\n"); strb_appends(&ctx->s, "\t * REDUCTION LOOPS.\n"); 
strb_appends(&ctx->s, "\t */\n"); strb_appends(&ctx->s, "\t\n"); /** * Inner Loop Header Generation */ for(i=ctx->ndd;inds;i++){ strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } /** * Inner Loop Body Generation */ appendIdxes (&ctx->s, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\tif(V > maxV){\n"); strb_appends(&ctx->s, "\t\tmaxV = V;\n"); appendIdxes (&ctx->s, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); strb_appends(&ctx->s, "\t}\n"); /** * Inner Loop Trailer Generation */ for(i=ctx->ndd;inds;i++){ strb_appends(&ctx->s, "\t}\n"); } strb_appends(&ctx->s, "\t\n"); /** * Inner Loop Epilogue Generation */ strb_appends(&ctx->s, "\t/**\n"); strb_appends(&ctx->s, "\t * Destination writeback.\n"); strb_appends(&ctx->s, "\t */\n"); strb_appends(&ctx->s, "\t\n"); appendIdxes (&ctx->s, "\tDSTMINDEXER(", "i", 0, ctx->ndd, "", ") = maxV;\n"); appendIdxes (&ctx->s, "\tDSTAINDEXER(", "i", 0, ctx->ndd, "", ") = maxI;\n"); } static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "#undef FOROVER\n"); strb_appends(&ctx->s, "#undef ESCAPE\n"); strb_appends(&ctx->s, "#undef SRCINDEXER\n"); strb_appends(&ctx->s, "#undef RDXINDEXER\n"); strb_appends(&ctx->s, "#undef DSTMINDEXER\n"); strb_appends(&ctx->s, "#undef DSTAINDEXER\n"); } static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ int i, f=0; for(i=0;inds;i++){ if(axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ continue; } ctx->axisList[f++] = i; } memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); } /** * @brief Compile the kernel from source code. * * @return */ static int maxandargmaxCompile (maxandargmax_ctx* ctx){ const int ARG_TYPECODES[] = { GA_BUFFER, /* src */ GA_SIZE, /* srcOff */ GA_BUFFER, /* srcSteps */ GA_BUFFER, /* srcSize */ GA_BUFFER, /* chnkSize */ GA_BUFFER, /* dstMax */ GA_SIZE, /* dstMaxOff */ GA_BUFFER, /* dstMaxSteps */ GA_BUFFER, /* dstArgmax */ GA_SIZE, /* dstArgmaxOff */ GA_BUFFER /* dstArgmaxSteps */ }; const unsigned int ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); const char* SRCS[1]; SRCS[0] = ctx->sourceCode; ctx->ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, 1, SRCS, NULL, "maxandargmax", ARG_TYPECODES_LEN, ARG_TYPECODES, 0, (char**)0); free(ctx->sourceCode); ctx->sourceCode = NULL; return ctx->ret; } /** * Compute a good thread block size / grid size / software chunk size for Nvidia. */ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ int i; size_t warpMod; size_t bestWarpMod = 1; unsigned bestWarpAxis = 0; uint64_t maxLg; uint64_t maxLs[3]; uint64_t maxGg; uint64_t maxGs[3]; uint64_t dims [3]; double slack[3]; ga_factor_list factBS[3]; ga_factor_list factGS[3]; ga_factor_list factCS[3]; /** * Obtain the constraints of our problem. 
*/ size_t warpSize, maxL, maxL0, maxL1, maxL2, /* Maximum total and per-dimension thread/block sizes */ maxG, maxG0, maxG1, maxG2; /* Maximum total and per-dimension block /grid sizes */ gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); maxG = maxG0; gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); /** * Prepare inputs to the solver. * * This involves, amongst others, * - Initializing the blockSize, gridSize and chunkSize factor lists for all * hardware dimensions. * - Finding on which hardware axis is it optimal to place the warpSize factor. */ maxLg = maxL; maxLs[0] = maxL0, maxLs[1]=maxL1, maxLs[2]=maxL2; maxGg = maxG; maxGs[0] = maxG0, maxGs[1]=maxG1, maxGs[2]=maxG2; dims[0] = dims[1] = dims[2] = 1; slack[0] = slack[1] = slack[2] = 1.1; for(i=0;indh;i++){ dims[i] = ctx->src->dimensions[ctx->hwAxisList[i]]; gaIFLInit(&factBS[i]); gaIFLInit(&factGS[i]); gaIFLInit(&factCS[i]); warpMod = dims[i]%warpSize; if(bestWarpMod>0 && (warpMod==0 || warpMod>=bestWarpMod)){ bestWarpAxis = i; bestWarpMod = warpMod; } } if(ctx->ndh > 0){ dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); } /** * Factorization job. We'll steadily increase the slack in case of failure * in order to ensure we do get a factorization, which we place into * chunkSize. */ for(i=0;indh;i++){ while(!gaIFactorize(dims[i], (uint64_t)(dims[i]*slack[i]), maxLs[i], &factCS[i])){ /** * Error! Failed to factorize dimension i with given slack and * k-smoothness constraints! Increase slack. Once slack reaches * 2.0 it will factorize guaranteed. */ slack[i] += 0.1; } } /** * Invoke the scheduler. * * The scheduler will move some factors from chunkSize into blockSize and * gridSize, improving performance. */ gaIFLSchedule(ctx->ndh, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); /* Output. */ for(i=0;indh;i++){ ctx->blockSize[i] = gaIFLGetProduct(&factBS[i]); ctx->gridSize [i] = gaIFLGetProduct(&factGS[i]); ctx->chunkSize[i] = gaIFLGetProduct(&factCS[i]); } /* Return. */ return ctx->ret=GA_NO_ERROR; } /** * Invoke the kernel. */ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ void* args[11]; /** * Argument Marshalling. This the grossest gross thing in here. 
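	 *
	 * The stride, size and chunk arrays are uploaded as small read-only,
	 * pre-initialized device buffers (GA_BUFFER_READ_ONLY|GA_BUFFER_INIT)
	 * so the kernel can index them; all five are released again right
	 * after the launch.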
*/ const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), ctx->src->strides, flags, 0); ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), ctx->src->dimensions, flags, 0); ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), ctx->chunkSize, flags, 0); ctx->dstMaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), ctx->dstMax->strides, flags, 0); ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), ctx->dstArgmax->strides, flags, 0); args[ 0] = (void*) ctx->src->data; args[ 1] = (void*)&ctx->src->offset; args[ 2] = (void*) ctx->srcStepsGD; args[ 3] = (void*) ctx->srcSizeGD; args[ 4] = (void*) ctx->chunkSizeGD; args[ 5] = (void*) ctx->dstMax->data; args[ 6] = (void*)&ctx->dstMax->offset; args[ 7] = (void*) ctx->dstMaxStepsGD; args[ 8] = (void*) ctx->dstArgmax->data; args[ 9] = (void*)&ctx->dstArgmax->offset; args[10] = (void*) ctx->dstArgmaxStepsGD; if(ctx->srcStepsGD && ctx->srcSizeGD && ctx->chunkSizeGD && ctx->dstMaxStepsGD && ctx->dstArgmaxStepsGD){ ctx->ret = GpuKernel_call(&ctx->kernel, ctx->ndh>0 ? ctx->ndh : 1, ctx->gridSize, ctx->blockSize, 0, args); }else{ ctx->ret = GA_MEMORY_ERROR; } gpudata_release(ctx->srcStepsGD); gpudata_release(ctx->srcSizeGD); gpudata_release(ctx->chunkSizeGD); gpudata_release(ctx->dstMaxStepsGD); gpudata_release(ctx->dstArgmaxStepsGD); return ctx->ret; } /** * Cleanup */ static int maxandargmaxCleanup (maxandargmax_ctx* ctx){ free(ctx->axisList); free(ctx->sourceCode); ctx->axisList = NULL; ctx->sourceCode = NULL; return ctx->ret; } libgpuarray-0.7.6/src/gpuarray_strl.c000066400000000000000000000046141326743622600177160ustar00rootroot00000000000000/*$OpenBSD: strlcat.c,v 1.13 2005/08/08 08:05:37 espie Exp $*/ /*$OpenBSD: strlcpy.c,v 1.11 2006/05/05 15:27:38 millert Exp $*/ /* * Copyright (c) 1998 Todd C. Miller * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include /* * Appends src to string dst of size siz (unlike strncat, siz is the * full size of dst, not space left). At most siz-1 characters * will be copied. Always NUL terminates (unless siz <= strlen(dst)). * Returns strlen(src) + MIN(siz, strlen(initial dst)). * If retval >= siz, truncation occurred. */ size_t strlcat(char *dst, const char *src, size_t siz) { char *d = dst; const char *s = src; size_t n = siz; size_t dlen; /* Find the end of dst and adjust bytes left but don't go past end */ while (n-- != 0 && *d != '\0') d++; dlen = d - dst; n = siz - dlen; if (n == 0) return(dlen + strlen(s)); while (*s != '\0') { if (n != 1) { *d++ = *s; n--; } s++; } *d = '\0'; return(dlen + (s - src));/* count does not include NUL */ } /* * Copy src to string dst of size siz. At most siz-1 characters * will be copied. Always NUL terminates (unless siz == 0). 
* Returns strlen(src); if retval >= siz, truncation occurred. */ size_t strlcpy(char *dst, const char *src, size_t siz) { char *d = dst; const char *s = src; size_t n = siz; /* Copy as many bytes as will fit */ if (n != 0) { while (--n != 0) { if ((*d++ = *s++) == '\0') break; } } /* Not enough room in dst, add NUL and traverse rest of src */ if (n == 0) { if (siz != 0) *d = '\0';/* NUL-terminate dst */ while (*s++) ; } return(s - src - 1);/* count does not include NUL */ } libgpuarray-0.7.6/src/gpuarray_types.c000066400000000000000000000140101326743622600200650ustar00rootroot00000000000000 /* This file is generated by gen_types.py */ #include "gpuarray/types.h" #include /* For NULL */ #ifdef _MSC_VER typedef signed __int8 int8_t; typedef unsigned __int8 uint8_t; typedef signed __int16 int16_t; typedef unsigned __int16 uint16_t; typedef signed __int32 int32_t; typedef unsigned __int32 uint32_t; typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; #else #include #endif typedef struct _int128 { union int128_u { int8_t as_int8[16]; int16_t as_int16[8]; int32_t as_int32[4]; int64_t as_int64[2]; } value; } int128_t; typedef struct _uint128 { union uint128_u { uint8_t as_uint8[16]; uint16_t as_uint16[8]; uint32_t as_uint32[4]; uint64_t as_uint64[2]; } value; } uint128_t; typedef struct _quad { union { struct { int16_t exp; uint16_t hi; uint32_t lo; } s; uint128_t raw; } u; } ga_quad; typedef uint16_t half_t; typedef struct _cfloat { float r; float i; } ga_cfloat; typedef struct _cdouble { double r; double i; } ga_cdouble; typedef struct _cquad { ga_quad r; ga_quad i; } ga_cquad; typedef struct {char c; uint8_t x; } st_bool; #define BOOL_ALIGN (sizeof(st_bool) - sizeof(uint8_t)) typedef struct {char c; int8_t x; } st_byte; #define BYTE_ALIGN (sizeof(st_byte) - sizeof(int8_t)) typedef struct {char c; uint8_t x; } st_ubyte; #define UBYTE_ALIGN (sizeof(st_ubyte) - sizeof(uint8_t)) typedef struct {char c; int16_t x; } st_short; #define SHORT_ALIGN (sizeof(st_short) - sizeof(int16_t)) typedef struct {char c; uint16_t x; } st_ushort; #define USHORT_ALIGN (sizeof(st_ushort) - sizeof(uint16_t)) typedef struct {char c; int32_t x; } st_int; #define INT_ALIGN (sizeof(st_int) - sizeof(int32_t)) typedef struct {char c; uint32_t x; } st_uint; #define UINT_ALIGN (sizeof(st_uint) - sizeof(uint32_t)) typedef struct {char c; int64_t x; } st_long; #define LONG_ALIGN (sizeof(st_long) - sizeof(int64_t)) typedef struct {char c; uint64_t x; } st_ulong; #define ULONG_ALIGN (sizeof(st_ulong) - sizeof(uint64_t)) typedef struct {char c; int128_t x; } st_longlong; #define LONGLONG_ALIGN (sizeof(st_longlong) - sizeof(int128_t)) typedef struct {char c; uint128_t x; } st_ulonglong; #define ULONGLONG_ALIGN (sizeof(st_ulonglong) - sizeof(uint128_t)) typedef struct {char c; float x; } st_float; #define FLOAT_ALIGN (sizeof(st_float) - sizeof(float)) typedef struct {char c; double x; } st_double; #define DOUBLE_ALIGN (sizeof(st_double) - sizeof(double)) typedef struct {char c; ga_quad x; } st_quad; #define QUAD_ALIGN (sizeof(st_quad) - sizeof(ga_quad)) typedef struct {char c; ga_cfloat x; } st_cfloat; #define CFLOAT_ALIGN (sizeof(st_cfloat) - sizeof(ga_cfloat)) typedef struct {char c; ga_cdouble x; } st_cdouble; #define CDOUBLE_ALIGN (sizeof(st_cdouble) - sizeof(ga_cdouble)) typedef struct {char c; ga_cquad x; } st_cquad; #define CQUAD_ALIGN (sizeof(st_cquad) - sizeof(ga_cquad)) typedef struct {char c; half_t x; } st_half; #define HALF_ALIGN (sizeof(st_half) - sizeof(half_t)) typedef struct {char c; size_t x; } 
st_size; #define SIZE_ALIGN (sizeof(st_size) - sizeof(size_t)) typedef struct {char c; ssize_t x; } st_ssize; #define SSIZE_ALIGN (sizeof(st_ssize) - sizeof(ssize_t)) const gpuarray_type scalar_types[] = { {"ga_bool", 1, BOOL_ALIGN, GA_BOOL}, {"ga_byte", 1, BYTE_ALIGN, GA_BYTE}, {"ga_ubyte", 1, UBYTE_ALIGN, GA_UBYTE}, {"ga_short", 2, SHORT_ALIGN, GA_SHORT}, {"ga_ushort", 2, USHORT_ALIGN, GA_USHORT}, {"ga_int", 4, INT_ALIGN, GA_INT}, {"ga_uint", 4, UINT_ALIGN, GA_UINT}, {"ga_long", 8, LONG_ALIGN, GA_LONG}, {"ga_ulong", 8, ULONG_ALIGN, GA_ULONG}, {"ga_longlong", 16, LONGLONG_ALIGN, GA_LONGLONG}, {"ga_ulonglong", 16, ULONGLONG_ALIGN, GA_ULONGLONG}, {"ga_float", 4, FLOAT_ALIGN, GA_FLOAT}, {"ga_double", 8, DOUBLE_ALIGN, GA_DOUBLE}, {"ga_quad", 16, QUAD_ALIGN, GA_QUAD}, {"ga_cfloat", 8, CFLOAT_ALIGN, GA_CFLOAT}, {"ga_cdouble", 16, CDOUBLE_ALIGN, GA_CDOUBLE}, {"ga_cquad", 32, CQUAD_ALIGN, GA_CQUAD}, {NULL, 0, 0, -1}, {NULL, 0, 0, -1}, {NULL, 0, 0, -1}, {NULL, 0, 0, -1}, {NULL, 0, 0, -1}, {NULL, 0, 0, -1}, {"ga_half", 2, HALF_ALIGN, GA_HALF}, {"ga_size", sizeof(size_t), SIZE_ALIGN, GA_SIZE}, {"ga_ssize", sizeof(ssize_t), SSIZE_ALIGN, GA_SSIZE}, }; const gpuarray_type vector_types[] = { {"ga_byte2", 2, 0, GA_BYTE2}, {"ga_ubyte2", 2, 0, GA_UBYTE2}, {"ga_byte3", 3, 0, GA_BYTE3}, {"ga_ubyte3", 3, 0, GA_UBYTE3}, {"ga_byte4", 4, 0, GA_BYTE4}, {"ga_ubyte4", 4, 0, GA_UBYTE4}, {"ga_byte8", 8, 0, GA_BYTE8}, {"ga_ubyte8", 8, 0, GA_UBYTE8}, {"ga_byte16", 16, 0, GA_BYTE16}, {"ga_ubyte16", 16, 0, GA_UBYTE16}, {"ga_short2", 4, 0, GA_SHORT2}, {"ga_ushort2", 4, 0, GA_USHORT2}, {"ga_short3", 6, 0, GA_SHORT3}, {"ga_ushort3", 6, 0, GA_USHORT3}, {"ga_short4", 8, 0, GA_SHORT4}, {"ga_ushort4", 8, 0, GA_USHORT4}, {"ga_short8", 16, 0, GA_SHORT8}, {"ga_ushort8", 16, 0, GA_USHORT8}, {"ga_short16", 32, 0, GA_SHORT16}, {"ga_ushort16", 32, 0, GA_USHORT16}, {"ga_int2", 8, 0, GA_INT2}, {"ga_uint2", 8, 0, GA_UINT2}, {"ga_int3", 12, 0, GA_INT3}, {"ga_uint3", 12, 0, GA_UINT3}, {"ga_int4", 16, 0, GA_INT4}, {"ga_uint4", 16, 0, GA_UINT4}, {"ga_int8", 32, 0, GA_INT8}, {"ga_uint8", 32, 0, GA_UINT8}, {"ga_int16", 64, 0, GA_INT16}, {"ga_uint16", 64, 0, GA_UINT16}, {"ga_long2", 16, 0, GA_LONG2}, {"ga_ulong2", 16, 0, GA_ULONG2}, {"ga_long3", 24, 0, GA_LONG3}, {"ga_ulong3", 24, 0, GA_ULONG3}, {"ga_long4", 32, 0, GA_LONG4}, {"ga_ulong4", 32, 0, GA_ULONG4}, {"ga_long8", 64, 0, GA_LONG8}, {"ga_ulong8", 64, 0, GA_ULONG8}, {"ga_long16", 128, 0, GA_LONG16}, {"ga_ulong16", 128, 0, GA_ULONG16}, {"ga_float2", 8, 0, GA_FLOAT2}, {"ga_float4", 16, 0, GA_FLOAT4}, {"ga_float8", 32, 0, GA_FLOAT8}, {"ga_float16", 64, 0, GA_FLOAT16}, {"ga_double2", 16, 0, GA_DOUBLE2}, {"ga_double4", 32, 0, GA_DOUBLE4}, {"ga_double8", 64, 0, GA_DOUBLE8}, {"ga_double16", 128, 0, GA_DOUBLE16}, {"ga_half2", 4, 0, GA_HALF2}, {"ga_half4", 8, 0, GA_HALF4}, {"ga_half8", 16, 0, GA_HALF8}, {"ga_half16", 32, 0, GA_HALF16}, }; libgpuarray-0.7.6/src/gpuarray_util.c000066400000000000000000000116211326743622600177030ustar00rootroot00000000000000#include #include "private.h" #include "util/strb.h" #include "gpuarray/util.h" #include "gpuarray/error.h" #include "gpuarray/kernel.h" #include "gpuarray/elemwise.h" /* * API version is negative since we are still in the development * phase. Once we go stable, this will move to 0 and go up from * there. 
*/ static gpuarray_type **custom_types = NULL; static int n_types = 0; static gpuarray_type no_type = {NULL, 0, 0, -1}; typedef struct _buf_st { char c; GpuArray *a; } buf_st; #define BUF_ALIGN (sizeof(buf_st) - sizeof(GpuArray *)) static gpuarray_type buffer_type = {NULL, sizeof(GpuArray *), BUF_ALIGN, GA_BUFFER}; int gpuarray_register_type(gpuarray_type *t, int *ret) { gpuarray_type **tmp; tmp = realloc(custom_types, (n_types+1)*sizeof(*tmp)); if (tmp == NULL) { if (ret) *ret = GA_SYS_ERROR; return -1; } custom_types = tmp; t->typecode = 512 + n_types; custom_types[n_types++] = t; return t->typecode; } const gpuarray_type *gpuarray_get_type(int typecode) { if (typecode <= GA_DELIM) { if (typecode == GA_BUFFER) return &buffer_type; if (typecode < GA_NBASE) return &scalar_types[typecode]; else return &no_type; } else if (typecode < GA_ENDVEC) { if (typecode < GA_NVEC) return &vector_types[typecode - 256]; else return &no_type; } else { if ((typecode - 512) < n_types) return custom_types[typecode - 512]; else return &no_type; } } size_t gpuarray_get_elsize(int typecode) { return gpuarray_get_type(typecode)->size; } static inline ssize_t ssabs(ssize_t v) { return (v < 0 ? -v : v); } void gpuarray_elem_perdim(strb *sb, unsigned int nd, const size_t *dims, const ssize_t *str, const char *id) { int i; if (nd > 0) { strb_appendf(sb, "int %si = i;", id); for (i = nd-1; i > 0; i--) { strb_appendf(sb, "%s %c= ((%si %% %" SPREFIX "u) * " "%" SPREFIX "d);%si = %si / %" SPREFIX "u;", id, (str[i] < 0 ? '-' : '+'), id, dims[i], ssabs(str[i]), id, id, dims[i]); } strb_appendf(sb, "%s %c= (%si * %" SPREFIX "d);", id, (str[0] < 0 ? '-' : '+'), id, ssabs(str[0])); } } void gpukernel_source_with_line_numbers(unsigned int count, const char **news, size_t *newl, strb *src) { unsigned int section, line, i, j; size_t len; line = 1; // start the line counter at 1 for (section = 0; section < count; section++) { len = (newl == NULL) ? 0 : newl[section]; if (len <= 0) len = strlen(news[section]); i = 0; // position of line-starts within news[section] while (i < len) { strb_appendf(src, "%04d\t", line); for (j = i; j < len && news[section][j] != '\n'; j++); strb_appendn(src, news[section]+i, (j-i)); strb_appendc(src, '\n'); i = j+1; // Character after the newline line++; } } } static int get_type_flags(int typecode) { int flags = 0; if (typecode == GA_DOUBLE || typecode == GA_CDOUBLE) flags |= GA_USE_DOUBLE; if (typecode == GA_HALF) flags |= GA_USE_HALF; if (typecode == GA_CFLOAT || typecode == GA_CDOUBLE) flags |= GA_USE_COMPLEX; if (gpuarray_get_elsize(typecode) < 4) flags |= GA_USE_SMALL; return flags; } /* List of typecodes terminated by -1 */ int gpuarray_type_flags(int init, ...) 
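/* Usage sketch (illustrative, added for clarity): the variadic list is read
 * until a -1 terminator, so a caller collecting the kernel-source flags for a
 * float/half operation could write
 *   int flags = gpuarray_type_flags(GA_FLOAT, GA_HALF, -1);
 * which, per get_type_flags() above, yields GA_USE_HALF | GA_USE_SMALL. */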
{ va_list ap; int typecode = init; int flags = 0; va_start(ap, init); while (typecode != -1) { flags |= get_type_flags(typecode); typecode = va_arg(ap, int); } va_end(ap); return flags; } int gpuarray_type_flagsa(unsigned int n, gpuelemwise_arg *args) { unsigned int i; int flags = 0; for (i = 0; i < n; i++) { flags |= get_type_flags(args[i].typecode); } return flags; } static inline void shiftdown(ssize_t *base, unsigned int i, unsigned int nd) { if (base != NULL) memmove(&base[i], &base[i+1], (nd - i - 1)*sizeof(size_t)); } void gpuarray_elemwise_collapse(unsigned int n, unsigned int *_nd, size_t *dims, ssize_t **strs) { unsigned int i; unsigned int k; unsigned int nd = *_nd; /* Remove dimensions of size 1 */ for (i = nd; i > 0; i--) { if (nd > 1 && dims[i-1] == 1) { shiftdown((ssize_t *)dims, i-1, nd); for (k = 0; k < n; k++) shiftdown(strs[k], i-1, nd); nd--; } } for (i = nd - 1; i > 0; i--) { int collapse = 1; for (k = 0; k < n; k++) { collapse &= (strs[k] == NULL || strs[k][i - 1] == (ssize_t)dims[i] * strs[k][i]); } if (collapse) { dims[i-1] *= dims[i]; shiftdown((ssize_t *)dims, i, nd); for (k = 0; k < n; k++) { if (strs[k] != NULL) { strs[k][i-1] = strs[k][i]; shiftdown(strs[k], i, nd); } } nd--; } } *_nd = nd; } libgpuarray-0.7.6/src/head.py000066400000000000000000000016711326743622600161270ustar00rootroot00000000000000# Used to generate the string tables to embed the cluda headers. # Usage: python head.py # This will output .c def wrt(f, n, b): f.write(b',') n += 1 if n > 10: f.write(b'\n') n = 0 else: f.write(b' ') f.write(b"0x%02x" % (b,)) return n def convert(src, dst): src_name = src.replace('.', '_') with open(src, 'rb') as f: src_data = f.read() with open(dst, 'wb') as f: f.write(b'static const char %s[] = {\n' % (src_name.encode('utf-8'),)) first = True n = 0 for b in bytearray(src_data): if b == 0: raise ValueError('NUL in file') if first: f.write(b"0x%02x" % (b,)) first = False else: n = wrt(f, n, b) wrt(f, n, 0) f.write(b'};\n') if __name__ == '__main__': import sys convert(sys.argv[1], sys.argv[1] + '.c') libgpuarray-0.7.6/src/loaders/000077500000000000000000000000001326743622600163005ustar00rootroot00000000000000libgpuarray-0.7.6/src/loaders/CMakeLists.txt000066400000000000000000000001611326743622600210360ustar00rootroot00000000000000set_rel(LOADERS_SRC dyn_load.c libcuda.c libnvrtc.c libcublas.c libnccl.c libopencl.c libclblas.c libclblast.c ) libgpuarray-0.7.6/src/loaders/dyn_load.c000066400000000000000000000026631326743622600202440ustar00rootroot00000000000000#include "dyn_load.h" #include "util/error.h" #if defined(__unix__) || defined(__APPLE__) #include #include #include #include #include void *ga_load_library(const char *name, error *e) { void *res = dlopen(name, RTLD_LAZY|RTLD_LOCAL); if (res == NULL) error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\": %s", name, dlerror()); return res; } void *ga_func_ptr(void *h, const char *name, error *e) { void *res = dlsym(h, name); if (res == NULL) error_fmt(e, GA_LOAD_ERROR, "Could not find symbol \"%s\": %s", name, dlerror()); return res; } #else /* Should be windows */ #include static inline void error_win(const char* name, error *e) { char msgbuf[512]; DWORD err = GetLastError(); DWORD len = FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM| FORMAT_MESSAGE_IGNORE_INSERTS, NULL, err, 0, msgbuf, 512, NULL); if (len == 0) error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\": error code %X", name, err); else error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\": %s", name, msgbuf); } void *ga_load_library(const char *name, error *e) 
{ void *res = LoadLibrary(name); if (res == NULL) error_win(name, e); return res; } void *ga_func_ptr(void *h, const char *name, error *e) { void *res = (void *)GetProcAddress(h, name); if (res == NULL) error_win(name, e); return res; } #endif libgpuarray-0.7.6/src/loaders/dyn_load.h000066400000000000000000000002751326743622600202460ustar00rootroot00000000000000#ifndef UTIL_DYN_LOAD_H #define UTIL_DYN_LOAD_H #include "util/error.h" void *ga_load_library(const char *name, error *e); void *ga_func_ptr(void *h, const char *name, error *e); #endif libgpuarray-0.7.6/src/loaders/libclblas.c000066400000000000000000000016261326743622600204000ustar00rootroot00000000000000#include #include "libclblas.h" #include "dyn_load.h" #include "gpuarray/error.h" #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) static const char libname[] = "clBLAS.dll"; #else /* Unix */ #ifdef __APPLE__ static const char libname[] = "libclBLAS.dylib"; #else static const char libname[] = "libclBLAS.so"; #endif #endif #define DEF_PROC(ret, name, args) t##name *name #include "libclblas.fn" #undef DEF_PROC #define DEF_PROC(ret, name, args) \ name = (t##name *)ga_func_ptr(lib, #name, e); \ if (name == NULL) { \ return e->code; \ } static int loaded = 0; int load_libclblas(error *e) { void *lib; if (loaded) return GA_NO_ERROR; lib = ga_load_library(libname, e); if (lib == NULL) return e->code; #include "libclblas.fn" loaded = 1; return GA_NO_ERROR; } libgpuarray-0.7.6/src/loaders/libclblas.fn000066400000000000000000000057321326743622600205630ustar00rootroot00000000000000DEF_PROC(clblasStatus, clblasSetup, (void)); DEF_PROC(void, clblasTeardown, (void)); DEF_PROC(clblasStatus, clblasSdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, 
clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSger, (clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); libgpuarray-0.7.6/src/loaders/libclblas.h000066400000000000000000000022011326743622600203730ustar00rootroot00000000000000#ifndef LOADER_LIBCLBLAS_H #define LOADER_LIBCLBLAS_H #include "util/error.h" #include "libopencl.h" /** @cond NEVER */ typedef enum clblasOrder_ { clblasRowMajor, clblasColumnMajor } clblasOrder; typedef enum clblasTranspose_ { clblasNoTrans, clblasTrans, clblasConjTrans } clblasTranspose; typedef enum clblasStatus_ { clblasSuccess = CL_SUCCESS, /* Rest is not exposed from here */ clblasNotImplemented = -1024, clblasNotInitialized, clblasInvalidMatA, clblasInvalidMatB, clblasInvalidMatC, clblasInvalidVecX, clblasInvalidVecY, clblasInvalidDim, clblasInvalidLeadDimA, clblasInvalidLeadDimB, clblasInvalidLeadDimC, clblasInvalidIncX, clblasInvalidIncY, clblasInsufficientMemMatA, clblasInsufficientMemMatB, clblasInsufficientMemMatC, clblasInsufficientMemVecX, clblasInsufficientMemVecY, } clblasStatus; /** @endcond */ int load_libclblas(error *); /** @cond NEVER */ #define DEF_PROC(ret, name, args) typedef ret t##name args #include "libclblas.fn" #undef DEF_PROC #define DEF_PROC(ret, name, args) extern t##name *name #include "libclblas.fn" #undef DEF_PROC /** @endcond */ #endif libgpuarray-0.7.6/src/loaders/libclblast.c000066400000000000000000000016351326743622600205640ustar00rootroot00000000000000#include #include "libclblast.h" #include "dyn_load.h" #include "gpuarray/error.h" #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) static const char libname[] = "clblast.dll"; #else /* Unix */ #ifdef __APPLE__ static const char libname[] = "libclblast.dylib"; #else static const char libname[] = "libclblast.so"; #endif #endif #define DEF_PROC(ret, name, args) t##name *name #include "libclblast.fn" #undef DEF_PROC #define DEF_PROC(ret, name, args) \ name = (t##name *)ga_func_ptr(lib, #name, e); \ if (name == NULL) { \ return e->code; \ } static int loaded = 0; int load_libclblast(error *e) { void *lib; if (loaded) return GA_NO_ERROR; lib = ga_load_library(libname, e); if (lib == NULL) return e->code; #include "libclblast.fn" loaded = 1; return GA_NO_ERROR; } libgpuarray-0.7.6/src/loaders/libclblast.fn000066400000000000000000000064601326743622600207460ustar00rootroot00000000000000DEF_PROC(CLBlastStatusCode, CLBlastHdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* 
queue, cl_event* event)); DEF_PROC(CLBlastStatusCode, CLBlastSdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); DEF_PROC(CLBlastStatusCode, CLBlastDdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); DEF_PROC(CLBlastStatusCode, CLBlastHgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_half beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); DEF_PROC(CLBlastStatusCode, CLBlastSgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); DEF_PROC(CLBlastStatusCode, CLBlastDgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); DEF_PROC(CLBlastStatusCode, CLBlastHgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_half beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *event)); DEF_PROC(CLBlastStatusCode, CLBlastSgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *event)); DEF_PROC(CLBlastStatusCode, CLBlastDgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *events)); DEF_PROC(CLBlastStatusCode, CLBlastHger, (Layout order, size_t M, size_t N, cl_half alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); DEF_PROC(CLBlastStatusCode, CLBlastSger, (Layout order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); DEF_PROC(CLBlastStatusCode, CLBlastDger, (Layout order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); libgpuarray-0.7.6/src/loaders/libclblast.h000066400000000000000000000033731326743622600205720ustar00rootroot00000000000000#ifndef LOADER_LIBCLBLAST_H #define LOADER_LIBCLBLAST_H #include "util/error.h" #include "libopencl.h" /** @cond NEVER */ typedef enum Layout_ { kRowMajor = 101, kColMajor = 102 } Layout; typedef enum Transpose_ { kNo = 111, kYes = 112, kConjugate = 113 } Transpose; typedef enum CLBLastStatusCode_ { 
kSuccess = 0, /* Rest is not exposed from here */ CLBlastNotImplemented = -1024, CLBlastInvalidMatrixA = -1022, CLBlastInvalidMatrixB = -1021, CLBlastInvalidMatrixC = -1020, CLBlastInvalidVectorX = -1019, CLBlastInvalidVectorY = -1018, CLBlastInvalidDimension = -1017, CLBlastInvalidLeadDimA = -1016, CLBlastInvalidLeadDimB = -1015, CLBlastInvalidLeadDimC = -1014, CLBlastInvalidIncrementX = -1013, CLBlastInvalidIncrementY = -1012, CLBlastInsufficientMemoryA = -1011, CLBlastInsufficientMemoryB = -1010, CLBlastInsufficientMemoryC = -1009, CLBlastInsufficientMemoryX = -1008, CLBlastInsufficientMemoryY = -1007, CLBlastInvalidLocalMemUsage = -2046, CLBlastNoHalfPrecision = -2045, CLBlastNoDoublePrecision = -2044, CLBlastInvalidVectorScalar = -2043, CLBlastInsufficientMemoryScalar = -2042, CLBlastDatabaseError = -2041, CLBlastUnknownError = -2040, CLBlastUnexpectedError = -2039, } CLBlastStatusCode; /** @endcond */ int load_libclblast(error *); /** @cond NEVER */ #define DEF_PROC(ret, name, args) typedef ret t##name args #include "libclblast.fn" #undef DEF_PROC #define DEF_PROC(ret, name, args) extern t##name *name #include "libclblast.fn" #undef DEF_PROC /** @endcond */ #endif libgpuarray-0.7.6/src/loaders/libcublas.c000066400000000000000000000041341326743622600204060ustar00rootroot00000000000000/* To be able to use snprintf with any compiler including MSVC2008. */ #include #include "libcublas.h" #include "dyn_load.h" #include "gpuarray/error.h" #define DEF_PROC(name, args) t##name *name #define DEF_PROC_V2(name, args) DEF_PROC(name, args) #define DEF_PROC_OPT(name, args) DEF_PROC(name, args) #include "libcublas.fn" #undef DEF_PROC_OPT #undef DEF_PROC_V2 #undef DEF_PROC #define STRINGIFY(X) #X #define DEF_PROC(name, args) \ name = (t##name *)ga_func_ptr(lib, #name, e); \ if (name == NULL) { \ return e->code; \ } #define DEF_PROC_OPT(name, args) \ name = (t##name *)ga_func_ptr(lib, #name, e); #define DEF_PROC_V2(name, args) \ name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2), e); \ if (name == NULL) { \ return e->code; \ } static int loaded = 0; int load_libcublas(int major, int minor, error *e) { void *lib; if (loaded) return GA_NO_ERROR; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) { char libname[64]; int n; #ifdef DEBUG fprintf(stderr, "Loading cuBLAS %d.%d.\n", major, minor); #endif n = snprintf(libname, sizeof(libname), "cublas64_%d%d.dll", major, minor); if (n < 0 || n >= sizeof(libname)) return error_set(e, GA_SYS_ERROR, "snprintf"); lib = ga_load_library(libname, e); } #else /* Unix */ #ifdef __APPLE__ { char libname[128]; int n; #ifdef DEBUG fprintf(stderr, "Loading cuBLAS %d.%d.\n", major, minor); #endif n = snprintf(libname, sizeof(libname), "/Developer/NVIDIA/CUDA-%d.%d/lib/libcublas.dylib", major, minor); if (n < 0 || n >= sizeof(libname)) return error_set(e, GA_SYS_ERROR, "snprintf"); lib = ga_load_library(libname, e); } #else lib = ga_load_library("libcublas.so", e); #endif #endif if (lib == NULL) return e->code; #include "libcublas.fn" loaded = 1; return GA_NO_ERROR; } libgpuarray-0.7.6/src/loaders/libcublas.fn000066400000000000000000000077661326743622600206050ustar00rootroot00000000000000DEF_PROC_V2(cublasCreate, (cublasHandle_t *handle)); DEF_PROC_V2(cublasDestroy, (cublasHandle_t handle)); DEF_PROC_V2(cublasSetStream, (cublasHandle_t handle, cudaStream_t streamId)); DEF_PROC_V2(cublasSetPointerMode, (cublasHandle_t handle, cublasPointerMode_t mode)); DEF_PROC_V2(cublasGetPointerMode, (cublasHandle_t handle, cublasPointerMode_t *mode)); 
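/* Note on the .fn files (comment added for clarity; the mechanism is visible
 * in the matching .c/.h loaders): each consumer defines DEF_PROC (and here
 * DEF_PROC_V2 / DEF_PROC_OPT) before including this list, so the same
 * declarations expand several ways -- as typedefs and extern function
 * pointers in libcublas.h, and as pointer definitions plus the
 *   name = (t##name *)ga_func_ptr(lib, #name, e);
 * assignments in load_libcublas(). DEF_PROC_V2 entries resolve the "name_v2"
 * symbol, and DEF_PROC_OPT entries may legitimately stay NULL when the symbol
 * is absent from the installed cuBLAS. */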
DEF_PROC(cublasSetAtomicsMode, (cublasHandle_t handle, cublasAtomicsMode_t mode)); DEF_PROC_V2(cublasSdot, (cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result)); DEF_PROC_V2(cublasDdot, (cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result)); DEF_PROC_V2(cublasSgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc)); DEF_PROC_V2(cublasDgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, int lda, const double *B, int ldb, const double *beta, double *C, int ldc)); DEF_PROC_V2(cublasSgemv, (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *alpha, const float *A, int lda, const float *x, int incx, const float *beta, float *y, int incy)); DEF_PROC_V2(cublasDgemv, (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *alpha, const double *A, int lda, const double *x, int incx, const double *beta, double *y, int incy)); DEF_PROC_V2(cublasSger, (cublasHandle_t handle, int m, int n, const float *alpha, const float *x, int incx, const float *y, int incy, float *A, int lda)); DEF_PROC_V2(cublasDger, (cublasHandle_t handle, int m, int n, const double *alpha, const double *x, int incx, const double *y, int incy, double *A, int lda)); DEF_PROC_OPT(cublasSgemmEx, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const float *beta, void *C, cudaDataType Ctype, int ldc)); DEF_PROC_OPT(cublasGemmEx, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Ctype, int ldc, cudaDataType_t computeType, cublasGemmAlgo_t algo)); DEF_PROC(cublasSgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *Aarray[], int lda, const float *Barray[], int ldb, const float *beta, float *Carray[], int ldc, int batchCount)); DEF_PROC(cublasDgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *Aarray[], int lda, const double *Barray[], int ldb, const double *beta, double *Carray[], int ldc, int batchCount)); DEF_PROC_OPT(cublasHgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half *alpha, const __half *A, int lda, long long int strideA, const __half *B, int ldb, long long int strideB, const __half *beta, __half *C, int ldc, long long int strideC, int batchCount)); DEF_PROC_OPT(cublasSgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, long long int strideA, const float *B, int ldb, long long int strideB, const float *beta, float *C, int ldc, long long int strideC, int batchCount)); DEF_PROC_OPT(cublasDgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, 
int lda, long long int strideA, const double *B, int ldb, long long int strideB, const double *beta, double *C, int ldc, long long int strideC, int batchCount)); libgpuarray-0.7.6/src/loaders/libcublas.h000066400000000000000000000066461326743622600204250ustar00rootroot00000000000000#ifndef LOADER_LIBCUBLAS_H #define LOADER_LIBCUBLAS_H #include "util/error.h" //TODO: how to have it work with align? typedef struct {//__align__(2) { unsigned short x; } __half; /** @cond NEVER */ #ifdef _WIN32 #define CUBLASWINAPI __stdcall #else #define CUBLASWINAPI #endif typedef enum cudaDataType_t { CUDA_R_16F= 2, // real as a half CUDA_C_16F= 6, // complex as a pair of half numbers CUDA_R_32F= 0, // real as a float CUDA_C_32F= 4, // complex as a pair of float numbers CUDA_R_64F= 1, // real as a double CUDA_C_64F= 5, // complex as a pair of double numbers CUDA_R_8I= 3, // real as a signed char CUDA_C_8I= 7, // complex as a pair of signed char numbers CUDA_R_8U= 8, // real as a unsigned char CUDA_C_8U= 9, // complex as a pair of unsigned char numbers CUDA_R_32I= 10, // real as a signed int CUDA_C_32I= 11, // complex as a pair of signed int numbers CUDA_R_32U= 12, // real as a unsigned int CUDA_C_32U= 13 // complex as a pair of unsigned int numbers } cudaDataType; typedef cudaDataType cudaDataType_t; typedef enum { CUBLAS_GEMM_DFALT = -1, CUBLAS_GEMM_ALGO0 = 0, CUBLAS_GEMM_ALGO1 = 1, CUBLAS_GEMM_ALGO2 = 2, CUBLAS_GEMM_ALGO3 = 3, CUBLAS_GEMM_ALGO4 = 4, CUBLAS_GEMM_ALGO5 = 5, CUBLAS_GEMM_ALGO6 = 6, CUBLAS_GEMM_ALGO7 = 7, CUBLAS_GEMM_ALGO8 = 8, CUBLAS_GEMM_ALGO9 = 9, CUBLAS_GEMM_ALGO10 = 10, CUBLAS_GEMM_ALGO11 = 11, CUBLAS_GEMM_ALGO12 = 12, CUBLAS_GEMM_ALGO13 = 13, CUBLAS_GEMM_ALGO14 = 14, CUBLAS_GEMM_ALGO15 = 15, CUBLAS_GEMM_ALGO16 = 16, CUBLAS_GEMM_ALGO17 = 17, CUBLAS_GEMM_DFALT_TENSOR_OP = 99, CUBLAS_GEMM_ALGO0_TENSOR_OP = 100, CUBLAS_GEMM_ALGO1_TENSOR_OP = 101, CUBLAS_GEMM_ALGO2_TENSOR_OP = 102 } cublasGemmAlgo_t; typedef struct CUstream_st *cudaStream_t; typedef enum { CUBLAS_STATUS_SUCCESS =0, CUBLAS_STATUS_NOT_INITIALIZED =1, CUBLAS_STATUS_ALLOC_FAILED =3, CUBLAS_STATUS_INVALID_VALUE =7, CUBLAS_STATUS_ARCH_MISMATCH =8, CUBLAS_STATUS_MAPPING_ERROR =11, CUBLAS_STATUS_EXECUTION_FAILED=13, CUBLAS_STATUS_INTERNAL_ERROR =14, CUBLAS_STATUS_NOT_SUPPORTED =15, CUBLAS_STATUS_LICENSE_ERROR =16 } cublasStatus_t; typedef enum { CUBLAS_OP_N=0, CUBLAS_OP_T=1, CUBLAS_OP_C=2 } cublasOperation_t; typedef enum { CUBLAS_POINTER_MODE_HOST = 0, CUBLAS_POINTER_MODE_DEVICE = 1 } cublasPointerMode_t; typedef enum { CUBLAS_ATOMICS_NOT_ALLOWED = 0, CUBLAS_ATOMICS_ALLOWED = 1 } cublasAtomicsMode_t; typedef struct cublasContext *cublasHandle_t; /** @endcond */ int load_libcublas(int major, int minor, error *e); /** @cond NEVER */ #define DEF_PROC(name, args) typedef cublasStatus_t CUBLASWINAPI t##name args #define DEF_PROC_V2(name, args) DEF_PROC(name, args) #define DEF_PROC_OPT(name, args) DEF_PROC(name, args) #include "libcublas.fn" #undef DEF_PROC_OPT #undef DEF_PROC_V2 #undef DEF_PROC #define DEF_PROC(name, args) extern t##name *name #define DEF_PROC_V2(name, args) DEF_PROC(name, args) #define DEF_PROC_OPT(name, args) DEF_PROC(name, args) #include "libcublas.fn" #undef DEF_PROC_OPT #undef DEF_PROC_V2 #undef DEF_PROC /** @endcond */ #endif libgpuarray-0.7.6/src/loaders/libcuda.c000066400000000000000000000025431326743622600200530ustar00rootroot00000000000000#include #include #include "libcuda.h" #include "dyn_load.h" #include "gpuarray/error.h" #include "util/error.h" /* This code is inspired from the dynamic loading code in the 
samples */ #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) static char libname[] = "nvcuda.dll"; #else /* Unix */ #ifdef __APPLE__ static char libname[] = "/Library/Frameworks/CUDA.framework/CUDA"; #else static char libname[] = "libcuda.so"; #endif #endif #define DEF_PROC(name, args) t##name *name #define DEF_PROC_V2(name, args) DEF_PROC(name, args) #include "libcuda.fn" #undef DEF_PROC_V2 #undef DEF_PROC #define STRINGIFY(X) #X #define DEF_PROC(name, args) \ name = (t##name *)ga_func_ptr(lib, #name, e); \ if (name == NULL) { \ return e->code; \ } #define DEF_PROC_V2(name, args) \ name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2), e); \ if (name == NULL) { \ return e->code; \ } static int loaded = 0; int load_libcuda(error *e) { void *lib; if (loaded) return GA_NO_ERROR; lib = ga_load_library(libname, e); if (lib == NULL) return e->code; #include "libcuda.fn" loaded = 1; return GA_NO_ERROR; } libgpuarray-0.7.6/src/loaders/libcuda.fn000066400000000000000000000072041326743622600202330ustar00rootroot00000000000000DEF_PROC(cuInit, (int flags)); DEF_PROC(cuDriverGetVersion, (int *driverVersion)); DEF_PROC(cuGetErrorName, (CUresult error, const char **pStr)); DEF_PROC(cuGetErrorString, (CUresult error, const char **pStr)); DEF_PROC(cuDeviceGet, (CUdevice *device, int ordinal)); DEF_PROC(cuDeviceGetCount, (int *count)); DEF_PROC(cuDeviceGetName, (char *name, int len, CUdevice dev)); DEF_PROC(cuDeviceGetAttribute, (int *pi, CUdevice_attribute attrib, CUdevice dev)); DEF_PROC(cuDeviceGetPCIBusId, (char *pciBusId, int len, CUdevice dev)); DEF_PROC(cuDevicePrimaryCtxGetState, (CUdevice dev, unsigned int *flags, int *active)); DEF_PROC(cuDevicePrimaryCtxSetFlags, (CUdevice dev, unsigned int flags)); DEF_PROC(cuDevicePrimaryCtxRelease, (CUdevice dev)); DEF_PROC(cuDevicePrimaryCtxRetain, (CUcontext *pctx, CUdevice dev)); DEF_PROC(cuCtxGetDevice, (CUdevice *device)); DEF_PROC_V2(cuCtxPushCurrent, (CUcontext ctx)); DEF_PROC_V2(cuCtxPopCurrent, (CUcontext *pctx)); DEF_PROC(cuLinkCreate, (unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut)); DEF_PROC(cuLinkAddData, (CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues)); DEF_PROC(cuLinkComplete, (CUlinkState state, void **cubinOut, size_t *sizeOut)); DEF_PROC(cuLinkDestroy, (CUlinkState state)); DEF_PROC(cuModuleLoadData, (CUmodule *module, const void *image)); DEF_PROC(cuModuleLoadDataEx, (CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues)); DEF_PROC(cuModuleUnload, (CUmodule hmod)); DEF_PROC(cuModuleGetFunction, (CUfunction *hfunc, CUmodule hmod, const char *name)); DEF_PROC_V2(cuMemGetInfo, (size_t *free, size_t *total)); DEF_PROC_V2(cuMemAlloc, (CUdeviceptr *dptr, size_t bytesize)); DEF_PROC_V2(cuMemFree, (CUdeviceptr dptr)); DEF_PROC_V2(cuMemAllocHost, (void **pp, size_t bytesize)); DEF_PROC(cuMemFreeHost, (void *p)); DEF_PROC_V2(cuMemcpyHtoDAsync, (CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream)); DEF_PROC_V2(cuMemcpyHtoD, (CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)); DEF_PROC_V2(cuMemcpyDtoHAsync, (void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)); DEF_PROC_V2(cuMemcpyDtoDAsync, (CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)); DEF_PROC(cuMemcpyPeerAsync, (CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr 
srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream)); DEF_PROC(cuMemsetD8Async, (CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream)); DEF_PROC(cuLaunchKernel, (CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra)); DEF_PROC(cuFuncGetAttribute, (int *pi, CUfunction_attribute attrib, CUfunction hfunc)); DEF_PROC(cuEventCreate, (CUevent *phEvent, unsigned int Flags)); DEF_PROC(cuEventRecord, (CUevent hEvent, CUstream hStream)); DEF_PROC(cuEventSynchronize, (CUevent hEvent)); DEF_PROC_V2(cuEventDestroy, (CUevent hEvent)); DEF_PROC(cuStreamCreate, (CUstream *phStream, unsigned int Flags)); DEF_PROC(cuStreamWaitEvent, (CUstream hStream, CUevent hEvent, unsigned int Flags)); DEF_PROC(cuStreamSynchronize, (CUstream hStream)); DEF_PROC_V2(cuStreamDestroy, (CUstream hStream)); DEF_PROC(cuIpcGetMemHandle, (CUipcMemHandle *pHandle, CUdeviceptr dptr)); DEF_PROC(cuIpcOpenMemHandle, (CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags)); DEF_PROC(cuIpcCloseMemHandle, (CUdeviceptr dptr)); libgpuarray-0.7.6/src/loaders/libcuda.h000066400000000000000000000177711326743622600200710ustar00rootroot00000000000000#ifndef LOADER_LIBCUDA_H #define LOADER_LIBCUDA_H #include "util/error.h" /** @cond NEVER */ #ifdef _WIN32 #define CUDAAPI __stdcall #else #define CUDAAPI #endif typedef enum { CUDA_SUCCESS = 0 } CUresult; #if defined(_WIN64) || defined(__LP64__) typedef unsigned long long CUdeviceptr; #else typedef unsigned int CUdeviceptr; #endif typedef int CUdevice; typedef struct CUctx_st *CUcontext; typedef struct CUmod_st *CUmodule; typedef struct CUfunc_st *CUfunction; typedef struct CUevent_st *CUevent; typedef struct CUstream_st *CUstream; typedef struct CUlinkState_st *CUlinkState; typedef enum CUdevice_attribute_enum CUdevice_attribute; typedef enum CUfunction_attribute_enum CUfunction_attribute; typedef enum CUevent_flags_enum CUevent_flags; typedef enum CUctx_flags_enum CUctx_flags; typedef enum CUipcMem_flags_enum CUipcMem_flags; typedef enum CUjit_option_enum CUjit_option; typedef enum CUjitInputType_enum CUjitInputType; #define CU_IPC_HANDLE_SIZE 64 typedef struct CUipcMemHandle_st { char reserved[CU_IPC_HANDLE_SIZE]; } CUipcMemHandle; /** @endcond */ int load_libcuda(error *); /** @cond NEVER */ #define DEF_PROC(name, args) typedef CUresult CUDAAPI t##name args #define DEF_PROC_V2(name, args) DEF_PROC(name, args) #include "libcuda.fn" #undef DEF_PROC_V2 #undef DEF_PROC #define DEF_PROC(name, args) extern t##name *name #define DEF_PROC_V2(name, args) DEF_PROC(name, args) #include "libcuda.fn" #undef DEF_PROC_V2 #undef DEF_PROC enum CUdevice_attribute_enum { CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, 
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, 
CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91 }; enum CUfunction_attribute_enum { CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, CU_FUNC_ATTRIBUTE_NUM_REGS = 4, CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7 }; enum CUevent_flags_enum { CU_EVENT_DEFAULT = 0x0, CU_EVENT_BLOCKING_SYNC = 0x1, CU_EVENT_DISABLE_TIMING = 0x2, CU_EVENT_INTERPROCESS = 0x4 }; enum CUctx_flags_enum { CU_CTX_SCHED_AUTO = 0x00, CU_CTX_SCHED_SPIN = 0x01, CU_CTX_SCHED_YIELD = 0x02, CU_CTX_SCHED_BLOCKING_SYNC = 0x04, CU_CTX_BLOCKING_SYNC = 0x04, CU_CTX_MAP_HOST = 0x08, }; enum CUipcMem_flags_enum { CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 }; enum CUjit_option_enum { CU_JIT_MAX_REGISTERS = 0, CU_JIT_THREADS_PER_BLOCK, CU_JIT_WALL_TIME, CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_OPTIMIZATION_LEVEL, CU_JIT_TARGET_FROM_CUCONTEXT, CU_JIT_TARGET, CU_JIT_FALLBACK_STRATEGY, CU_JIT_GENERATE_DEBUG_INFO, CU_JIT_LOG_VERBOSE, CU_JIT_GENERATE_LINE_INFO, CU_JIT_CACHE_MODE, CU_JIT_NEW_SM3X_OPT, CU_JIT_FAST_COMPILE, CU_JIT_NUM_OPTIONS }; enum CUjitInputType_enum { CU_JIT_INPUT_CUBIN = 0, CU_JIT_INPUT_PTX, CU_JIT_INPUT_FATBINARY, CU_JIT_INPUT_OBJECT, CU_JIT_INPUT_LIBRARY, CU_JIT_NUM_INPUT_TYPES }; /** @endcond */ #endif libgpuarray-0.7.6/src/loaders/libnccl.c000066400000000000000000000022401326743622600200500ustar00rootroot00000000000000#include #include "libnccl.h" #include "dyn_load.h" #include "gpuarray/error.h" #define DEF_PROC(ret, name, args) t##name *name #include "libnccl.fn" #undef DEF_PROC #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) || defined(__APPLE__) /* As far as we know, nccl is not available or buildable on platforms other than linux */ int load_libnccl(error *e) { return error_set(e, GA_UNSUPPORTED_ERROR, "NCCL is not available on plaforms other than linux."); } #else /* Unix */ static const char libname[] = "libnccl.so"; #define DEF_PROC(ret, name, args) \ name = (t##name *)ga_func_ptr(lib, #name, e); \ if (name == NULL) { \ return e->code; \ } static int loaded = 0; int load_libnccl(error *e) { void *lib; if (loaded) return GA_NO_ERROR; lib = ga_load_library(libname, e); if (lib == NULL) return e->code; #include "libnccl.fn" if (ga_func_ptr(lib, "ncclGroupStart", e) == NULL) return error_set(e, GA_LOAD_ERROR, "Found NCCL 1.0 but NCCL 2.0 required"); loaded = 1; return GA_NO_ERROR; } #endif libgpuarray-0.7.6/src/loaders/libnccl.fn000066400000000000000000000023671326743622600202430ustar00rootroot00000000000000DEF_PROC(ncclResult_t, ncclGetUniqueId, (ncclUniqueId* uniqueId)); DEF_PROC(ncclResult_t, ncclCommInitRank, (ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank)); DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm)); 
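/* Usage sketch (illustrative, added for clarity): once load_libnccl() has
 * returned GA_NO_ERROR these entries are ordinary function pointers, e.g.
 *   ncclUniqueId id;
 *   if (ncclGetUniqueId(&id) == ncclSuccess) { ... share id with the other
 *     ranks, then call ncclCommInitRank() ... }
 * load_libnccl() also requires NCCL 2.0, which it checks by probing for the
 * ncclGroupStart symbol. */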
DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count)); DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank)); DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result)); DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)); DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream )); DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream)); DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream )); DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream)); libgpuarray-0.7.6/src/loaders/libnccl.h000066400000000000000000000024521326743622600200620ustar00rootroot00000000000000#ifndef LOADER_LIBNCCL_H #define LOADER_LIBNCCL_H #include "util/error.h" /** @cond NEVER */ typedef struct CUstream_st *cudaStream_t; typedef struct ncclComm* ncclComm_t; #define NCCL_UNIQUE_ID_BYTES 128 typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; typedef enum { ncclSuccess = 0 } ncclResult_t; /* Reduction operation selector */ typedef enum { ncclSum = 0, ncclProd = 1, ncclMax = 2, ncclMin = 3, ncclNumOps = 4 } ncclRedOp_t; /* Data types */ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclUint8 = 1, ncclInt32 = 2, ncclInt = 2, ncclUint32 = 3, ncclInt64 = 4, ncclUint64 = 5, ncclFloat16 = 6, ncclHalf = 6, ncclFloat32 = 7, ncclFloat = 7, ncclFloat64 = 8, ncclDouble = 8, ncclNumTypes = 9 } ncclDataType_t; /** @endcond */ int load_libnccl(error *e); /* @cond NEVER */ #define DEF_PROC(ret, name, args) typedef ret t##name args #include "libnccl.fn" #undef DEF_PROC #define DEF_PROC(ret, name, args) extern t##name *name #include "libnccl.fn" #undef DEF_PROC /** @endcond */ #endif libgpuarray-0.7.6/src/loaders/libnvrtc.c000066400000000000000000000031141326743622600202660ustar00rootroot00000000000000/* To be able to use snprintf with any compiler including MSVC2008. 
*/ #include #include "libcuda.h" #include "libnvrtc.h" #include "dyn_load.h" #include "gpuarray/error.h" #define DEF_PROC(rt, name, args) t##name *name #include "libnvrtc.fn" #undef DEF_PROC #define DEF_PROC(rt, name, args) \ name = (t##name *)ga_func_ptr(lib, #name, e); \ if (name == NULL) { \ return e->code; \ } static int loaded = 0; int load_libnvrtc(int major, int minor, error *e) { void *lib; if (loaded) return GA_NO_ERROR; #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) { char libname[64]; int n; #ifdef DEBUG fprintf(stderr, "Loading nvrtc %d.%d.\n", major, minor); #endif n = snprintf(libname, sizeof(libname), "nvrtc64_%d%d.dll", major, minor); if (n < 0 || n >= sizeof(libname)) return error_set(e, GA_SYS_ERROR, "snprintf"); lib = ga_load_library(libname, e); } #else /* Unix */ #ifdef __APPLE__ { char libname[128]; int n; #ifdef DEBUG fprintf(stderr, "Loading nvrtc %d.%d.\n", major, minor); #endif n = snprintf(libname, sizeof(libname), "/Developer/NVIDIA/CUDA-%d.%d/lib/libnvrtc.dylib", major, minor); if (n < 0 || n >= sizeof(libname)) return error_set(e, GA_SYS_ERROR, "snprintf"); lib = ga_load_library(libname, e); } #else lib = ga_load_library("libnvrtc.so", e); #endif #endif if (lib == NULL) return e->code; #include "libnvrtc.fn" loaded = 1; return GA_NO_ERROR; } libgpuarray-0.7.6/src/loaders/libnvrtc.fn000066400000000000000000000013111326743622600204440ustar00rootroot00000000000000DEF_PROC(nvrtcResult, nvrtcCreateProgram, (nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames)); DEF_PROC(nvrtcResult, nvrtcCompileProgram, (nvrtcProgram prog, int numOptions, const char **options)); DEF_PROC(nvrtcResult, nvrtcDestroyProgram, (nvrtcProgram *prog)); DEF_PROC(nvrtcResult, nvrtcGetProgramLog, (nvrtcProgram prog, char *log)); DEF_PROC(nvrtcResult, nvrtcGetProgramLogSize, (nvrtcProgram prog, size_t *logSizeRet)); DEF_PROC(nvrtcResult, nvrtcGetPTX, (nvrtcProgram prog, char *ptx)); DEF_PROC(nvrtcResult, nvrtcGetPTXSize, (nvrtcProgram prog, size_t *ptxSizeRet)); DEF_PROC(const char *, nvrtcGetErrorString, (nvrtcResult result)); libgpuarray-0.7.6/src/loaders/libnvrtc.h000066400000000000000000000007671326743622600203060ustar00rootroot00000000000000#ifndef LOADER_LIBNVRTC_H #define LOADER_LIBNVRTC_H #include "util/error.h" /** @cond NEVER */ typedef enum { NVRTC_SUCCESS = 0, } nvrtcResult; typedef struct _nvrtcProgram *nvrtcProgram; /** @endcond */ int load_libnvrtc(int major, int minor, error *e); /** @cond NEVER */ #define DEF_PROC(rt, name, args) typedef rt t##name args #include "libnvrtc.fn" #undef DEF_PROC #define DEF_PROC(rt, name, args) extern t##name *name #include "libnvrtc.fn" #undef DEF_PROC /** @endcond */ #endif libgpuarray-0.7.6/src/loaders/libopencl.c000066400000000000000000000016471326743622600204230ustar00rootroot00000000000000#include #include "libopencl.h" #include "dyn_load.h" #include "gpuarray/error.h" #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) static char libname[] = "OpenCL.dll"; #else /* Unix */ #ifdef __APPLE__ static char libname[] = "/System/Library/Frameworks/OpenCL.framework/OpenCL"; #else static char libname[] = "libOpenCL.so"; #endif #endif #define DEF_PROC(ret, name, args) t##name *name #include "libopencl.fn" #undef DEF_PROC #define DEF_PROC(ret, name, args) \ name = (t##name *)ga_func_ptr(lib, #name, e); \ if (name == NULL) { \ return e->code; \ } static int loaded = 0; int load_libopencl(error *e) { void *lib; if (loaded) return 
GA_NO_ERROR; lib = ga_load_library(libname, e); if (lib == NULL) return e->code; #include "libopencl.fn" loaded = 1; return GA_NO_ERROR; } libgpuarray-0.7.6/src/loaders/libopencl.fn000066400000000000000000000061261326743622600206010ustar00rootroot00000000000000DEF_PROC(cl_context, clCreateContext, (const cl_context_properties *, cl_uint, const cl_device_id *, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *)); DEF_PROC(cl_int, clCompileProgram, (cl_program, cl_uint, const cl_device_id *, const char *, cl_uint, cl_program *, const char **, void (CL_CALLBACK *)(cl_program, void *), void *)); DEF_PROC(cl_program, clLinkProgram, (cl_context, cl_uint, const cl_device_id *, const char *, cl_uint, const cl_program *, void (CL_CALLBACK *)(cl_program, void *), void *, cl_int *)); DEF_PROC(cl_mem, clCreateBuffer, (cl_context, cl_mem_flags, size_t, void *, cl_int *)); DEF_PROC(cl_command_queue, clCreateCommandQueue, (cl_context, cl_device_id, cl_command_queue_properties, cl_int *)); DEF_PROC(cl_kernel, clCreateKernel, (cl_program, const char *, cl_int *)); DEF_PROC(cl_program, clCreateProgramWithBinary, (cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *)); DEF_PROC(cl_program, clCreateProgramWithSource, (cl_context, cl_uint, const char **, const size_t *, cl_int *)); DEF_PROC(cl_int, clEnqueueReadBuffer, (cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *)); DEF_PROC(cl_int, clEnqueueWriteBuffer, (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *)); DEF_PROC(cl_int, clEnqueueCopyBuffer, (cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *)); DEF_PROC(cl_int, clEnqueueNDRangeKernel, (cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *)); DEF_PROC(cl_int, clGetContextInfo, (cl_context, cl_context_info, size_t, void *, size_t *)); DEF_PROC(cl_int, clGetDeviceIDs, (cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *)); DEF_PROC(cl_int, clGetDeviceInfo, (cl_device_id, cl_device_info, size_t, void *, size_t *)); DEF_PROC(cl_int, clGetKernelInfo, (cl_kernel, cl_kernel_info, size_t, void *, size_t *)); DEF_PROC(cl_int, clGetKernelWorkGroupInfo, (cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *)); DEF_PROC(cl_int, clGetMemObjectInfo, (cl_mem, cl_mem_info, size_t, void *, size_t *)); DEF_PROC(cl_int, clGetPlatformIDs, (cl_uint, cl_platform_id *, cl_uint *)); DEF_PROC(cl_int, clGetProgramBuildInfo, (cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *)); DEF_PROC(cl_int, clGetProgramInfo, (cl_program, cl_program_info, size_t, void *, size_t *)); DEF_PROC(cl_int, clReleaseCommandQueue, (cl_command_queue)); DEF_PROC(cl_int, clReleaseContext, (cl_context)); DEF_PROC(cl_int, clReleaseEvent, (cl_event)); DEF_PROC(cl_int, clReleaseKernel, (cl_kernel)); DEF_PROC(cl_int, clReleaseMemObject, (cl_mem)); DEF_PROC(cl_int, clReleaseProgram, (cl_program)); DEF_PROC(cl_int, clRetainContext, (cl_context)); DEF_PROC(cl_int, clRetainEvent, (cl_event)); DEF_PROC(cl_int, clRetainMemObject, (cl_mem)); DEF_PROC(cl_int, clSetKernelArg, (cl_kernel, cl_uint, size_t, const void *)); DEF_PROC(cl_int, clWaitForEvents, (cl_uint, const cl_event *));libgpuarray-0.7.6/src/loaders/libopencl.h000066400000000000000000000377551326743622600204410ustar00rootroot00000000000000#ifndef 
LOADER_LIBOPENCL_H #define LOADER_LIBOPENCL_H #include "util/error.h" /** @cond NEVER */ #if defined(_WIN32) #define CL_API_CALL __stdcall #define CL_CALLBACK __stdcall #else #define CL_API_CALL #define CL_CALLBACK #endif typedef struct _cl_platform_id * cl_platform_id; typedef struct _cl_device_id * cl_device_id; typedef struct _cl_context * cl_context; typedef struct _cl_command_queue * cl_command_queue; typedef struct _cl_mem * cl_mem; typedef struct _cl_program * cl_program; typedef struct _cl_kernel * cl_kernel; typedef struct _cl_event * cl_event; #if (defined (_WIN32) && defined(_MSC_VER)) typedef signed __int32 cl_int; typedef unsigned __int32 cl_uint; typedef signed __int64 cl_long; typedef unsigned __int64 cl_ulong; typedef unsigned __int16 cl_half; typedef float cl_float; typedef double cl_double; #else #include typedef int32_t cl_int __attribute__((aligned(4))); typedef uint32_t cl_uint __attribute__((aligned(4))); typedef int64_t cl_long __attribute__((aligned(8))); typedef uint64_t cl_ulong __attribute__((aligned(8))); typedef uint16_t cl_half __attribute__((aligned(2))); typedef float cl_float __attribute__((aligned(4))); typedef double cl_double __attribute__((aligned(8))); #endif typedef cl_uint cl_bool; typedef cl_ulong cl_bitfield; typedef cl_uint cl_device_info; typedef cl_bitfield cl_device_type; typedef cl_bitfield cl_command_queue_properties; typedef intptr_t cl_context_properties; typedef cl_uint cl_context_info; typedef cl_uint cl_mem_info; typedef cl_bitfield cl_mem_flags; typedef cl_uint cl_program_info; typedef cl_uint cl_program_build_info; typedef cl_uint cl_kernel_info; typedef cl_uint cl_kernel_work_group_info; /** @endcond */ int load_libopencl(error *); /** @cond NEVER */ #define DEF_PROC(ret, name, args) typedef ret CL_API_CALL t##name args #include "libopencl.fn" #undef DEF_PROC #define DEF_PROC(ret, name, args) extern t##name *name #include "libopencl.fn" #undef DEF_PROC /* What follows is a bunch of defines from the official OpenCL spec. * This allows us to build even if there are no OpenCL implementation * present. 
*/ /* Error codes */ #define CL_SUCCESS 0 #define CL_DEVICE_NOT_FOUND -1 #define CL_DEVICE_NOT_AVAILABLE -2 #define CL_COMPILER_NOT_AVAILABLE -3 #define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 #define CL_OUT_OF_RESOURCES -5 #define CL_OUT_OF_HOST_MEMORY -6 #define CL_PROFILING_INFO_NOT_AVAILABLE -7 #define CL_MEM_COPY_OVERLAP -8 #define CL_IMAGE_FORMAT_MISMATCH -9 #define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 #define CL_BUILD_PROGRAM_FAILURE -11 #define CL_MAP_FAILURE -12 #define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 #define CL_COMPILE_PROGRAM_FAILURE -15 #define CL_LINKER_NOT_AVAILABLE -16 #define CL_LINK_PROGRAM_FAILURE -17 #define CL_DEVICE_PARTITION_FAILED -18 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 #define CL_INVALID_VALUE -30 #define CL_INVALID_DEVICE_TYPE -31 #define CL_INVALID_PLATFORM -32 #define CL_INVALID_DEVICE -33 #define CL_INVALID_CONTEXT -34 #define CL_INVALID_QUEUE_PROPERTIES -35 #define CL_INVALID_COMMAND_QUEUE -36 #define CL_INVALID_HOST_PTR -37 #define CL_INVALID_MEM_OBJECT -38 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 #define CL_INVALID_IMAGE_SIZE -40 #define CL_INVALID_SAMPLER -41 #define CL_INVALID_BINARY -42 #define CL_INVALID_BUILD_OPTIONS -43 #define CL_INVALID_PROGRAM -44 #define CL_INVALID_PROGRAM_EXECUTABLE -45 #define CL_INVALID_KERNEL_NAME -46 #define CL_INVALID_KERNEL_DEFINITION -47 #define CL_INVALID_KERNEL -48 #define CL_INVALID_ARG_INDEX -49 #define CL_INVALID_ARG_VALUE -50 #define CL_INVALID_ARG_SIZE -51 #define CL_INVALID_KERNEL_ARGS -52 #define CL_INVALID_WORK_DIMENSION -53 #define CL_INVALID_WORK_GROUP_SIZE -54 #define CL_INVALID_WORK_ITEM_SIZE -55 #define CL_INVALID_GLOBAL_OFFSET -56 #define CL_INVALID_EVENT_WAIT_LIST -57 #define CL_INVALID_EVENT -58 #define CL_INVALID_OPERATION -59 #define CL_INVALID_GL_OBJECT -60 #define CL_INVALID_BUFFER_SIZE -61 #define CL_INVALID_MIP_LEVEL -62 #define CL_INVALID_GLOBAL_WORK_SIZE -63 #define CL_INVALID_PROPERTY -64 #define CL_INVALID_IMAGE_DESCRIPTOR -65 #define CL_INVALID_COMPILER_OPTIONS -66 #define CL_INVALID_LINKER_OPTIONS -67 #define CL_INVALID_DEVICE_PARTITION_COUNT -68 #define CL_INVALID_PIPE_SIZE -69 #define CL_INVALID_DEVICE_QUEUE -70 #define CL_FALSE 0 #define CL_TRUE 1 /* cl_device_type - bitfield */ #define CL_DEVICE_TYPE_DEFAULT (1 << 0) #define CL_DEVICE_TYPE_CPU (1 << 1) #define CL_DEVICE_TYPE_GPU (1 << 2) #define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) #define CL_DEVICE_TYPE_CUSTOM (1 << 4) #define CL_DEVICE_TYPE_ALL 0xFFFFFFFF /* cl_context_info */ #define CL_CONTEXT_REFERENCE_COUNT 0x1080 #define CL_CONTEXT_DEVICES 0x1081 #define CL_CONTEXT_PROPERTIES 0x1082 #define CL_CONTEXT_NUM_DEVICES 0x1083 /* cl_device_info */ #define CL_DEVICE_TYPE 0x1000 #define CL_DEVICE_VENDOR_ID 0x1001 #define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 #define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 #define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B #define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C #define CL_DEVICE_ADDRESS_BITS 0x100D #define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F #define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 #define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 #define 
CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 #define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 #define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 #define CL_DEVICE_IMAGE_SUPPORT 0x1016 #define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 #define CL_DEVICE_MAX_SAMPLERS 0x1018 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A #define CL_DEVICE_SINGLE_FP_CONFIG 0x101B #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E #define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 #define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 #define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 #define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 #define CL_DEVICE_ENDIAN_LITTLE 0x1026 #define CL_DEVICE_AVAILABLE 0x1027 #define CL_DEVICE_COMPILER_AVAILABLE 0x1028 #define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 #define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ #define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A #define CL_DEVICE_NAME 0x102B #define CL_DEVICE_VENDOR 0x102C #define CL_DRIVER_VERSION 0x102D #define CL_DEVICE_PROFILE 0x102E #define CL_DEVICE_VERSION 0x102F #define CL_DEVICE_EXTENSIONS 0x1030 #define CL_DEVICE_PLATFORM 0x1031 #define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 #define CL_DEVICE_HALF_FP_CONFIG 0x1033 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 #define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C #define CL_DEVICE_OPENCL_C_VERSION 0x103D #define CL_DEVICE_LINKER_AVAILABLE 0x103E #define CL_DEVICE_BUILT_IN_KERNELS 0x103F #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 #define CL_DEVICE_PARENT_DEVICE 0x1042 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 #define CL_DEVICE_PARTITION_PROPERTIES 0x1044 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 #define CL_DEVICE_PARTITION_TYPE 0x1046 #define CL_DEVICE_REFERENCE_COUNT 0x1047 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 #define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B #define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C #define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D #define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E #define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F #define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 #define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 #define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 #define CL_DEVICE_SVM_CAPABILITIES 0x1053 #define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 #define CL_DEVICE_MAX_PIPE_ARGS 0x1055 #define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 #define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 #define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 #define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 #define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A #define CL_DEVICE_IL_VERSION 0x105B #define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C #define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D /* 
cl_command_queue_properties - bitfield */ #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) #define CL_QUEUE_PROFILING_ENABLE (1 << 1) #define CL_QUEUE_ON_DEVICE (1 << 2) #define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) /* cl_mem_info */ #define CL_MEM_TYPE 0x1100 #define CL_MEM_FLAGS 0x1101 #define CL_MEM_SIZE 0x1102 #define CL_MEM_HOST_PTR 0x1103 #define CL_MEM_MAP_COUNT 0x1104 #define CL_MEM_REFERENCE_COUNT 0x1105 #define CL_MEM_CONTEXT 0x1106 #define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 #define CL_MEM_OFFSET 0x1108 #define CL_MEM_USES_SVM_POINTER 0x1109 /* cl_context_properties */ #define CL_CONTEXT_PLATFORM 0x1084 #define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 /* cl_mem_flags and cl_svm_mem_flags - bitfield */ #define CL_MEM_READ_WRITE (1 << 0) #define CL_MEM_WRITE_ONLY (1 << 1) #define CL_MEM_READ_ONLY (1 << 2) #define CL_MEM_USE_HOST_PTR (1 << 3) #define CL_MEM_ALLOC_HOST_PTR (1 << 4) #define CL_MEM_COPY_HOST_PTR (1 << 5) /* reserved (1 << 6) */ #define CL_MEM_HOST_WRITE_ONLY (1 << 7) #define CL_MEM_HOST_READ_ONLY (1 << 8) #define CL_MEM_HOST_NO_ACCESS (1 << 9) #define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ #define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ #define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) /* cl_program_build_info */ #define CL_PROGRAM_BUILD_STATUS 0x1181 #define CL_PROGRAM_BUILD_OPTIONS 0x1182 #define CL_PROGRAM_BUILD_LOG 0x1183 #define CL_PROGRAM_BINARY_TYPE 0x1184 #define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 /* cl_kernel_info */ #define CL_KERNEL_FUNCTION_NAME 0x1190 #define CL_KERNEL_NUM_ARGS 0x1191 #define CL_KERNEL_REFERENCE_COUNT 0x1192 #define CL_KERNEL_CONTEXT 0x1193 #define CL_KERNEL_PROGRAM 0x1194 #define CL_KERNEL_ATTRIBUTES 0x1195 #define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9 #define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA /* cl_program_info */ #define CL_PROGRAM_REFERENCE_COUNT 0x1160 #define CL_PROGRAM_CONTEXT 0x1161 #define CL_PROGRAM_NUM_DEVICES 0x1162 #define CL_PROGRAM_DEVICES 0x1163 #define CL_PROGRAM_SOURCE 0x1164 #define CL_PROGRAM_BINARY_SIZES 0x1165 #define CL_PROGRAM_BINARIES 0x1166 #define CL_PROGRAM_NUM_KERNELS 0x1167 #define CL_PROGRAM_KERNEL_NAMES 0x1168 #define CL_PROGRAM_IL 0x1169 /* cl_kernel_work_group_info */ #define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 #define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 #define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 #define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 #define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 /** @endcond */ #endif libgpuarray-0.7.6/src/private.h000066400000000000000000000354621326743622600165040ustar00rootroot00000000000000#ifndef _PRIVATE #define _PRIVATE /** \cond INTERNAL_DOCS */ /* * This file contains function definition that are shared in multiple * files but not exposed in the interface. 
*/ #include "private_config.h" #include #include #include #include #include #include "util/strb.h" #include "util/error.h" #include "cache.h" #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif #define ADDR32_MAX 4294967295L #define SADDR32_MIN -2147483648L #define SADDR32_MAX 2147483647L struct _gpuarray_buffer_ops; typedef struct _gpuarray_buffer_ops gpuarray_buffer_ops; struct _gpuarray_blas_ops; typedef struct _gpuarray_blas_ops gpuarray_blas_ops; struct _gpuarray_comm_ops; typedef struct _gpuarray_comm_ops gpuarray_comm_ops; #define GPUCONTEXT_HEAD \ const gpuarray_buffer_ops *ops; \ const gpuarray_blas_ops *blas_ops; \ const gpuarray_comm_ops *comm_ops; \ void *blas_handle; \ error *err; \ unsigned int refcnt; \ int flags; \ struct _gpudata *errbuf; \ cache *extcopy_cache; \ char bin_id[64]; \ char tag[8] /* These will go away eventually but are kept to ease the transition for now */ #define GA_CTX_SINGLE_STREAM 0x01 #define GA_CTX_MULTI_THREAD 0x02 struct _gpucontext_props { int dev; int sched; int flags; const char *kernel_cache_path; size_t max_cache_size; size_t initial_cache_size; }; struct _gpucontext { GPUCONTEXT_HEAD; void *ctx_ptr; void *private[11]; }; /* The real gpudata struct is likely bigger but we only care about the first two members for now. */ typedef struct _partial_gpudata { void *devptr; gpucontext *ctx; } partial_gpudata; typedef struct _partial_gpukernel { gpucontext *ctx; } partial_gpukernel; typedef struct _partial_gpucomm { gpucontext* ctx; } partial_gpucomm; struct _gpuarray_buffer_ops { int (*get_platform_count)(unsigned int* platcount); int (*get_device_count)(unsigned int platform, unsigned int* devcount); gpucontext *(*buffer_init)(gpucontext_props *props); void (*buffer_deinit)(gpucontext *ctx); gpudata *(*buffer_alloc)(gpucontext *ctx, size_t sz, void *data, int flags); void (*buffer_retain)(gpudata *b); void (*buffer_release)(gpudata *b); int (*buffer_share)(gpudata *a, gpudata *b); int (*buffer_move)(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz); int (*buffer_read)(void *dst, gpudata *src, size_t srcoff, size_t sz); int (*buffer_write)(gpudata *dst, size_t dstoff, const void *src, size_t sz); int (*buffer_memset)(gpudata *dst, size_t dstoff, int data); int (*kernel_alloc)(gpukernel **k, gpucontext *ctx, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int numargs, const int *typecodes, int flags, char **err_str); void (*kernel_retain)(gpukernel *k); void (*kernel_release)(gpukernel *k); int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a); int (*kernel_call)(gpukernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); int (*buffer_sync)(gpudata *b); int (*buffer_transfer)(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz); int (*property)(gpucontext *ctx, gpudata *buf, gpukernel *k, int prop_id, void *res); const char *(*ctx_error)(gpucontext *ctx); }; struct _gpuarray_blas_ops { int (*setup)(gpucontext *ctx); void (*teardown)(gpucontext *ctx); int (*hdot)( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); int (*sdot)( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); int (*ddot)( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, 
size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY); int (*sgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY); int (*dgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY); int (*hgemm)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc); int (*sgemm)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc); int (*dgemm)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc); int (*hger)(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*sger)(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*dger)(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*hgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount); int (*sgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount); int (*dgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount); int (*hgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); int (*sgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); int (*dgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); int (*hgerBatch)(cb_order order, size_t M, size_t N, float alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*sgerBatch)(cb_order order, size_t 
M, size_t N, float alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*dgerBatch)(cb_order order, size_t M, size_t N, double alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*hgemm3D)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount); int (*sgemm3D)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount); int (*dgemm3D)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, ssize_t strideA, gpudata *B, size_t offB, size_t ldb, ssize_t strideB, double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount); }; struct _gpuarray_comm_ops { int (*comm_new)(gpucomm** comm, gpucontext* ctx, gpucommCliqueId comm_id, int ndev, int rank); void (*comm_free)(gpucomm* comm); int (*generate_clique_id)(gpucontext* ctx, gpucommCliqueId* comm_id); int (*get_count)(const gpucomm* comm, int* count); int (*get_rank)(const gpucomm* comm, int* rank); // collective ops int (*reduce)(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, int opcode, int root, gpucomm* comm); int (*all_reduce)(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm* comm); int (*reduce_scatter)(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm* comm); int (*broadcast)(gpudata* array, size_t offset, size_t count, int typecode, int root, gpucomm* comm); int (*all_gather)(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, gpucomm* comm); }; #define STATIC_ASSERT(COND, MSG) typedef char static_assertion_##MSG[2*(!!(COND))-1] static inline void *memdup(const void *p, size_t s) { void *res = malloc(s); if (res != NULL) memcpy(res, p, s); return res; } int GpuArray_is_c_contiguous(const GpuArray *a); int GpuArray_is_f_contiguous(const GpuArray *a); int GpuArray_is_aligned(const GpuArray *a); extern const gpuarray_type scalar_types[]; extern const gpuarray_type vector_types[]; /* * This function generates the kernel code to perform indexing on var id * from planar index 'i' using the dimensions and strides provided. 
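 *
 * Rough sketch of the intent (the exact emitted text is owned by the
 * implementation; the dims/strides below are made-up example values): for
 * nd == 2, dims == {4, 5} and str == {20, 4}, the generated kernel code
 * is expected to be equivalent to
 *
 *   id += ((i / 5) % 4) * 20;
 *   id += (i % 5) * 4;
 *
 * i.e. each dimension contributes its coordinate, recovered from the
 * planar index, multiplied by that dimension's stride.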
*/ void gpuarray_elem_perdim(strb *sb, unsigned int nd, const size_t *dims, const ssize_t *str, const char *id); void gpukernel_source_with_line_numbers(unsigned int count, const char **news, size_t *newl, strb *src); static inline uint16_t float_to_half(float value) { #define ga__shift 13 #define ga__shiftSign 16 #define ga__infN 0x7F800000 // flt32 infinity #define ga__maxN 0x477FE000 // max flt16 normal as a flt32 #define ga__minN 0x38800000 // min flt16 normal as a flt32 #define ga__signN 0x80000000 // flt32 sign bit #define ga__infC (ga__infN >> ga__shift) #define ga__nanN ((ga__infC + 1) << ga__shift) // minimum flt16 nan as a flt32 #define ga__maxC (ga__maxN >> ga__shift) #define ga__minC (ga__minN >> ga__shift) #define ga__mulN 0x52000000 // (1 << 23) / minN #define ga__subC 0x003FF // max flt32 subnormal down shifted #define ga__maxD (ga__infC - ga__maxC - 1) #define ga__minD (ga__minC - ga__subC - 1) union { float f; int32_t si; uint32_t ui; } v, s; uint32_t sign; v.f = value; sign = v.si & ga__signN; v.si ^= sign; sign >>= ga__shiftSign; // logical shift s.si = ga__mulN; s.si = (int32_t)(s.f * v.f); // correct subnormals v.si ^= (s.si ^ v.si) & -(ga__minN > v.si); v.si ^= (ga__infN ^ v.si) & -((ga__infN > v.si) & (v.si > ga__maxN)); v.si ^= (ga__nanN ^ v.si) & -((ga__nanN > v.si) & (v.si > ga__infN)); v.ui >>= ga__shift; // logical shift v.si ^= ((v.si - ga__maxD) ^ v.si) & -(v.si > ga__maxC); v.si ^= ((v.si - ga__minD) ^ v.si) & -(v.si > ga__subC); return (uint16_t)(v.ui | sign); #undef ga__shift #undef ga__shiftSign #undef ga__infN #undef ga__maxN #undef ga__minN #undef ga__signN #undef ga__infC #undef ga__nanN #undef ga__maxC #undef ga__minC #undef ga__mulN #undef ga__subC #undef ga__maxD #undef ga__minD } #define ISSET(v, fl) ((v) & (fl)) #define ISCLR(v, fl) (!((v) & (fl))) #define FLSET(v, fl) (v |= (fl)) #define FLCLR(v, fl) (v &= ~(fl)) #define GA_CHECK(cmd) \ do { \ int err = (cmd); \ if (err != GA_NO_ERROR) \ return err; \ } while (0) #ifdef __cplusplus } #endif /** \endcond */ #endif libgpuarray-0.7.6/src/private_config.h.in000066400000000000000000000016471326743622600204340ustar00rootroot00000000000000/****************************************************************** * This file is generated from private_config.h.in. Do not edit. * ******************************************************************/ #ifndef PRIVATE_CONFIG_H #define PRIVATE_CONFIG_H #cmakedefine HAVE_STRL #cmakedefine HAVE_MKSTEMP #include #include #include #include "gpuarray/config.h" #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif #ifdef _MSC_VER /* God damn Microsoft ... 
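 *
 * The defines below just map standard names onto their MSVC spellings.
 * As an assumed usage note (not taken from this header), SPREFIX further
 * down is meant to be spliced into printf-style formats so that size_t
 * values print portably, e.g.
 *
 *   printf("%" SPREFIX "u bytes\n", sz);
 *
 * which selects "%Iu" under MSVC and "%zu" elsewhere.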
*/ #define snprintf _snprintf #define strdup _strdup #define alloca _alloca #endif #ifdef _MSC_VER #define SPREFIX "I" #else #define SPREFIX "z" #endif #define nelems(a) (sizeof(a)/sizeof(a[0])) #ifndef HAVE_MKSTEMP int mkstemp(char *path); #endif #ifndef HAVE_STRL size_t strlcpy(char *dst, const char *src, size_t size); size_t strlcat(char *dst, const char *src, size_t size); #endif #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/private_cuda.h000066400000000000000000000103341326743622600174670ustar00rootroot00000000000000#ifndef _PRIVATE_CUDA_H #define _PRIVATE_CUDA_H #include "loaders/libcuda.h" #include #include "private.h" #include "gpuarray/buffer.h" #ifdef DEBUG #include #define CTX_TAG "cudactx " #define BUF_TAG "cudabuf " #define KER_TAG "cudakern" #define COMM_TAG "cudacomm" #define TAG_CTX(c) memcpy((c)->tag, CTX_TAG, 8) #define TAG_BUF(b) memcpy((b)->tag, BUF_TAG, 8) #define TAG_KER(k) memcpy((k)->tag, KER_TAG, 8) #define TAG_COMM(co) memcpy((co)->tag, COMM_TAG, 8) #define ASSERT_CTX(c) assert(memcmp((c)->tag, CTX_TAG, 8) == 0) #define ASSERT_BUF(b) assert(memcmp((b)->tag, BUF_TAG, 8) == 0) #define ASSERT_KER(k) assert(memcmp((k)->tag, KER_TAG, 8) == 0) #define ASSERT_COMM(co) assert(memcmp((co)->tag, COMM_TAG, 8) == 0) #define CLEAR(o) memset((o)->tag, 0, 8); #else #define TAG_CTX(c) #define TAG_BUF(b) #define TAG_KER(k) #define TAG_COMM(k) #define ASSERT_CTX(c) #define ASSERT_BUF(b) #define ASSERT_KER(k) #define ASSERT_COMM(k) #define CLEAR(o) #endif /* Keep in sync with the copy in gpuarray/extension.h */ #define DONTFREE 0x10000000 static inline int error_cuda(error *e, const char *msg, CUresult err) { const char *name, *descr; cuGetErrorName(err, &name); cuGetErrorString(err, &descr); return error_fmt(e, GA_IMPL_ERROR, "%s: %s: %s", msg, name, descr); } #define GA_CUDA_EXIT_ON_ERROR(ctx, cmd) \ do { \ int err = (cmd); \ if (err != GA_NO_ERROR) { \ cuda_exit((ctx)); \ return err; \ } \ } while (0) #define CUDA_EXIT_ON_ERROR(ctx, cmd) \ do { \ CUresult err = (cmd); \ if (err != CUDA_SUCCESS) { \ cuda_exit((ctx)); \ return error_cuda((ctx)->err, #cmd, err); \ } \ } while (0) typedef struct _cuda_context { GPUCONTEXT_HEAD; CUcontext ctx; CUstream s; CUstream mem_s; gpudata *freeblocks; size_t cache_size; size_t max_cache_size; cache *kernel_cache; cache *disk_cache; // This is per-context to avoid lock contention unsigned int enter; unsigned char major; unsigned char minor; } cuda_context; /** @cond NEVER */ STATIC_ASSERT(sizeof(cuda_context) <= sizeof(gpucontext), sizeof_struct_gpucontext_cuda); /** @endcond */ /* * About freeblocks. * * Freeblocks is a linked list of gpudata instances that are * considrered to be "free". That is they are not in use anywhere * else in the program. It is used to cache and reuse allocations so * that we can avoid the heavy cost and synchronization of * cuMemAlloc() and cuMemFree(). * * It is ordered by pointer address. When adding back to it, blocks * will be merged with their neighbours, but not across original * allocation lines (which are kept track of with the CUDA_HEAD_ALLOC * flag. 
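 *
 * Assumed example of the merge rule described above: if [p, p+s) and
 * [p+s, p+s+t) are both free and the second block was carved out of the
 * same original cuMemAlloc() allocation (i.e. it does not carry
 * CUDA_HEAD_ALLOC), the two collapse into one free block [p, p+s+t);
 * if the second block is the head of its own allocation, they stay
 * separate so that allocation can later be handed back to cuMemFree()
 * as a whole.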
*/ #define ARCH_PREFIX "compute_" cuda_context *cuda_make_ctx(CUcontext ctx, gpucontext_props *p); CUstream cuda_get_stream(cuda_context *ctx); void cuda_enter(cuda_context *ctx); void cuda_exit(cuda_context *ctx); struct _gpudata { CUdeviceptr ptr; cuda_context *ctx; /* Don't change anything abovbe this without checking struct _partial_gpudata */ CUevent rev; CUevent wev; CUstream ls; /* last stream used */ unsigned int refcnt; int flags; size_t sz; gpudata *next; #ifdef DEBUG char tag[8]; #endif }; gpudata *cuda_make_buf(cuda_context *c, CUdeviceptr p, size_t sz); size_t cuda_get_sz(gpudata *g); int cuda_wait(gpudata *, int); int cuda_record(gpudata *, int); /* private flags are in the upper 16 bits */ #define CUDA_WAIT_READ 0x10000 #define CUDA_WAIT_WRITE 0x20000 #define CUDA_WAIT_FORCE 0x40000 #define CUDA_WAIT_ALL (CUDA_WAIT_READ|CUDA_WAIT_WRITE) #define CUDA_IPC_MEMORY 0x100000 #define CUDA_HEAD_ALLOC 0x200000 #define CUDA_MAPPED_PTR 0x400000 struct _gpukernel { cuda_context *ctx; /* Keep the context first */ CUmodule m; CUfunction k; void **args; size_t bin_sz; void *bin; int *types; unsigned int argcount; unsigned int refcnt; #ifdef DEBUG char tag[8]; #endif }; int get_cc(CUdevice dev, int *maj, int *min, error *e); #endif libgpuarray-0.7.6/src/private_opencl.h000066400000000000000000000057631326743622600200450ustar00rootroot00000000000000#ifndef _GPUARRAY_PRIVATE_OPENCL #define _GPUARRAY_PRIVATE_OPENCL #include "private.h" #include "loaders/libopencl.h" /** @cond NEVER */ #ifdef DEBUG #include #define CTX_TAG "ocl ctx " #define BUF_TAG "ocl buf " #define KER_TAG "ocl kern" #define TAG_CTX(c) memcpy((c)->tag, CTX_TAG, 8) #define TAG_BUF(b) memcpy((b)->tag, BUF_TAG, 8) #define TAG_KER(k) memcpy((k)->tag, KER_TAG, 8) #define ASSERT_CTX(c) assert(memcmp((c)->tag, CTX_TAG, 8) == 0) #define ASSERT_BUF(b) assert(memcmp((b)->tag, BUF_TAG, 8) == 0) #define ASSERT_KER(k) assert(memcmp((k)->tag, KER_TAG, 8) == 0) #define CLEAR(o) memset((o)->tag, 0, 8); #else #define TAG_CTX(c) #define TAG_BUF(b) #define TAG_KER(k) #define ASSERT_CTX(c) #define ASSERT_BUF(b) #define ASSERT_KER(k) #define CLEAR(o) #endif /** @endcond */ const char *cl_error_string(cl_int); static inline int error_cl(error *e, const char *msg, cl_int err) { return error_fmt(e, GA_IMPL_ERROR, "%s: %s", msg, cl_error_string(err)); } #define CL_CHECK(e, cmd) do { \ cl_int err = (cmd); \ if (err != CL_SUCCESS) \ return error_cl(e, #cmd, err); \ } while(0) #define CL_CHECKN(e, cmd) do { \ cl_int err = (cmd); \ if (err != CL_SUCCESS) { \ error_cl(e, #cmd, err); \ return NULL; \ } \ } while(0) #define CL_GET_PROP(e, fn, obj, prop, val) do { \ size_t sz; \ cl_int err; \ CL_CHECK(e, fn (obj, prop, 0, NULL, &sz)); \ val = malloc(sz); \ if (val == NULL) return error_sys(e, "malloc"); \ err = fn (obj, prop, sz, val, NULL); \ if (err != CL_SUCCESS) { \ free(val); \ val = NULL; \ return error_cl(e, #fn, err); \ } \ } while(0) typedef struct _cl_ctx { GPUCONTEXT_HEAD; cl_context ctx; cl_command_queue q; char *exts; char *options; } cl_ctx; /** @cond NEVER */ STATIC_ASSERT(sizeof(cl_ctx) <= sizeof(gpucontext), sizeof_struct_gpucontext_cl); /** @endcond */ struct _gpudata { cl_mem buf; cl_ctx *ctx; /* Don't change anyhting above this without checking struct _partial_gpudata */ cl_event ev; unsigned int refcnt; #ifdef DEBUG char tag[8]; #endif }; struct _gpukernel { cl_ctx *ctx; /* Keep the context first */ cl_kernel k; cl_event ev; cl_event **evr; int *types; unsigned int argcount; unsigned int refcnt; cl_uint num_ev; #ifdef DEBUG char tag[8]; 
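  /* Debug-only type tag; TAG_KER()/ASSERT_KER() above fill and check it
     to catch the wrong kind of object being passed around. */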
#endif }; cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p); cl_command_queue cl_get_stream(gpucontext *ctx); gpudata *cl_make_buf(gpucontext *c, cl_mem buf); cl_mem cl_get_buf(gpudata *g); #endif libgpuarray-0.7.6/src/util/000077500000000000000000000000001326743622600156245ustar00rootroot00000000000000libgpuarray-0.7.6/src/util/CMakeLists.txt000066400000000000000000000001061326743622600203610ustar00rootroot00000000000000set_rel(UTIL_SRC strb.c error.c xxhash.c integerfactoring.c skein.c ) libgpuarray-0.7.6/src/util/error.c000066400000000000000000000015111326743622600171170ustar00rootroot00000000000000#include #include #include #include "private_config.h" #include "util/error.h" static error _global_err = {{0}, 0}; error *global_err = &_global_err; int error_alloc(error **_e) { error *e; e = calloc(sizeof(error), 1); if (e == NULL) return -1; *_e = e; return 0; } void error_free(error *e) { free(e); } int error_set(error *e, int code, const char *msg) { e->code = code; strlcpy(e->msg, msg, ERROR_MSGBUF_LEN); #ifdef DEBUG fprintf(stderr, "(Debug) ERROR %d: %s\n", e->code, e->msg); #endif return code; } int error_fmt(error *e, int code, const char *fmt, ...) { va_list ap; e->code = code; va_start(ap, fmt); vsnprintf(e->msg, ERROR_MSGBUF_LEN, fmt, ap); va_end(ap); #ifdef DEBUG fprintf(stderr, "(Debug) ERROR %d: %s\n", e->code, e->msg); #endif return code; } libgpuarray-0.7.6/src/util/error.h000066400000000000000000000011161326743622600171250ustar00rootroot00000000000000#ifndef UTIL_ERROR_H #define UTIL_ERROR_H #include #include #include /* 1024 - 4 for the int that goes after */ #define ERROR_MSGBUF_LEN 1020 typedef struct _error { char msg[ERROR_MSGBUF_LEN]; int code; } error; int error_alloc(error **e); void error_free(error *e); int error_set(error *e, int code, const char *msg); int error_fmt(error *e, int code, const char *fmt, ...); extern error *global_err; static inline int error_sys(error *e, const char *msg) { return error_fmt(e, GA_SYS_ERROR, "%s: %s", msg, strerror(errno)); } #endif libgpuarray-0.7.6/src/util/integerfactoring.c000066400000000000000000001147021326743622600213270ustar00rootroot00000000000000/* Includes */ #include #include #include #include #include "integerfactoring.h" /* Detect when to avoid VLAs. */ #if defined(_MSC_VER) || defined(__STDC_NO_VLA__) #define GA_USING_MALLOC_FOR_VLA 1 #endif /* Defines */ #define GA_IS_COMPOSITE 0 #define GA_IS_PRIME 1 #define GA_IS_PROBABLY_PRIME 2 /** * Static Function Prototypes */ /** * @brief Count trailing zeros of a 64-bit integer. * * @param [in] n The integer whose trailing zero count is to be computed. * @return If n != 0, returns trailing zero count; Else returns 64. */ static int gaICtz(uint64_t n); /** * @brief Count leading zeros of a 64-bit integer. * * @param [in] n The integer whose leading zero count is to be computed. * @return If n != 0, returns leading zero count; Else returns 64. */ static int gaIClz(uint64_t n); /** * @brief Integer Modular Addition. * * Computes * * $$a+b \pmod m$$ * * efficiently for 64-bit unsigned integers a, b, m. */ static uint64_t gaIAddMod (uint64_t a, uint64_t b, uint64_t m); /** * @brief Integer Modular Subtraction. * * Computes * * $$a-b \pmod m$$ * * efficiently for 64-bit unsigned integers a, b, m. */ static uint64_t gaISubMod (uint64_t a, uint64_t b, uint64_t m); /** * @brief Integer Modular Average. * * Computes * * $$\frac{a+b}{2} \pmod m$$ * * efficiently for 64-bit unsigned integers a, b, m. 
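 *
 * Worked example (values chosen here, behaviour read from the
 * implementation further down): a = 3, b = 2, m = 11 gives a sum of 5,
 * which is odd, and the function returns 8; indeed 2*8 = 16 = 5 (mod 11),
 * so 8 is the modular "half" of 5 for an odd modulus.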
*/ static uint64_t gaIAvgMod (uint64_t a, uint64_t b, uint64_t m); /** * @brief Integer Modular Multiplication. * * Computes * * $$a*b \pmod m$$ * * efficiently for 64-bit unsigned integers a, b, m. */ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m); /** * @brief Integer Modular Exponentiation. * * Computes * * $$x^a \pmod m$$ * * efficiently for 64-bit unsigned integers x, a, m. */ static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m); /** * @brief Jacobi Symbol * * Computes the Jacobi symbol, notated * * $$(a/n)$$ * * efficiently for 64-bit unsigned integers a, n. */ static int gaIJacobiSymbol(uint64_t a, uint64_t n); /** * @brief Strong Fermat base-a probable prime test. * * @param [in] n An odd integer >= 3. * @param [in] a A witness integer > 0. * @return Non-zero if n is a strong probable prime to base a and zero if n is * composite. */ static int gaIIsPrimeStrongFermat(uint64_t n, uint64_t a); /** * @brief Strong Lucas probable prime test. * * The function uses Selfridge's Method A for selecting D,P,Q. * * @param [in] n An odd integer >= 3. * @return Non-zero if n is a strong probable prime and zero if n is composite. */ static int gaIIsPrimeStrongLucas(uint64_t n); /** * @brief Round up positive n to next 2-, 3- or 5-smooth number and report its * factorization. */ static int gaIFactorize2Smooth(uint64_t n, ga_factor_list* fl); static int gaIFactorize3Smooth(uint64_t n, ga_factor_list* fl); static int gaIFactorize5Smooth(uint64_t n, ga_factor_list* fl); /** * @brief Satisfy individual product limits on "from" by moving factors to * corresponding "to" list. */ static void gaIFLScheduleSatisfyInd(const int n, ga_factor_list* from, ga_factor_list* to, const uint64_t* maxInd); /** * @brief Satisfy global product limit on "from" by moving factors to * corresponding "to" list. */ static void gaIFLScheduleSatisfyTot(const int n, ga_factor_list* from, ga_factor_list* to, const uint64_t maxTot); /** * @brief Optimize "to" by moving factors from "from", under both individual * and global limits. */ static void gaIFLScheduleOpt(const int n, ga_factor_list* from, ga_factor_list* to, const uint64_t maxTot, const uint64_t* maxInd); /** * @brief Schedule block/grid/chunk size, integer version, n checked >= 0. */ static void gaIScheduleChecked(const int n, const uint64_t maxBtot, const uint64_t* maxBind, const uint64_t maxGtot, const uint64_t* maxGind, uint64_t* bs, uint64_t* gs, uint64_t* cs); /** * Function Definitions */ static int gaICtz (uint64_t n){ #if __GNUC__ >= 4 return n ? __builtin_ctzll(n) : 64; #else int z; for(z=0;z<64;z++){ if((n>>z) & 1){break;} } return z; #endif } static int gaIClz (uint64_t n){ #if __GNUC__ >= 4 return n ? 
__builtin_clzll(n) : 64; #else int z; for(z=63;z>=0;z--){ if((n>>z) & 1){break;} } return 63-z; #endif } static uint64_t gaIAddMod (uint64_t a, uint64_t b, uint64_t m){ a %= m; b %= m; if(m-a > b){ return a+b; }else{ return a+b-m; } } static uint64_t gaISubMod (uint64_t a, uint64_t b, uint64_t m){ a %= m; b %= m; if(a >= b){ return a-b; }else{ return a-b+m; } } static uint64_t gaIAvgMod (uint64_t a, uint64_t b, uint64_t m){ uint64_t s = gaIAddMod(a,b,m); if(s&1){ return (s>>1)+(m>>1)+(s&m&1); }else{ return s>>1; } } static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ #if (__GNUC__ >= 4) && defined(__x86_64__) && !defined(__STRICT_ANSI__) uint64_t r; asm( "mul %2\n\t" "div %3\n\t" : "=&d"(r), "+a"(a) /* Outputs */ : "r"(b), "r"(m) /* Inputs */ : "cc" ); return r; #elif ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) && defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ >= 16 /* Hardcore GCC 4.6+ optimization jazz */ return ((unsigned __int128)a * (unsigned __int128)b) % m; #else const uint64_t TWOPOW32 = (uint64_t)1<<32; int i; a %= m; b %= m; if(m <= TWOPOW32){ /** * Fast path: When performing modulo arithmetic on values <= 2^32, * (a*b) % m gives the correct answer. */ return (a*b) % m; }else{ /** * Slow path: Have to simulate 128-bit arithmetic long division. */ uint64_t ah = a>>32; uint64_t al = (uint32_t)a; uint64_t bh = b>>32; uint64_t bl = (uint32_t)b; uint64_t ahbh = ah*bh; uint64_t ahbl = ah*bl; uint64_t albh = al*bh; uint64_t albl = al*bl; uint64_t md = ahbl+albh; uint64_t lo = albl + (md<<32); uint64_t hi = ahbh + (md>>32); /* Propagate carry-outs from `md` and `lo` into `hi` */ if(lo < albl){hi++;} if(md < ahbl){hi+=TWOPOW32;} /** * Begin 128-bit-by-64-bit remainder. * * 1) Cut down `hi` mod `m`. This implements the first few iterations * of a shift-and-subtract loop, leaving only 64 iterations to go. * 2) Iterate 64 times: * 2.1) Shift left [hi:lo] by 1 bit, into [newHi:newLo]. * 2.2) If: * 2.2.1) newHi < hi, then there was an overflow into bit 128. * The value [1:newHi:newLo] is definitely larger than * m, so we subtract. This situation can only occur if * m > 2^63. * 2.2.2) newHi > m, then we must subtract m out of newHi in * order to bring back newHi within the range [0, m). * 3) The modulo is in hi. */ hi %= m; for(i=0;i<64;i++){ uint64_t newLo = (lo<<1); uint64_t newHi = (hi<<1) + (newLo m){newHi -= m;} hi = newHi; lo = newLo; } return hi; } #endif } static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ uint64_t r; /** * Special cases (order matters!): * - A modulo of 0 makes no sense and a modulo of 1 implies a return value * of 0, since the result must be integer. * - An exponent of 0 requires a return value of 1. * - A base of 0 or 1 requires a return value of 0 or 1. * - An exponent of 1 requires a return value of x. * - An exponent of 2 can be handled by the modulo multiplication directly. */ if(m<=1){ return 0; } x %= m; if(a==0){ return 1; }else if(x<=1){ return x; }else if(a==1){ return x; }else if(a==2){ return gaIMulMod(x,x,m); } /** * Otherwise, perform modular exponentiation by squaring. 
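 *
 * Short worked trace of the loop below (example values chosen here, not
 * from the original): x = 3, a = 13 (binary 1101), m = 1000.
 *
 *   bit 1: r = 3,    x = 9
 *   bit 0: r = 3,    x = 81
 *   bit 1: r = 243,  x = 561   (6561 mod 1000)
 *   bit 1: r = 323              (243*561 mod 1000)
 *
 * and indeed 3^13 = 1594323 = 323 (mod 1000).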
*/ r = 1; while(a){ if(a&1){ r = gaIMulMod(r, x, m); } x = gaIMulMod(x, x, m); a >>= 1; } return r; } static int gaIJacobiSymbol(uint64_t a, uint64_t n){ int s=0; uint64_t e, a1, n1; a %= n; if(a == 1 || n == 1){ return 1; } if(a == 0){ return 0; } e = gaICtz(a); a1 = a >> e; if(e%2 == 0){ s = 1; }else if(n%8 == 1 || n%8 == 7){ s = 1; }else if(n%8 == 3 || n%8 == 5){ s = -1; } if(n%4 == 3 && a1%4 == 3){ s = -s; } n1 = n%a1; return s*gaIJacobiSymbol(n1,a1); } static int gaIIsPrimeStrongFermat(uint64_t n, uint64_t a){ /** * The Fermat strong probable prime test the Miller-Rabin test relies upon * uses integer "witnesses" in an attempt at proving the number composite. * Should it fail to prove an integer composite, it reports the number as * "probably prime". However, if the witnesses are chosen carefully, the * Miller-Rabin test can be made deterministic below a chosen threshold. * * One can use the primes 2 to 37 in order to ensure the correctness of the * identifications for integers under 2^64. * * Jim Sinclair has found that the seven witnesses * 2, 325, 9375, 28178, 450775, 9780504, 1795265022 * also deterministically classify all integers <2^64. * * * The Fermat strong probable prime test states that, for integers * n = d*2^s+1, d odd, s integer >= 0 * a integer (chosen witness) * n is a Fermat strong probable prime if * a^(d ) = 1 mod n or * a^(d*2^r) = -1 mod n for any integer r, 0 <= r < s. * * * The justification for this comes from Fermat's Little Theorem: If n is * prime and a is any integer, then the following always holds: * a^n = a mod n * If n is prime and a is coprime to n, then the following always holds: * a^(n-1) = 1 mod n * * * In effect, the logic goes * * A: The number n is prime. (Statement) * B: The number n does not divide a. (Statement) * C: a^( n-1) = 1 mod n (Statement) * D: The commutative ring Z/nZ is a finite field. (Statement) * E: Finite fields are unique factorization domains. (Statement) * F: x^2 = 1 mod n factorizes as (x+1)(x-1) = 0 mod n. (Statement) * G: x^2 mod n only has the trivial square roots 1 and -1 (Statement) * H: The number n is odd and >= 3. (Statement) * I: The number n-1 equals d*2^s, with d,s int > 0, d odd. (Statement) * J: a^( d) = 1 mod n (Statement) * K: a^(d*2^r) = -1 mod n for some 0 <= r < s. (Statement) * L: a^(d*2^(r+1)) = 1 mod n for some 0 <= r < s. (Statement) * M: a^(d*2^r) != +-1 mod n AND (Statement) * a^(d*2^(r+1)) = 1 mod n for some 0 <= r < s. * * A&B --> C (Proposition: Fermat's Little Theorem) * !C --> !(A&B) = !A|!B (Contrapositive: Fermat's Little Theorem) * A <-> D (Proposition) * E (Proposition: By definition) * F (Proposition: x^2-x+x-1 = x^2-1 mod n) * D&E&F --> G (Proposition: (x+1)(x-1) is the only * factorization) * !G --> !D|!E|!F (Contrapositive: See above) * H&I&J --> C (Proposition: Squaring 1 gives 1) * H&I&K --> L (Proposition: Squaring -1 gives 1) * H&I&L --> C (Proposition: 1, squared or not, gives 1) * H&I&K --> C (Hypothetical Syllogism) * H&I&(J|K) --> C (Union) * H&I&!(J|K) --> M|!C (Proposition: Either squaring * a^(d*2^(s-1)) != +-1 mod n * gives a 1, in which case * M holds, or it does not * give 1 and therefore * a^(n-1) != 1 mod n) * and thus !C holds. 
* H&I&!(J|K) --> H&I&M | !A | !B (Absorbtion, Hypothetical Syllogism) * H&I&M --> !G (Proposition: x^2 = 1 mod n but x!=+1, * so x^2 - 1 has roots * other than +-1) * H&I&M --> !D|!E|!F (Modus Tollens) * H&I&M --> !D (Disjunctive Syllogism) * H&I&M --> !A (Biconditional) * H&I&!(J|K) --> !A | !A | !B (Hypothethical Syllogism) * H&I&!(J|K)&B --> !A | !A (Absorbtion) * H&I&!(J|K)&B --> !A | !A (Disjunctive Syllogism) * H&I&!(J|K)&B --> !A (Disjunctive Simplification) * ***** Conclusions: ***** * H&I&M --> !A * H&I&!(J|K)&B --> !A * * Broadly speaking, what the above tells us is: * - We can't prove n prime (A), but we can prove it composite (!A). * - Either H&I&M or H&I&!(J|K)&B prove compositeness. * - If H&I&(J|K) for any r, then we've proven C true. If we prove C true, * we can't use the contrapositive of Fermat's Little Theorem, so no * conclusions about the truth-value of A can be made. The test is * inconclusive. Thus this function returns "probably prime". */ uint64_t d, x; int64_t s, r; a %= n; if(a==0){ return GA_IS_PROBABLY_PRIME; } s = gaICtz(n-1); d = (n-1) >> s; x = gaIPowMod(a,d,n); if(x==1 || x==n-1){ return GA_IS_PROBABLY_PRIME; } for(r=0;r=0;i--){ Ut = gaIMulMod(U,V,n); Vt = gaIAvgMod(gaIMulMod(V,V,n), gaIMulMod(D,gaIMulMod(U,U,n),n), n); if((K>>i)&1){ U = gaIAvgMod(Ut,Vt,n); V = gaIAvgMod(Vt,gaIMulMod(D,Ut,n),n); }else{ U = Ut; V = Vt; } } /** * 7. If U0==0, then return "probably prime". Otherwise, return "composite". */ return U==0 ? GA_IS_PROBABLY_PRIME : GA_IS_COMPOSITE; } int gaIIsPrime (uint64_t n){ int hasNoSmallFactors, hasSmallFactors; /** * Check if it is 2, the oddest prime. */ if(n==2){return GA_IS_PRIME;} /** * Check if it is an even integer. */ if((n&1) == 0){return GA_IS_COMPOSITE;} /** * For small integers, read directly the answer in a table. */ if(n<256){ return "nnyynynynnnynynnnynynnnynnnnnyny" "nnnnnynnnynynnnynnnnnynnnnnynynn" "nnnynnnynynnnnnynnnynnnnnynnnnnn" "nynnnynynnnynynnnynnnnnnnnnnnnny" "nnnynnnnnynynnnnnnnnnynynnnnnynn" "nnnynnnynnnnnynnnnnynynnnnnnnnny" "nynnnynynnnnnnnnnnnynnnnnnnnnnny" "nnnynynnnynnnnnynynnnnnnnnnynnnn"[n] == 'y'; } /** * Test small prime factors. */ hasNoSmallFactors = n% 3 && n% 5 && n% 7 && n%11 && n%13 && n%17 && n%19 && n%23 && n%29 && n%31 && n%37 && n%41 && n%43 && n%47 && n%53 && n%59 && n%61 && n%67 && n%71 && n%73 && n%79; hasSmallFactors = !hasNoSmallFactors; if(hasSmallFactors){ return GA_IS_COMPOSITE; } /** * We implement the Baillie-Pomerance-Selfridge-Wagstaff primality checker. * 1) A Fermat base-2 strong probable prime that is also * 2) A Lucas strong probable prime is * 3) Prime. * The BPSW test has no known failure cases and is proven to have no failures * for all numbers under 2^64. It is expected to have failures (composites * classified as "probably prime") but they are expected to be enormous. * * We begin with the Fermat base-2 strong primality test * (Miller-Rabin test with one witness only, a=2). */ return gaIIsPrimeStrongFermat(n, 2) && /** * Assuming this is one of the base-2 Fermat strong probable primes, we run * the Lucas primality test with Selfridge's Method A for selecting D. */ gaIIsPrimeStrongLucas (n ); } int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl){ int infiniteSlack, finiteSlack, greaterThanMaxN, exactFactoring, noKSmoothness, kSmoothness; uint64_t i, x, newX, p, f, c; /** * Insane argument handling. */ if(!fl || (k == 1) || (maxN > 0 && maxN < n)){ return 0; } /** * Handle special cases of n = 0,1,2. 
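 *
 * For these inputs the factor list is simply {n}; note that n == 1 ends
 * up represented by the empty list (whose product is 1), since
 * gaIFLAddFactors() ignores a factor of 1.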
*/ if(n<=2){ gaIFLInit(fl); gaIFLAddFactors(fl, n, 1); return 1; } /** * Magic-value arguments interpreted and canonicalized. */ exactFactoring = (maxN == (uint64_t) 0); infiniteSlack = (maxN == (uint64_t)-1); noKSmoothness = (k == 0) || (k >= n); finiteSlack = !infiniteSlack; kSmoothness = !noKSmoothness; maxN = exactFactoring ? n : maxN; k = noKSmoothness ? n : k; /** * Try optimal k-smooth optimizers. */ if (k <= 2){gaIFactorize2Smooth(n, fl);} else if(k <= 4){gaIFactorize3Smooth(n, fl);} else {gaIFactorize5Smooth(n, fl);} greaterThanMaxN = finiteSlack && (gaIFLIsOverflowed(fl) || gaIFLGetProduct (fl) > maxN); if(greaterThanMaxN){ if(kSmoothness && k<=6){ /** * We've *proven* there exists no k-smooth n <= maxN, k <= 6. * No use wasting more time here. */ return 0; } /* Otherwise fall-through to factorizer. */ }else{ /** * Either the slack was infinite, or the product did not overflow and * was <= maxN. The k-smoothness criterion is guaranteed by the * factorizer we chose earlier. * * Therefore we have a satisfactory, optimal 2-, 3- or 5-smooth * factorization (although not necessarily an exact one unless it is * the case that maxN == n). We return it. */ return 1; } /** * Master loop. * * We arrive here with finite slack and all optimal 2-, 3- and 5-smooth * factorizers unable to produce a factorization whose product is less * than or equal to maxN. */ for(i=n; i <= maxN; i++){ /** * Do not manipulate the loop index! * Initial subfactor to cut down is x=i. */ x = i; gaIFLInit(fl); /** * Subfactorization always begins with an attempt at an initial * cut-down by factors of 2. Should this result in a 1 (which isn't * technically prime, but indicates a complete factorization), we * report success. */ subfactorize: gaIFLAddFactors(fl, 2, gaICtz(x)); x >>= gaICtz(x); f = 3; /** * Primality test. * * If the remaining factor x is a prime number, it's decision time. One * of two things is true: * * 1) We have a smoothness constraint k and x is <= than it, or we * don't have a smoothness constraint at all (k==n). Both cases are * covered by checking x<=k. * * In this case we add x as the last factor to the factor list and * return affirmatively. * * 2) We have a smoothness constraint and x>k. * * In this case we have to inc/decrement x and begin anew the * sub-factorization. This may cause us to fail out of factorizing * the current i, by exceeding our slack limit. If this happens we * abort the factorization rooted at i and move to the next i. */ primetest: if(x==1 || gaIIsPrime(x)){ if(x <= k){ gaIFLAddFactors(fl, x, 1); return 1; }else{ p = gaIFLGetProduct(fl); newX = n/p; newX += newX*p < n; if(newX < x){ x = newX; goto subfactorize; }else if((maxN - p*x) < p){/* Overflow-free check maxN >= p*(x+1) */ goto nextI; }else{ x++; goto subfactorize; } } } /** * Composite number handler. * * We continue by trying to cut down x by factors of 3+. Should a trial * division by a factor f succeed, all powers of f are factored out of * x and once f no longer divides x evenly, a new primality test is * run. The primality test will be invoked at most 15 times from this loop. */ for(;f<=k && f*f<=x && f<=0xFFFFFFFFU;f+=2){/* Overflow-safe f*f */ if(x%f == 0){ c = 0; do{ x /= f; c++; }while(x%f == 0); gaIFLAddFactors(fl, f, c); goto primetest; } } /* Check before next iteration for 64-bit integer overflow. */ nextI: if(i == 0xFFFFFFFFFFFFFFFF){break;} } /* Failed to factorize. 
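 *
 * This point is only reached with a finite slack limit: every candidate
 * i in [n, maxN] was rejected (or the candidate range hit the 64-bit
 * ceiling) without yielding a k-smooth factorization whose product fits
 * within maxN.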
*/ return 0; } static int gaIFactorize2Smooth(uint64_t n, ga_factor_list* fl){ n--; n |= n >> 1; n |= n >> 2; n |= n >> 4; n |= n >> 8; n |= n >> 16; n |= n >> 32; n++; gaIFLInit(fl); gaIFLAddFactors(fl, 2, gaICtz(n)); return 1; } static int gaIFactorize3Smooth(uint64_t n, ga_factor_list* fl){ uint64_t nBest=-1, i3Best=0, i3, p3, nCurr; int nlz = gaIClz(n), isBest2to64 = 1; /** * Iterate over all powers of 3, scaling them by the least power-of-2 such * that the result is greater than or equal to n. Report the smallest nBest * so obtained. */ for(i3=0, p3=1;i3<=40;i3++, p3*=3){ nCurr = p3; /** * If the current power of 3 is >= n, then this must be the last * iteration, but perhaps a pure power of 3 is the best choice, so * check for this. */ if(nCurr >= n){ if(isBest2to64 || nBest >= nCurr){ isBest2to64 = 0; nBest = nCurr; i3Best = i3; } break; } /** * Otherwise we have a pure power of 3, p3, less than n, and must * derive the least power of 2 such that p3 multiplied by that power of * 2 is greater than or equal to n. We then compute the product of * both. */ nCurr <<= gaIClz(nCurr) - nlz; if(nCurr= n. But is it the best factorization * so far? */ if(isBest2to64 || nBest >= nCurr){ isBest2to64 = 0; nBest = nCurr; i3Best = i3; if(nCurr == n){ break; } } } /** * Return the smallest n found above. * * nBest and i3Best must be set. */ gaIFLInit(fl); if(isBest2to64){ gaIFLAddFactors(fl, 2, 64); }else{ gaIFLAddFactors(fl, 2, gaICtz(nBest)); gaIFLAddFactors(fl, 3, i3Best); } return 1; } static int gaIFactorize5Smooth(uint64_t n, ga_factor_list* fl){ uint64_t nBest=-1, i3Best=0, i3, p3, i5Best=0, i5, p5, nCurr; int nlz = gaIClz(n), isBest2to64 = 1; /** * Iterate over all products of powers of 5 and 3, scaling them by the * least power-of-2 such that the result is greater than or equal to n. * Report the smallest nBest so obtained. */ for(i5=0, p5=1;i5<=27;i5++, p5*=5){ nCurr = p5; /** * If the current power of 5 is >= n, then this must be the last * iteration, but perhaps a pure power of 5 is the best choice, so * check for this. */ if(nCurr >= n){ if(isBest2to64 || nBest >= nCurr){ isBest2to64 = 0; nBest = nCurr; i3Best = 0; i5Best = i5; } break; } for(i3=0, p3=1;i3<=40;i3++, p3*=3){ /** * Detect when the product p3*p5 would overflow 2^64. */ if(i3){ nCurr = (p3/3)*p5; if(nCurr+nCurr < nCurr || nCurr+nCurr+nCurr < nCurr+nCurr){ break; } } nCurr = p3*p5; /** * If the current product of powers of 3 and 5 is >= n, then this * must be the last iteration, but perhaps a pure product of powers * of 3 and 5 is the best choice, so check for this. */ if(nCurr >= n){ if(isBest2to64 || nBest >= nCurr){ isBest2to64 = 0; nBest = nCurr; i3Best = i3; i5Best = i5; } break; } /** * Otherwise we have a number nCurr, composed purely of factors 3 * and 5, that is less than n. We must derive the least power of 2 * such that nCurr multiplied by that power of 2 is greater than or * equal to n. We then compute the product of both. */ nCurr <<= gaIClz(nCurr) - nlz; if(nCurr= n. But is it the best factorization * so far? */ if(isBest2to64 || nBest >= nCurr){ isBest2to64 = 0; nBest = nCurr; i3Best = i3; i5Best = i5; if(nCurr == n){ goto exit; } } } } /** * Return the smallest n found above. * * nBest and i3Best must be set. 
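 *
 * Assumed example of the overall effect: for n = 97 the smallest 5-smooth
 * number found is nBest = 100 = 2^2 * 5^2, so the factor list filled in
 * below describes 2^2 * 5^2.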
*/ exit: gaIFLInit(fl); if(isBest2to64){ gaIFLAddFactors(fl, 2, 64); }else{ gaIFLAddFactors(fl, 2, gaICtz(nBest)); gaIFLAddFactors(fl, 3, i3Best); gaIFLAddFactors(fl, 5, i5Best); } return 1; } void gaIFLInit(ga_factor_list* fl){ memset(fl, 0, sizeof(*fl)); } int gaIFLFull(const ga_factor_list* fl){ return fl->d >= 15;/* Strictly speaking, fl->d never exceeds 15. */ } int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p){ int i; /** * Fast case: We're adding 0 powers of f, or any powers of 1. The * value of the factor list (and the integer it represents) is thus * unchanged. */ if(p == 0 || f == 1){ return 1; } /** * Otherwise, the factor list has to change. We scan linearly the factor * list for either a pre-existing spot or an insertion spot. Scanning * linearly over a 15-element array is faster and less complex than binary * search. */ for(i=0;id;i++){ if(fl->f[i] == f){ /** * Factor is already in list. */ fl->p[i] += p; if(fl->p[i] == 0){ /** * We removed all factors f. Bump leftwards the remainder to * maintain sorted order. */ memmove(&fl->f[i], &fl->f[i+1], sizeof(fl->f[i])*(fl->d-i)); memmove(&fl->p[i], &fl->p[i+1], sizeof(fl->p[i])*(fl->d-i)); fl->d--; } return 1; }else if(fl->f[i] > f){ /* Inject the factor at this place in order to keep list sorted, if we have the capacity. */ if(gaIFLFull(fl)){ /* We can't bump the list rightwards, it's full already! */ return 0; } memmove(&fl->f[i+1], &fl->f[i], sizeof(fl->f[i])*(fl->d-i)); memmove(&fl->p[i+1], &fl->p[i], sizeof(fl->p[i])*(fl->d-i)); fl->f[i] = f; fl->p[i] = p; fl->d++; return 1; } } /** * We looked at every factor in the list and f is strictly greater than * all of them. * * If the list is full, we cannot insert f, but if it isn't, we can simply * tack it at the end. */ if(gaIFLFull(fl)){ return 0; }else{ fl->f[fl->d] = f; fl->p[fl->d] = p; fl->d++; return 1; } } int gaIFLGetFactorPower(const ga_factor_list* fl, uint64_t f){ int i; for(i=0;id;i++){ if(fl->f[i] == f){ return fl->p[i]; } } return 0; } uint64_t gaIFLGetProduct(const ga_factor_list* fl){ uint64_t p = 1; int i, j; for(i=0;id;i++){ for(j=0;jp[i];j++){ p *= fl->f[i]; } } return p; } int gaIFLIsOverflowed(const ga_factor_list* fl){ uint64_t p = 1, MAX=-1; int i, j; if(gaIFLGetFactorPower(fl, 0) >= 1){ return 0; } if(gaIFLGetFactorPower(fl, 2) >= 64){ return 1; } for(i=0;id;i++){ for(j=0;jp[i];j++){ if(MAX/p < fl->f[i]){ return 1; } p *= fl->f[i]; } } return 0; } uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl){ return fl->d ? fl->f[fl->d-1] : 1; } uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl){ return fl->d ? fl->f[0] : 1; } static uint64_t gaIFLGetProductv(int n, const ga_factor_list* fl){ uint64_t p = 1; int i; for(i=0;i 0){ hasFactors = 1; currF = gaIFLGetGreatestFactor(fl+i); if(f <= currF){ f = currF; if(idx){*idx = i;} } } } return hasFactors ? f : 1; } static uint64_t gaIFLGetSmallestFactorv(int n, const ga_factor_list* fl, int* idx){ uint64_t f = -1, currF; int i, hasFactors=0; if(idx){*idx = 0;} for(i=0;i 0){ hasFactors = 1; currF = gaIFLGetSmallestFactor(fl+i); if(f >= currF){ f = currF; if(idx){*idx = i;} } } } return hasFactors ? f : 1; } int gaIFLsprintf(char* str, const ga_factor_list* fl){ int i, j; int total = 0; char* ptr = str; /* Loop over all factors and spit them out. */ for(i=0;id;i++){ for(j=0;jp[i];j++){ total += sprintf(ptr, "%llu*", (unsigned long long)fl->f[i]); if(ptr){ ptr += strlen(ptr); } } } /* If no factors were printed, print 1. 
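 *
 * (Formatting example, read from the loop above: a factor list for 360
 * ultimately yields the string "2*2*2*3*3*5"; the loop writes
 * "2*2*2*3*3*5*" and the trailing '*' is stripped just below.  Only the
 * empty list representing 1 takes this branch.)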
*/ if(total == 0){ total += sprintf(ptr, "1*"); if(ptr){ ptr += strlen(ptr); } } /* Terminate buffer ('*' -> '\0') and deduct one character. */ total--; if(str){ str[total] = '\0'; } return total; } void gaIFLappend(strb *sb, const ga_factor_list* fl){ int i, j; int noFactorsPrinted = 1; /* Loop over all factors and spit them out. */ for(i=0;id;i++){ for(j=0;jp[i];j++){ noFactorsPrinted = 0; strb_appendf(sb, "%llu*", (unsigned long long)fl->f[i]); } } /** * If no factors were printed, print 1. * Otherwise, delete final '*'. */ if(noFactorsPrinted){ strb_appendf(sb, "1"); }else{ sb->s[--sb->l] = '\0'; } } static void gaIScheduleChecked(const int n, const uint64_t maxBtot, const uint64_t* maxBind, const uint64_t maxGtot, const uint64_t* maxGind, uint64_t* bs, uint64_t* gs, uint64_t* cs){ int i; uint64_t kBS, kGS, k; /** * Allocate a VLA or similar. * * C89 neither allows VLAs nor a check beforehand that n>0 to avoid UB. The * check for n>0 was thus done in our caller. */ #if GA_USING_MALLOC_FOR_VLA ga_factor_list* factBS = malloc(n * sizeof(*factBS)); ga_factor_list* factGS = malloc(n * sizeof(*factGS)); ga_factor_list* factCS = malloc(n * sizeof(*factCS)); #else ga_factor_list factBS[n]; ga_factor_list factGS[n]; ga_factor_list factCS[n]; #endif /** * Factorize the provided integers under their k-smoothness constraint. * Use the strictest of either the block or grid constraints on each * dimension. */ for(i=0;i maxInd[i]){ if(p%f){ f = gaIFLGetGreatestFactor(from+i); } p /= f; gaIFLAddFactors(from+i, f, -1); gaIFLAddFactors(to +i, f, +1); } } } static void gaIFLScheduleSatisfyTot(const int n, ga_factor_list* from, ga_factor_list* to, const uint64_t maxTot){ int a, i, c; uint64_t f, p; p = gaIFLGetProductv(n, from); a = 0; while(p > maxTot){ f = gaIFLGetSmallestFactorv(n, from, &a); c = gaIFLGetFactorPower (from+a, f); for(i=c-1;i>=0 && p>maxTot;i--){ p /= f; gaIFLAddFactors(from+a, f, -1); gaIFLAddFactors(to +a, f, +1); } } } static void gaIFLScheduleOpt(const int n, ga_factor_list* from, ga_factor_list* to, const uint64_t maxTot, const uint64_t* maxInd){ int i, j, k; uint64_t maxFTot, maxFInd, currF, f, pTot = 1; #if GA_USING_MALLOC_FOR_VLA uint64_t* pInd = malloc(n * sizeof(*pInd)); #else uint64_t pInd[n]; #endif /* Muzzle compiler about a random function being unused. */ (void)gaIFLGetGreatestFactorv; /** * Check whether optimization is possible. */ for(i=0;i=0;j--){ currF = from[i].f[j]; if(currF <= maxFTot && currF <= maxFInd && currF >= f){ f = currF; k = i; break; } } } if(k == -1){ break; } gaIFLAddFactors(from+k, f, -1); gaIFLAddFactors(to +k, f, +1); pInd[k] *= f; pTot *= f; maxFTot = maxTot/pTot; }while(maxFTot>1 && f>1); #if GA_USING_MALLOC_FOR_VLA free(pInd); #endif } libgpuarray-0.7.6/src/util/integerfactoring.h000066400000000000000000000231101326743622600213240ustar00rootroot00000000000000/* Include Guards */ #ifndef INTEGERFACTORING_H #define INTEGERFACTORING_H /* Includes */ #include #include "gpuarray/config.h" #include "util/strb.h" /* Defines */ /* C++ Extern "C" Guard */ #ifdef __cplusplus extern "C" { #endif /* Data Structure Prototypes & Typedefs */ struct ga_factor_list_; typedef struct ga_factor_list_ ga_factor_list; /* Data Structures */ /** * @brief The GA_FACTOR_LIST struct. * * Contains the list of distinct prime factors of a 64-bit unsigned integer, as * well as the powers of those factors. 
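 *
 * For example, 360 = 2^3 * 3^2 * 5 is stored as f = {2, 3, 5},
 * p = {3, 2, 1}, d = 3, with the remaining entries left at zero.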
* * There can be at most 15 such distinct factors, since the product of the * first 16 primes (2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53) exceeds * the maximum unsigned number of 2^64-1. Moreover, there can be at most 63 * factors all together, since 2^64 exceeds 2^64-1, so only an 8-bit number is * required to store the powers. * * The 15th (last) element of the factor list is always 0 and has power 0, * and serves as a sort of sentinel. */ struct ga_factor_list_{ uint64_t f[16];/* Factors */ uint8_t p[16];/* Powers of factors */ int d; /* Number of distinct factors. */ }; /* Functions */ /** * @brief Checks whether an integer is prime. * * @param [in] n The integer whose primality is to be checked. * @return 1 if prime; 0 if not prime. * * NB: This is *not* a probabilistic primality checker. For all integers it can * be given as input, it will correctly report "prime" or "composite". * NB: Internally, this function uses the Miller-Rabin test, which *is* * probabilistic, and may falsely report a number as prime when in fact it * is composite. However, this function uses a deterministic set of * Miller-Rabin "witnesses", which ensures that there are no strong * probable primes equal to or below 2^64-1 (the size of the input * argument). This set of witnesses is * * $$a = 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, and 37$$ * * See https://oeis.org/A014233 */ int gaIIsPrime(uint64_t n); /** * @brief Factorize a positive integer into a list of factors satisfying * certain properties. * * The function factorizes a 64-bit, positive integer into a list of factors. * This factorization can be made "approximate"; That is, the product of the * factors returned can be slightly greater than the input number. The * maximum increase is controlled by a "slack" parameter maxN, as follows: * * $$\texttt{n} \le \prod(\mathrm{fact}(\texttt{n}) \le \texttt{maxN}$$ * * The advantage of offering some slack to the factorizer is that in return, * the factorizer may succeed in outputting a factorization with smaller * factors. The maxN slack parameter must be 0 or be greater than or equal to * n, but it is completely useless to set it beyond 2n. * * When maxN is equal to -1 (2^64 - 1), or is greater than or equal to 2n, no * upper limit is placed on the output factor list's product, but this * implementation guarantees its product will not exceed 2n. This is because * there always exists a power of two that lies between n and 2n, and since * this factorization involves only powers of the smallest prime (2), it is a * valid factorization under any valid k-smoothness constraint, and so may be * returned. * * When maxN is equal to 0 (no increase in value allowed), an exact factoring * is requested. * * The factorization can also be constrained by a (k)-smoothness constraint. * A k-smooth number n has no prime factors greater than k. If the factorizer * is asked to factor with k-smoothness a number with prime factors greater * than k, it will search, within the slack space, for a slightly larger * number that is k-smooth and return that number's factoring. With maxN == n * and a k-smoothness constraint, this function reports whether or not the * number is k-smooth. * * When k is equal to 0, equal to -1 (2^64 - 1), or is greater than or equal * to n, no k-smoothness constraints are imposed. An exact factoring is * required. * * @param [in] n The integer to be factorized. Must be >0. * @param [in] maxN The "slack" parameter. The factor list returned will * not have a product that exceeds this number. 
* @param [in] k The k-smoothness constraint. k is the largest * acceptable factor in the output factor list. The * factorizer will, effectively, treat any number all of * whose prime factors exceed k as a prime. * @param [out] fl The output factor list. Does *NOT* need to be * initialized. * @return Non-zero if a factorization is found that satisfies both slack and * smoothness constraints; Zero if no such factorization is found. * If this function returns zero, the last factor in the factor * list is not guaranteed to be prime. */ int gaIFactorize(uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl); /** * @brief Initialize a factors list to all-factors- and all-powers-zero. * * Such a factors list represents 1, since 0^0 = 1. */ void gaIFLInit(ga_factor_list* fl); /** * @brief Reports whether another *distinct* factor can be added to the factor * list safely. * * @return Returns zero if there are less than 15 distinct factors in the list * and non-zero otherwise. */ int gaIFLFull(const ga_factor_list* fl); /** * @brief Add a factor f with power p to the factor list. * * If factor f was already present in the factor list, increments * the corresponding power by p. Otherwise, adds the new factor f to * the list, if there is still space, and sets the power to p. * * Maintains factor list in sorted order. * * @return Non-zero if factor successfully added; Zero otherwise. */ int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p); /** * @brief Get the power of a given factor within a factor list. * * @return The number of times a factor occurs within the current * factorization. If it does not occur, return 0. */ int gaIFLGetFactorPower(const ga_factor_list* fl, uint64_t f); /** * @brief Compute the product of the factors stored in the factors list. * * NB: This function may return an overflowed result. To detect if it will, * please call gaIFLIsOverflowed(fl). */ uint64_t gaIFLGetProduct(const ga_factor_list* fl); /** * @brief Check whether the factor list produces a number >= 2^64. */ int gaIFLIsOverflowed(const ga_factor_list* fl); /** * @brief Get the greatest factor in the factors list. */ uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl); /** * @brief Get the smallest factor in the factors list. */ uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl); /** * @brief Print out the factor list in a human-readable form, sprintf()-style. * * @param [out] str A string into which to print out the factor list. If the * factor list is a result of gaIFactorize(), then the * maximum length of buffer required is 128 bytes. * If str is NULL, nothing is printed. * @param [in] fl The factor list to be printed. * @return The number of characters that would have been printed * out, assuming an unbounded, non-NULL buffer. */ int gaIFLsprintf(char* str, const ga_factor_list* fl); /** * @brief Print out the factor list in a human-readable form. * * @param [out] sb A string into which to print out the factor list. If the * factor list is a result of gaIFactorize(), then the * maximum length of buffer required is 128 bytes. * @param [in] fl The factor list to be printed. */ void gaIFLappend(strb *sb, const ga_factor_list* fl); /** * @brief Schedule block size, grid size and what's left over that fits in * neither, which will be called "chunk" size, subject to constraints. * * @param [in] n Number of dimensions of the problem. The arrays * maxBind, maxGind, factBS, factGS, factCS must have * n elements. 
* @param [in] maxBtot The product of the block sizes in all n dimensions * will not exceed this value. * @param [in] maxBind The block size in dimensions i=0..n-1 will not * exceed maxBind[i]. * @param [in] maxGtot The product of the grid sizes in all n dimensions * will not exceed this value. * @param [in] maxGind The grid size in dimensions i=0..n-1 will not * exceed maxGind[i]. * @param [in,out] factBS The block size for dimensions 0..n-1, as a factor list. * @param [in,out] factGS The grid size for dimensions 0..n-1, as a factor list. * @param [in,out] factCS The chunk size for dimensions 0..n-1, as a factor list. */ void gaIFLSchedule(const int n, const uint64_t maxBtot, const uint64_t* maxBind, const uint64_t maxGtot, const uint64_t* maxGind, ga_factor_list* factBS, ga_factor_list* factGS, ga_factor_list* factCS); void gaISchedule (const int n, const uint64_t maxBtot, const uint64_t* maxBind, const uint64_t maxGtot, const uint64_t* maxGind, uint64_t* bs, uint64_t* gs, uint64_t* cs); /* End C++ Extern "C" Guard */ #ifdef __cplusplus } #endif /* End Include Guards */ #endif libgpuarray-0.7.6/src/util/skein.c000066400000000000000000000254351326743622600171120ustar00rootroot00000000000000/*********************************************************************** ** ** Implementation of the Skein hash function. ** ** Source code author: Doug Whiting, 2008. ** ** This algorithm and source code is released to the public domain. ** ************************************************************************/ #include /* get the memcpy/memset functions */ #include "skein.h" /* get the Skein API definitions */ #define MK_64 SKEIN_MK_64 /* blkSize = 512 bits. hashSize = 512 bits */ static const u64b_t SKEIN_512_IV_512[] = { MK_64(0x4903ADFF,0x749C51CE), MK_64(0x0D95DE39,0x9746DF03), MK_64(0x8FD19341,0x27C79BCE), MK_64(0x9A255629,0xFF352CB1), MK_64(0x5DB62599,0xDF6CA7B0), MK_64(0xEABE394C,0xA9D5C3F4), MK_64(0x991112C7,0x1A75B523), MK_64(0xAE18A40B,0x660FCC33) }; static void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) { size_t n; for (n = 0; n < bCnt; n++) dst[n] = (u08b_t)(src[n>>3] >> (8*(n&7))); } static void Skein_Get64_LSB_First(u64b_t *dst, const u08b_t *src, size_t wCnt) { size_t n; for (n=0; n<8*wCnt; n+=8) dst[n/8] = (((u64b_t) src[n ])) + (((u64b_t) src[n+1]) << 8) + (((u64b_t) src[n+2]) << 16) + (((u64b_t) src[n+3]) << 24) + (((u64b_t) src[n+4]) << 32) + (((u64b_t) src[n+5]) << 40) + (((u64b_t) src[n+6]) << 48) + (((u64b_t) src[n+7]) << 56) ; } static u64b_t Skein_Swap64(u64b_t in) { u64b_t o; u08b_t *out = (u08b_t *)&o; out[7] = in >> 56; out[6] = in >> 48; out[5] = in >> 40; out[4] = in >> 32; out[3] = in >> 24; out[2] = in >> 16; out[1] = in >> 8; out[0] = in; return o; } /*****************************************************************/ /* Function to process blkCnt (nonzero) full block(s) of data. */ #define BLK_BITS (WCNT*64) /* some useful definitions for \ code here */ #define KW_TWK_BASE (0) #define KW_KEY_BASE (3) #define ks (kw + KW_KEY_BASE) #define ts (kw + KW_TWK_BASE) #define RotL_64(x,N) (((x) << (N)) | ((x) >> (64-(N)))) static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const u08b_t *blkPtr, size_t blkCnt, size_t byteCntAdd) { enum { WCNT = SKEIN_512_STATE_WORDS }; #define RCNT (SKEIN_512_ROUNDS_TOTAL/8) u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ u64b_t X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */ u64b_t w [WCNT]; /* local copy of input block */ Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; do { /* this implementation only supports 2**64 input bytes (no carry out here) */ ts[0] += byteCntAdd; /* update processed length */ /* precompute the key schedule for this block */ ks[0] = ctx->X[0]; ks[1] = ctx->X[1]; ks[2] = ctx->X[2]; ks[3] = ctx->X[3]; ks[4] = ctx->X[4]; ks[5] = ctx->X[5]; ks[6] = ctx->X[6]; ks[7] = ctx->X[7]; ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; ts[2] = ts[0] ^ ts[1]; Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ X0 = w[0] + ks[0]; /* do the first full key injection */ X1 = w[1] + ks[1]; X2 = w[2] + ks[2]; X3 = w[3] + ks[3]; X4 = w[4] + ks[4]; X5 = w[5] + ks[5] + ts[0]; X6 = w[6] + ks[6] + ts[1]; X7 = w[7] + ks[7]; blkPtr += SKEIN_512_BLOCK_BYTES; /* run the rounds */ #define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ #define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* unrolled */ \ Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) #define I512(R) \ X0 += ks[((R)+1) % 9]; /* inject the key schedule value */ \ X1 += ks[((R)+2) % 9]; \ X2 += ks[((R)+3) % 9]; \ X3 += ks[((R)+4) % 9]; \ X4 += ks[((R)+5) % 9]; \ X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ X7 += ks[((R)+8) % 9] + (R)+1; { #define R512_8_rounds(R) /* do 8 full rounds */ \ R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ I512(2*(R)); \ R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ I512(2*(R)+1); /* and key injection */ R512_8_rounds( 0); #define R512_Unroll_R(NN) (SKEIN_512_ROUNDS_TOTAL/8 > (NN)) #if R512_Unroll_R( 1) R512_8_rounds( 1); #endif #if R512_Unroll_R( 2) R512_8_rounds( 2); #endif #if R512_Unroll_R( 3) R512_8_rounds( 3); #endif #if R512_Unroll_R( 4) R512_8_rounds( 4); #endif #if R512_Unroll_R( 5) R512_8_rounds( 5); #endif #if R512_Unroll_R( 6) R512_8_rounds( 6); #endif #if R512_Unroll_R( 7) R512_8_rounds( 7); #endif #if R512_Unroll_R( 8) R512_8_rounds( 8); #endif #if R512_Unroll_R( 9) R512_8_rounds( 9); #endif #if R512_Unroll_R(10) R512_8_rounds(10); #endif #if R512_Unroll_R(11) R512_8_rounds(11); #endif #if R512_Unroll_R(12) R512_8_rounds(12); #endif #if R512_Unroll_R(13) R512_8_rounds(13); #endif #if R512_Unroll_R(14) R512_8_rounds(14); #endif } /* do the final "feedforward" xor, update context chaining vars */ ctx->X[0] = X0 ^ w[0]; ctx->X[1] = X1 ^ w[1]; ctx->X[2] = X2 ^ w[2]; ctx->X[3] = X3 ^ w[3]; ctx->X[4] = X4 ^ w[4]; ctx->X[5] = X5 ^ w[5]; ctx->X[6] = X6 ^ w[6]; ctx->X[7] = X7 ^ w[7]; ts[1] &= ~SKEIN_T1_FLAG_FIRST; } while (--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } /*****************************************************************/ /* 512-bit Skein */ /*****************************************************************/ /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* init the context for a straight hashing operation */ int Skein_512_Init(Skein_512_Ctxt_t *ctx) { ctx->h.hashBitLen = 512; /* output hash bit count */ memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X)); /* Set up to process the data 
message portion of the hash (default) */ Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ return SKEIN_SUCCESS; } /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* process the input bytes */ int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) { size_t n; Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ /* process full blocks, if any */ if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) { if (ctx->h.bCnt) { /* finish up any buffered message data */ n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ if (n) { Skein_assert(n < msgByteCnt); /* check on our logic here */ memcpy(&ctx->bb.b[ctx->h.bCnt],msg,n); msgByteCnt -= n; msg += n; ctx->h.bCnt += n; } Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); Skein_512_Process_Block(ctx,ctx->bb.b,1,SKEIN_512_BLOCK_BYTES); ctx->h.bCnt = 0; } /* now process any remaining full blocks, directly from input message data */ if (msgByteCnt > SKEIN_512_BLOCK_BYTES) { n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */ Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES); msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; msg += n * SKEIN_512_BLOCK_BYTES; } Skein_assert(ctx->h.bCnt == 0); } /* copy any remaining source message data bytes into b[] */ if (msgByteCnt) { Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); memcpy(&ctx->bb.b[ctx->h.bCnt],msg,msgByteCnt); ctx->h.bCnt += msgByteCnt; } return SKEIN_SUCCESS; } /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* finalize the hash computation and output the result */ int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) { size_t i,n,byteCnt; u64b_t X[SKEIN_512_STATE_WORDS]; Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ memset(&ctx->bb.b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); Skein_512_Process_Block(ctx,ctx->bb.b,1,ctx->h.bCnt); /* process the final block */ /* now output the result */ byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ /* run Threefish in "counter mode" to generate output */ memset(ctx->bb.b,0,sizeof(ctx->bb.b)); /* zero out b[], so it can hold the counter */ memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) { ctx->bb.l[0] = Skein_Swap64((u64b_t) i); /* build the counter block */ Skein_Start_New_Type(ctx,OUT_FINAL); Skein_512_Process_Block(ctx,ctx->bb.b,1,sizeof(u64b_t)); /* run "counter mode" */ n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ if (n >= SKEIN_512_BLOCK_BYTES) n = SKEIN_512_BLOCK_BYTES; Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ } return SKEIN_SUCCESS; } int Skein_512(const u08b_t *msg, size_t msgByteCnt, u08b_t *hashVal) { Skein_512_Ctxt_t ctx; if (Skein_512_Init(&ctx)) return SKEIN_FAIL; if (Skein_512_Update(&ctx, msg, msgByteCnt)) return SKEIN_FAIL; if (Skein_512_Final(&ctx, hashVal)) return SKEIN_FAIL; return SKEIN_SUCCESS; } libgpuarray-0.7.6/src/util/skein.h000066400000000000000000000143431326743622600171130ustar00rootroot00000000000000#ifndef _SKEIN_H_ #define _SKEIN_H_ 1 
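/*
 * A minimal one-shot usage sketch (the buffer and error-handling names are
 * illustrative only; the API itself is declared further below in this header):
 *
 *   u08b_t digest[SKEIN_512_STATE_BYTES];
 *   if (Skein_512(msg, msgByteCnt, digest) != SKEIN_SUCCESS)
 *     handle_error();
 */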
/************************************************************************** ** ** Interface declarations and internal definitions for Skein hashing. ** ** Source code author: Doug Whiting, 2008. ** ** This algorithm and source code is released to the public domain. ** *************************************************************************** ** ** The following compile-time switches may be defined to control some ** tradeoffs between speed, code size, error checking, and security. ** ** The "default" note explains what happens when the switch is not defined. ** ** SKEIN_ERR_CHECK -- how error checking is handled inside Skein ** code. If not defined, most error checking ** is disabled (for performance). Otherwise, ** the switch value is interpreted as: ** 0: use assert() to flag errors ** 1: return SKEIN_FAIL to flag errors ** ***************************************************************************/ #ifdef __cplusplus extern "C" { #endif #include /* get size_t definition */ #include typedef unsigned int uint_t; typedef uint8_t u08b_t; typedef uint64_t u64b_t; enum { SKEIN_SUCCESS = 0, /* return codes from Skein calls */ SKEIN_FAIL = 1 }; #define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ #define SKEIN_512_STATE_WORDS ( 8) #define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) #define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) #define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) typedef struct { size_t hashBitLen; /* size of hash result, in bits */ size_t bCnt; /* current byte count in buffer b[] */ u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */ } Skein_Ctxt_Hdr_t; typedef struct { /* 512-bit Skein hash context structure */ Skein_Ctxt_Hdr_t h; /* common header context variables */ u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ union Skein_512_Ctxt_b_u { u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ u64b_t l[SKEIN_512_BLOCK_BYTES/8]; } bb; } Skein_512_Ctxt_t; /* Skein APIs for (incremental) "straight hashing" */ int Skein_512_Init (Skein_512_Ctxt_t *ctx); int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); int Skein_512(const u08b_t *msg, size_t msgByteCnt, u08b_t *hashVal); /***************************************************************** ** "Internal" Skein definitions ** -- not needed for sequential hashing API, but will be ** helpful for other uses of Skein (e.g., tree hash mode). ** -- included here so that they can be shared between ** reference and optimized code. 
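**
** -- as a concrete illustration, the hashing code in skein.c switches the
**    tweak type with Skein_Start_New_Type(ctx,MSG) before absorbing message
**    blocks and with Skein_Start_New_Type(ctx,OUT_FINAL) before the output
**    stage; the macros themselves are defined below.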
******************************************************************/ /* tweak word T[1]: bit field starting positions */ #define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ #define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ #define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ #define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ /* tweak word T[1]: flag bit definition(s) */ #define SKEIN_T1_FLAG_FIRST (((u64b_t) 1 ) << SKEIN_T1_POS_FIRST) #define SKEIN_T1_FLAG_FINAL (((u64b_t) 1 ) << SKEIN_T1_POS_FINAL) /* tweak word T[1]: block type field */ #define SKEIN_BLK_TYPE_MSG (48) /* message processing */ #define SKEIN_BLK_TYPE_OUT (63) /* output stage */ #define SKEIN_T1_BLK_TYPE(T) (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) #define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ #define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ #define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((u64b_t) (hi32)) << 32)) #define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) /* ** Skein macros for setting tweak words, etc. **/ #define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} #define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) #define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) /* set both tweak words at once */ #define Skein_Set_T0_T1(ctxPtr,T0,T1) \ { \ Skein_Set_T0(ctxPtr,(T0)); \ Skein_Set_T1(ctxPtr,(T1)); \ } /* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */ #define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } /************************************************** ** "Internal" Skein definitions for error checking ***************************************************/ #include #define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */ #define Skein_assert(x) assert(x) /* internal error */ /***************************************************************** ** Skein block function constants (shared across Ref and Opt code) ******************************************************************/ enum { /* Skein_512 round rotation constants */ R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22, }; #ifdef __cplusplus } #endif #endif /* ifndef _SKEIN_H_ */ libgpuarray-0.7.6/src/util/strb.c000066400000000000000000000035571326743622600167540ustar00rootroot00000000000000#define _CRT_SECURE_NO_WARNINGS #include #include #ifdef _MSC_VER #include #define read _read #define write _write #else #include #endif #include "util/strb.h" strb *strb_alloc(size_t i) { strb *res = malloc(sizeof(strb)); if (res == NULL) return NULL; res->s = malloc(i); if (res->s == NULL) { free(res); return NULL; } res->a = i; res->l = 0; return res; } void strb_free(strb *sb) { free(sb->s); free(sb); } int strb_grow(strb *sb, size_t n) { char *s; if (strb_error(sb)) return -1; if (sb->a == 0 && n < 1024) n = 1024; if (sb->a > n) n 
= sb->a; /* overflow */ if (SIZE_MAX - sb->a < n) { strb_seterror(sb); return -1; } s = realloc(sb->s, sb->a + n); if (s == NULL) { strb_seterror(sb); return -1; } sb->s = s; sb->a += n; return 0; } void strb_appendf(strb *sb, const char *f, ...) { va_list ap; int s; va_start(ap, f); #ifdef _MSC_VER s = _vscprintf(f, ap); #else s = vsnprintf(NULL, 0, f, ap); #endif va_end(ap); if (s < 0) { strb_seterror(sb); return; } s += 1; if (strb_ensure(sb, s)) return; va_start(ap, f); s = vsnprintf(sb->s+sb->l, s, f, ap); va_end(ap); sb->l += s; } void strb_read(strb *sb, int fd, size_t sz) { ssize_t res; char *b; if (strb_ensure(sb, sz)) return; b = sb->s + sb->l; sb->l += sz; while (sz) { res = read(fd, b, sz); if (res == -1 || res == 0) { if (res == -1 && (errno == EAGAIN || errno == EINTR)) continue; strb_seterror(sb); return; } sz -= (size_t)res; b += (size_t)res; } } int strb_write(int fd, strb *sb) { ssize_t res; size_t l = sb->l; char *b = sb->s; while (l) { res = write(fd, b, l); if (res == -1) { if (errno == EAGAIN || errno == EINTR) continue; return -1; } l -= (size_t)res; b += (size_t)res; } return 0; } libgpuarray-0.7.6/src/util/strb.h000066400000000000000000000116641326743622600167570ustar00rootroot00000000000000#ifndef STRB_H #define STRB_H #include "private_config.h" #ifdef __cplusplus extern "C" { #endif #ifdef CONFUSE_EMACS } #endif /* * Main strb structure. * `s`: pointer to character data, not guaranteed to be nul-terminated. * `l`: current length of valid data in `s`. * `a`: current length of allocated data in `s`. Always >= l. */ typedef struct _strb { char *s; size_t l; size_t a; } strb; /* * Static initializer for a stack or globalc declaration of an strb. * Usage: * strb sb = STRB_STATIC_INIT; * * It is an error to leave an strb uninitialized. */ #define STRB_STATIC_INIT {NULL, 0, 0} /* * Return a pointer to a dynamically allocated strb with `s` bytes * preallocated in its data member. * * The returned pointer needs to be freed with strb_free(). * * Returns NULL on error. */ strb *strb_alloc(size_t s); /* * Frees an strb that was dynamically allocated. * * Don't call this for stack of global declarations, see strb_clear() instead. */ void strb_free(strb *sb); /* * Return a pointer to a dynamically allocated strb with a default * initial size. See strb_alloc() for defails. */ #define strb_new() strb_alloc(1024) /* * Resets the length to 0. Also clears error mode. */ static inline void strb_reset(strb *sb) { sb->l = 0; } /* * Place the strb in error mode where further attempts to append * data will silently fail. */ static inline int strb_seterror(strb *sb) { sb->l = (size_t)-1; return -1; } /* * Returns true if the strb is in error mode. */ static inline int strb_error(strb *sb) { return sb->l == (size_t)-1; } /* * Clear any allocation the strb may have done and reset all of its * members to the initial state. The strb can be used as new after * this call. */ static inline void strb_clear(strb *sb) { free(sb->s); sb->s = NULL; sb->a = 0; sb->l = 0; } /* * Grow the allocation of the strb by at least `s`. * * This should almost never be called directly. Use strb_ensure() * instead. */ int strb_grow(strb *sb, size_t s); /* * Make sure there is space to store at least `s` bytes of data after * the current data. * * Since the auto-allocation algorithm is tuned to small-ish strings * (below 4kb), it may be better from a performance point of view to * preallocate space yourself, using strb_ensure() with a large * number. 
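 *
 * A minimal sketch of such a preallocation (the 64 KB figure, and the i, n
 * and values names, are placeholders):
 *
 *   strb sb = STRB_STATIC_INIT;
 *   strb_ensure(&sb, 64 * 1024);
 *   for (i = 0; i < n; i++)
 *     strb_appendf(&sb, "%d\n", values[i]);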
*/ static inline int strb_ensure(strb *sb, size_t s) { if (strb_error(sb)) return -1; if (sb->a - sb->l < s) return strb_grow(sb, s); return 0; } /* * Append a character to the data. */ static inline void strb_appendc(strb *sb, char c) { if (strb_ensure(sb, 1)) return; sb->s[sb->l++] = c; } /* * Append a NUL ('\0') to the data. */ #define strb_append0(s) strb_appendc(s, '\0') /* * Appends `n` bytes from buffer `s`. */ static inline void strb_appendn(strb *sb, const char *s, size_t n) { if (strb_ensure(sb, n)) return; memcpy(sb->s+sb->l, s, n); sb->l += n; } /* * Appends the content of the nul-terminated string `s`, excluding the * final nul. */ static inline void strb_appends(strb *sb, const char *s) { strb_appendn(sb, s, strlen(s)); } /* * Appends the content of another strb. */ static inline void strb_appendb(strb *sb, const strb *sb2) { strb_appendn(sb, sb2->s, sb2->l); } /* * Appends the result of a sprintf using the format string `f` and * following arguments, excluding terminating nul. * * Unlike sprintf, this function makes sure not to run off the end of * memory and behaves like asprintf in that respect. * * A format error will place the strb in error mode. */ void strb_appendf(strb *sb, const char *f, ...); /* * Reads from the file specified by the given file descriptor. * * This will read `sz` bytes from the file descriptor. Insufficient * data is handled as a read error. * * A read error will place the strb in error mode. */ void strb_read(strb *sb, int fd, size_t sz); /* * Write the content of an strb to the specified file descriptor. * * Write errors will be signaled by a nonzero return value. */ int strb_write(int fd, strb *sb); /* * Returns a C string from the content of the strb. * * Returns the `s` member of the strb after ensuring that a * terminating nul is appended. This value must be freed with * free(). * * If the strb is in error mode, this function will clear it and * return NULL. * * The strb should not be reused after this function is called (nor * should it be cleared). * * This behavior makes it easy for functions that build a string and * return the result as a C string. */ static inline char *strb_cstr(strb *sb) { strb_append0(sb); if (strb_error(sb)) { strb_clear(sb); return NULL; } sb->l--; return sb->s; } #ifdef DEBUG /* * Use this for debugging. It prints the content of the strb to the C * stream fd. May give strange results if the string contains binary * data. */ static inline void strb_dump(strb *sb, FILE *fd) { if (!strb_error(sb)) fwrite(sb->s, sb->l, 1, fd); } #endif #ifdef __cplusplus } #endif #endif libgpuarray-0.7.6/src/util/xxhash.c000066400000000000000000000333461326743622600173040ustar00rootroot00000000000000/* xxHash - Fast Hash algorithm Copyright (C) 2012-2015, Yann Collet BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - xxHash source repository : https://github.com/Cyan4973/xxHash */ /* XXH_FORCE_NATIVE_FORMAT : * By default, xxHash library provides endian-independant Hash values, based on little-endian convention. * Results are therefore identical for little-endian and big-endian CPU. * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. * Should endian-independance be of no importance for your application, you may set the #define below to 1, * to improve speed for Big-endian CPU. * This option has no impact on Little_Endian CPU. */ #define XXH_FORCE_NATIVE_FORMAT 1 /* XXH_USELESS_ALIGN_BRANCH : * This is a minor performance trick, only useful with lots of very small keys. * It means : don't make a test between aligned/unaligned, because performance will be the same. * It saves one initial branch per hash. */ #if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) # define XXH_USELESS_ALIGN_BRANCH 1 #endif /************************************** * Compiler Specific Options ***************************************/ #ifdef _MSC_VER /* Visual Studio */ # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ # define FORCE_INLINE static __forceinline #else # if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ # ifdef __GNUC__ # define FORCE_INLINE static inline __attribute__((always_inline)) # else # define FORCE_INLINE static inline # endif # else # define FORCE_INLINE static # endif /* __STDC_VERSION__ */ #endif /************************************** * Includes & Memory related functions ***************************************/ #include "xxhash.h" /* Modify the local functions below should you wish to use some other memory routines */ /* for memcpy() */ #include static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } /************************************** * Basic Types ***************************************/ #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ # include typedef uint8_t BYTE; typedef uint16_t U16; typedef uint32_t U32; typedef int32_t S32; typedef uint64_t U64; #else typedef unsigned char BYTE; typedef unsigned short U16; typedef unsigned int U32; typedef signed int S32; typedef unsigned long long U64; #endif static U32 XXH_read32(const void* memPtr) { U32 val; memcpy(&val, memPtr, sizeof(val)); return val; } /****************************************** * Compiler-specific Functions and Macros ******************************************/ #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) /* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ #if defined(_MSC_VER) # define 
XXH_rotl32(x,r) _rotl(x,r) #else # define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) #endif #if defined(_MSC_VER) /* Visual Studio */ # define XXH_swap32 _byteswap_ulong #elif GCC_VERSION >= 403 # define XXH_swap32 __builtin_bswap32 #else static U32 XXH_swap32 (U32 x) { return ((x << 24) & 0xff000000 ) | ((x << 8) & 0x00ff0000 ) | ((x >> 8) & 0x0000ff00 ) | ((x >> 24) & 0x000000ff ); } #endif /*************************************** * Architecture Macros ***************************************/ typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example one the compiler command line */ #ifndef XXH_CPU_LITTLE_ENDIAN static const int one = 1; # define XXH_CPU_LITTLE_ENDIAN (*(const char*)(&one)) #endif /***************************** * Memory reads *****************************/ typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) { if (align==XXH_unaligned) return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); else return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); } FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) { return XXH_readLE32_align(ptr, endian, XXH_unaligned); } /*************************************** * Macros ***************************************/ #define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } /* use only *after* variable declarations */ /*************************************** * Constants ***************************************/ #define PRIME32_1 2654435761U #define PRIME32_2 2246822519U #define PRIME32_3 3266489917U #define PRIME32_4 668265263U #define PRIME32_5 374761393U /***************************** * Simple Hash Functions *****************************/ FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) { const BYTE* p = (const BYTE*)input; const BYTE* bEnd = p + len; U32 h32; #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) if (len>=16) { const BYTE* const limit = bEnd - 16; U32 v1 = seed + PRIME32_1 + PRIME32_2; U32 v2 = seed + PRIME32_2; U32 v3 = seed + 0; U32 v4 = seed - PRIME32_1; do { v1 += XXH_get32bits(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; v2 += XXH_get32bits(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; v3 += XXH_get32bits(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; v4 += XXH_get32bits(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; } while (p<=limit); h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); } else { h32 = seed + PRIME32_5; } h32 += (U32) len; while (p+4<=bEnd) { h32 += XXH_get32bits(p) * PRIME32_3; h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; p+=4; } while (p> 15; h32 *= PRIME32_2; h32 ^= h32 >> 13; h32 *= PRIME32_3; h32 ^= h32 >> 16; return h32; } unsigned int XXH32 (const void* input, size_t len, unsigned int seed) { #if 0 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ XXH32_state_t state; XXH32_reset(&state, seed); XXH32_update(&state, input, len); return XXH32_digest(&state); #else XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; # if !defined(XXH_USELESS_ALIGN_BRANCH) if ((((size_t)input) & 3) == 0) /* Input is 4-bytes aligned, leverage the speed benefit */ { if ((endian_detected==XXH_littleEndian) || 
XXH_FORCE_NATIVE_FORMAT) return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); else return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); } # endif if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); else return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); #endif } /**************************************************** * Advanced Hash Functions ****************************************************/ /*** Allocation ***/ typedef struct { U64 total_len; U32 seed; U32 v1; U32 v2; U32 v3; U32 v4; U32 mem32[4]; /* defined as U32 for alignment */ U32 memsize; } XXH_istate32_t; /*** Hash feed ***/ XXH_errorcode XXH32_reset(XXH32_state_t* state_in, unsigned int seed) { XXH_istate32_t* state = (XXH_istate32_t*) state_in; state->seed = seed; state->v1 = seed + PRIME32_1 + PRIME32_2; state->v2 = seed + PRIME32_2; state->v3 = seed + 0; state->v4 = seed - PRIME32_1; state->total_len = 0; state->memsize = 0; return XXH_OK; } FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian) { XXH_istate32_t* state = (XXH_istate32_t *) state_in; const BYTE* p = (const BYTE*)input; const BYTE* const bEnd = p + len; #ifdef XXH_ACCEPT_NULL_INPUT_POINTER if (input==NULL) return XXH_ERROR; #endif state->total_len += len; if (state->memsize + len < 16) /* fill in tmp buffer */ { XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); state->memsize += (U32)len; return XXH_OK; } if (state->memsize) /* some data left from previous update */ { XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); { const U32* p32 = state->mem32; state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++; state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++; state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++; state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++; } p += 16-state->memsize; state->memsize = 0; } if (p <= bEnd-16) { const BYTE* const limit = bEnd - 16; U32 v1 = state->v1; U32 v2 = state->v2; U32 v3 = state->v3; U32 v4 = state->v4; do { v1 += XXH_readLE32(p, endian) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; v2 += XXH_readLE32(p, endian) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; v3 += XXH_readLE32(p, endian) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; v4 += XXH_readLE32(p, endian) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; } while (p<=limit); state->v1 = v1; state->v2 = v2; state->v3 = v3; state->v4 = v4; } if (p < bEnd) { XXH_memcpy(state->mem32, p, bEnd-p); state->memsize = (int)(bEnd-p); } return XXH_OK; } XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len) { XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) return XXH32_update_endian(state_in, input, len, XXH_littleEndian); else return XXH32_update_endian(state_in, input, len, XXH_bigEndian); } FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian) { const XXH_istate32_t* state = (const XXH_istate32_t*) state_in; const BYTE * p = (const 
BYTE*)state->mem32; const BYTE* bEnd = (const BYTE*)(state->mem32) + state->memsize; U32 h32; if (state->total_len >= 16) { h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); } else { h32 = state->seed + PRIME32_5; } h32 += (U32) state->total_len; while (p+4<=bEnd) { h32 += XXH_readLE32(p, endian) * PRIME32_3; h32 = XXH_rotl32(h32, 17) * PRIME32_4; p+=4; } while (p> 15; h32 *= PRIME32_2; h32 ^= h32 >> 13; h32 *= PRIME32_3; h32 ^= h32 >> 16; return h32; } unsigned int XXH32_digest (const XXH32_state_t* state_in) { XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) return XXH32_digest_endian(state_in, XXH_littleEndian); else return XXH32_digest_endian(state_in, XXH_bigEndian); } libgpuarray-0.7.6/src/util/xxhash.h000066400000000000000000000113511326743622600173010ustar00rootroot00000000000000#ifndef XXHASH_H #define XXHASH_H /* xxHash - Extremely Fast Hash algorithm Header File Copyright (C) 2012-2015, Yann Collet. BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - xxHash source repository : https://github.com/Cyan4973/xxHash */ /* Notice extracted from xxHash homepage : xxHash is an extremely fast Hash algorithm, running at RAM speed limits. It also successfully passes all tests from the SMHasher suite. Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) Name Speed Q.Score Author xxHash 5.4 GB/s 10 CrapWow 3.2 GB/s 2 Andrew MumurHash 3a 2.7 GB/s 10 Austin Appleby SpookyHash 2.0 GB/s 10 Bob Jenkins SBox 1.4 GB/s 9 Bret Mulvey Lookup3 1.2 GB/s 9 Bob Jenkins SuperFastHash 1.2 GB/s 1 Paul Hsieh CityHash64 1.05 GB/s 10 Pike & Alakuijala FNV 0.55 GB/s 5 Fowler, Noll, Vo CRC32 0.43 GB/s 9 MD5-32 0.33 GB/s 10 Ronald L. Rivest SHA1-32 0.28 GB/s 10 Q.Score is a measure of quality of the hash function. It depends on successfully passing SMHasher test set. 10 is a perfect score. 
*/ #if defined (__cplusplus) extern "C" { #endif /***************************** * Definitions *****************************/ #include "gpuarray/config.h" typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; /***************************** * Simple Hash Functions *****************************/ unsigned int XXH32 (const void* input, size_t length, unsigned seed); /* XXH32() : Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input". The memory between input & input+length must be valid (allocated and read-accessible). "seed" can be used to alter the result predictably. This function successfully passes all SMHasher tests. Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */ /***************************** * Advanced Hash Functions *****************************/ typedef struct { long long ll[ 6]; } XXH32_state_t; /* These structures allow static allocation of XXH states. States must then be initialized using XXH32_reset() before first use. */ XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned seed); XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); unsigned int XXH32_digest (const XXH32_state_t* statePtr); /* These functions calculate the xxHash of an input provided in multiple smaller packets, as opposed to an input provided as a single block. XXH state space must first be allocated, using either static or dynamic method provided above. Start a new hash by initializing state with a seed, using XXHnn_reset(). Then, feed the hash state by calling XXHnn_update() as many times as necessary. Obviously, input must be valid, meaning allocated and read accessible. The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. Finally, you can produce a hash anytime, by using XXHnn_digest(). This function returns the final nn-bits hash. You can nonetheless continue feeding the hash state with more input, and therefore get some new hashes, by calling again XXHnn_digest(). When you are done, don't forget to free XXH state space, using typically XXHnn_freeState(). 
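
A minimal sketch of that sequence with a statically allocated state (the
chunk/len/hash32 names are placeholders, not part of this header):

  XXH32_state_t state;
  XXH32_reset (&state, 0);
  XXH32_update(&state, chunk1, len1);
  XXH32_update(&state, chunk2, len2);
  hash32 = XXH32_digest(&state);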
*/ #if defined (__cplusplus) } #endif #endif libgpuarray-0.7.6/tests/000077500000000000000000000000001326743622600152225ustar00rootroot00000000000000libgpuarray-0.7.6/tests/CMakeLists.txt000066400000000000000000000121551326743622600177660ustar00rootroot00000000000000include(CheckCSourceCompiles) include(CheckLibraryExists) find_package(PkgConfig) pkg_search_module(CHECK check) if(CHECK_FOUND) if(CHECK_VERSION VERSION_LESS 0.10.0) MESSAGE( "Check version older than 0.10.0" ) set(CHECK_FOUND 0) endif() else() find_path(CHECK_INCLUDE_DIRS check.h) find_library(CHECK_LIBRARIES NAMES check) if(CHECK_INCLUDE_DIRS AND CHECK_LIBRARIES) set(CHECK_CFLAGS) set(CHECK_LIBRARY_DIRS) set(CHECK_FOUND 1) endif() if(CHECK_FOUND) set(CMAKE_REQUIRED_FLAGS ${CHECK_C_FLAGS} ${CHECK_LDFLAGS_OTHERS}) set(CMAKE_REQUIRED_INCLUDES ${CHECK_INCLUDE_DIRS}) CHECK_LIBRARY_EXISTS(pthread pthread_create "" HAVE_PTHREAD) if (HAVE_PTHREAD) set(CHECK_LIBRARIES ${CHECK_LIBRARIES} pthread) endif (HAVE_PTHREAD) CHECK_LIBRARY_EXISTS(rt nanosleep "" HAVE_LIBRT) if (HAVE_LIBRT) set(CHECK_LIBRARIES ${CHECK_LIBRARIES} rt) endif (HAVE_LIBRT) CHECK_LIBRARY_EXISTS(m cos "" HAVE_LIBM) if (HAVE_LIBM) set(CHECK_LIBRARIES ${CHECK_LIBRARIES} m) endif (HAVE_LIBM) set(CMAKE_REQUIRED_LIBRARIES ${CHECK_LIBRARIES}) CHECK_C_SOURCE_COMPILES( "#include int main() { ck_assert_ptr_ne(NULL, NULL); }" CHECK_FUNCS) if (NOT CHECK_FUNCS) set(CHECK_FOUND 0) endif() endif() endif() if(CHECK_FOUND) enable_testing() include_directories("${CMAKE_SOURCE_DIR}/src") include_directories("${CMAKE_CURRENT_SOURCE_DIR}") include_directories(${CHECK_INCLUDE_DIRS}) link_directories(${CHECK_LIBRARY_DIRS}) foreach(flag ${CHECK_C_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}") endforeach() foreach(flag ${CHECK_LDFLAGS_OTHER}) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}") endforeach() add_executable(check_types main.c check_types.c) target_link_libraries(check_types ${CHECK_LIBRARIES} gpuarray) add_test(test_types "${CMAKE_CURRENT_BINARY_DIR}/check_types") add_executable(check_util main.c check_util.c) target_link_libraries(check_util ${CHECK_LIBRARIES} gpuarray) add_test(test_util "${CMAKE_CURRENT_BINARY_DIR}/check_util") add_executable(check_util_integerfactoring main.c check_util_integerfactoring.c) target_link_libraries(check_util_integerfactoring ${CHECK_LIBRARIES} gpuarray-static) add_test(test_util_integerfactoring "${CMAKE_CURRENT_BINARY_DIR}/check_util_integerfactoring") add_executable(check_reduction main.c device.c check_reduction.c) target_link_libraries(check_reduction ${CHECK_LIBRARIES} gpuarray) add_test(test_reduction "${CMAKE_CURRENT_BINARY_DIR}/check_reduction") add_executable(check_array main.c device.c check_array.c) target_link_libraries(check_array ${CHECK_LIBRARIES} gpuarray) add_test(test_array "${CMAKE_CURRENT_BINARY_DIR}/check_array") add_executable(check_blas main.c device.c check_blas.c) target_link_libraries(check_blas ${CHECK_LIBRARIES} gpuarray) add_test(test_blas "${CMAKE_CURRENT_BINARY_DIR}/check_blas") add_executable(check_elemwise main.c device.c check_elemwise.c) target_link_libraries(check_elemwise ${CHECK_LIBRARIES} gpuarray) add_test(test_elemwise "${CMAKE_CURRENT_BINARY_DIR}/check_elemwise") add_executable(check_error main.c check_error.c) target_link_libraries(check_error ${CHECK_LIBRARIES} gpuarray) add_test(test_error "${CMAKE_CURRENT_BINARY_DIR}/check_error") add_executable(check_buffer main.c device.c check_buffer.c) target_link_libraries(check_buffer ${CHECK_LIBRARIES} gpuarray) add_test(test_buffer 
"${CMAKE_CURRENT_BINARY_DIR}/check_buffer") find_package(MPI) if (MPI_C_FOUND) add_executable(check_buffer_collectives main.c device.c communicator.c check_buffer_collectives.c ) target_link_libraries(check_buffer_collectives ${CHECK_LIBRARIES} ${MPI_C_LIBRARIES} gpuarray ) target_include_directories(check_buffer_collectives PRIVATE ${MPI_C_INCLUDE_PATH} ) add_executable(check_collectives main.c device.c communicator.c check_collectives.c ) target_link_libraries(check_collectives ${CHECK_LIBRARIES} ${MPI_C_LIBRARIES} gpuarray ) target_include_directories(check_collectives PRIVATE ${MPI_C_INCLUDE_PATH} ) set_target_properties(check_buffer_collectives check_collectives PROPERTIES COMPILE_DEFINITIONS TEST_COLLECTIVES COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}" LINK_FLAGS "${MPI_C_LINK_FLAGS}" ) set(_NUM_DEVS $ENV{NUM_DEVS}) if(NOT _NUM_DEVS) set(_NUM_DEVS 1) endif() set(_DEV_NAMES $ENV{DEV_NAMES}) if(NOT _DEV_NAMES) set(_DEV_NAMES "cuda") endif() separate_arguments(_DEV_NAMES) add_test(NAME test_buffer_collectives COMMAND "${MPIEXEC}" ${MPIEXEC_NUMPROC_FLAG} ${_NUM_DEVS} ${MPIEXEC_PREFLAGS} "${CMAKE_CURRENT_BINARY_DIR}/check_buffer_collectives" ${MPIEXEC_POSTFLAGS} ${_DEV_NAMES}) add_test(NAME test_collectives COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${_NUM_DEVS} ${MPIEXEC_PREFLAGS} "${CMAKE_CURRENT_BINARY_DIR}/check_collectives" ${MPIEXEC_POSTFLAGS} ${_DEV_NAMES}) else() message(WARNING "Cannot find MPI") message(WARNING "Checks on collectives and buffer_collectives will not be built or performed.") endif() ELSE(CHECK_FOUND) MESSAGE("Tests disabled because Check was not found") ENDIF(CHECK_FOUND) libgpuarray-0.7.6/tests/check_array.c000066400000000000000000000171471326743622600176530ustar00rootroot00000000000000#include #include #include #include "gpuarray/array.h" #include "gpuarray/error.h" #include "gpuarray/types.h" extern void *ctx; void setup(void); void teardown(void); #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) START_TEST(test_take1_ok) { GpuArray base; GpuArray idx; GpuArray res; GpuArray v; GpuArray vidx; GpuArray vres; const uint32_t data[24] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}; uint32_t buf[12 * 24]; const size_t data_dims[1] = {24}; long indexes[12]; size_t dims[3]; ga_assert_ok(GpuArray_empty(&base, ctx, GA_UINT, 1, data_dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&base, data, sizeof(data))); dims[0] = 12; ga_assert_ok(GpuArray_empty(&idx, ctx, GA_LONG, 1, dims, GA_C_ORDER)); dims[1] = 6; ga_assert_ok(GpuArray_empty(&res, ctx, GA_UINT, 2, dims, GA_C_ORDER)); /* test v[[1, 0]] on 1d (4) */ indexes[0] = 1; indexes[1] = 0; ga_assert_ok(GpuArray_write(&idx, indexes, sizeof(long) * 2)); ga_assert_ok(GpuArray_view(&v, &base)); ga_assert_ok(GpuArray_view(&vidx, &idx)); ga_assert_ok(GpuArray_view(&vres, &res)); v.dimensions[0] = 4; GpuArray_fix_flags(&v); vidx.dimensions[0] = 2; GpuArray_fix_flags(&vidx); vres.nd = 1; vres.dimensions[0] = vidx.dimensions[0]; vres.strides[0] = v.strides[0]; GpuArray_fix_flags(&vres); ga_assert_ok(GpuArray_take1(&vres, &v, &vidx, 0)); ga_assert_ok(GpuArray_read(buf, sizeof(uint32_t) * 2, &vres)); ck_assert(buf[0] == 1); ck_assert(buf[1] == 0); /* test v[[2, 3, -1]] on 2d (4, 5) */ GpuArray_clear(&v); GpuArray_clear(&vidx); GpuArray_clear(&vres); indexes[0] = 2; indexes[1] = 3; indexes[2] = -1; ga_assert_ok(GpuArray_write(&idx, indexes, sizeof(ssize_t) * 3)); ga_assert_ok(GpuArray_view(&v, &base)); ga_assert_ok(GpuArray_view(&vidx, &idx)); ga_assert_ok(GpuArray_view(&vres, &res)); 
vidx.dimensions[0] = 3; GpuArray_fix_flags(&vidx); dims[0] = 4; dims[1] = 6; ga_assert_ok(GpuArray_reshape_inplace(&v, 2, dims, GA_ANY_ORDER)); v.dimensions[1] = 5; v.strides[0] = v.dimensions[1] * v.strides[1]; GpuArray_fix_flags(&v); dims[0] = 3; dims[1] = 24; ga_assert_ok(GpuArray_reshape_inplace(&vres, 2, dims, GA_C_ORDER)); vres.dimensions[1] = v.dimensions[1]; vres.strides[0] = v.strides[0]; GpuArray_fix_flags(&vres); ga_assert_ok(GpuArray_take1(&vres, &v, &vidx, 0)); ga_assert_ok(GpuArray_read(buf, sizeof(uint32_t) * 15, &vres)); ck_assert(buf[0] == 10); ck_assert(buf[1] == 11); ck_assert(buf[2] == 12); ck_assert(buf[3] == 13); ck_assert(buf[4] == 14); ck_assert(buf[5] == 15); ck_assert(buf[6] == 16); ck_assert(buf[7] == 17); ck_assert(buf[8] == 18); ck_assert(buf[9] == 19); ck_assert(buf[10] == 15); ck_assert(buf[11] == 16); ck_assert(buf[12] == 17); ck_assert(buf[13] == 18); ck_assert(buf[14] == 19); /* test v[[3, 3, 1, 1, 2, 2, 0, 0, -1, -2, -3, -4]] on 3d */ GpuArray_clear(&v); GpuArray_clear(&vidx); GpuArray_clear(&vres); indexes[0] = 3; indexes[1] = 3; indexes[2] = 1; indexes[3] = 1; indexes[4] = 2; indexes[5] = 2; indexes[6] = 0; indexes[7] = 0; indexes[8] = -1; indexes[9] = -2; indexes[10] = -3; indexes[11] = -4; ga_assert_ok(GpuArray_write(&idx, indexes, sizeof(indexes))); ga_assert_ok(GpuArray_view(&v, &base)); ga_assert_ok(GpuArray_view(&vidx, &idx)); ga_assert_ok(GpuArray_view(&vres, &res)); dims[0] = 4; dims[1] = 2; dims[2] = 3; ga_assert_ok(GpuArray_reshape_inplace(&v, 3, dims, GA_ANY_ORDER)); dims[0] = 12; dims[1] = 2; dims[2] = 3; ga_assert_ok(GpuArray_reshape_inplace(&vres, 3, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_take1(&vres, &v, &vidx, 0)); ga_assert_ok(GpuArray_read(buf, sizeof(uint32_t) * 72, &vres)); /* 0 */ ck_assert(buf[0] == 18); ck_assert(buf[1] == 19); ck_assert(buf[2] == 20); ck_assert(buf[3] == 21); ck_assert(buf[4] == 22); ck_assert(buf[5] == 23); /* 1 */ ck_assert(buf[6] == 18); ck_assert(buf[7] == 19); ck_assert(buf[8] == 20); ck_assert(buf[9] == 21); ck_assert(buf[10] == 22); ck_assert(buf[11] == 23); /* 2 */ ck_assert(buf[12] == 6); ck_assert(buf[13] == 7); ck_assert(buf[14] == 8); ck_assert(buf[15] == 9); ck_assert(buf[16] == 10); ck_assert(buf[17] == 11); /* 3 */ ck_assert(buf[18] == 6); ck_assert(buf[19] == 7); ck_assert(buf[20] == 8); ck_assert(buf[21] == 9); ck_assert(buf[22] == 10); ck_assert(buf[23] == 11); /* 4 */ ck_assert(buf[24] == 12); ck_assert(buf[25] == 13); ck_assert(buf[26] == 14); ck_assert(buf[27] == 15); ck_assert(buf[28] == 16); ck_assert(buf[29] == 17); /* 5 */ ck_assert(buf[30] == 12); ck_assert(buf[31] == 13); ck_assert(buf[32] == 14); ck_assert(buf[33] == 15); ck_assert(buf[34] == 16); ck_assert(buf[35] == 17); /* 6 */ ck_assert(buf[36] == 0); ck_assert(buf[37] == 1); ck_assert(buf[38] == 2); ck_assert(buf[39] == 3); ck_assert(buf[40] == 4); ck_assert(buf[41] == 5); /* 7 */ ck_assert(buf[42] == 0); ck_assert(buf[43] == 1); ck_assert(buf[44] == 2); ck_assert(buf[45] == 3); ck_assert(buf[46] == 4); ck_assert(buf[47] == 5); /* 8 */ ck_assert(buf[48] == 18); ck_assert(buf[49] == 19); ck_assert(buf[50] == 20); ck_assert(buf[51] == 21); ck_assert(buf[52] == 22); ck_assert(buf[53] == 23); /* 9 */ ck_assert(buf[54] == 12); ck_assert(buf[55] == 13); ck_assert(buf[56] == 14); ck_assert(buf[57] == 15); ck_assert(buf[58] == 16); ck_assert(buf[59] == 17); /* 10 */ ck_assert(buf[60] == 6); ck_assert(buf[61] == 7); ck_assert(buf[62] == 8); ck_assert(buf[63] == 9); ck_assert(buf[64] == 10); ck_assert(buf[65] == 11); /* 11 */ 
ck_assert(buf[66] == 0); ck_assert(buf[67] == 1); ck_assert(buf[68] == 2); ck_assert(buf[69] == 3); ck_assert(buf[70] == 4); ck_assert(buf[71] == 5); } END_TEST START_TEST(test_take1_offset) { const uint32_t data[4] = {0, 1, 2, 3}; const size_t data_dims[1] = {4}; const size_t out_dims[1] = {2}; const uint32_t idx[4] = {20, 3, 3, 2}; GpuArray v; GpuArray i; GpuArray r; ga_assert_ok(GpuArray_empty(&v, ctx, GA_UINT, 1, data_dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&v, data, sizeof(data))); ga_assert_ok(GpuArray_empty(&i, ctx, GA_UINT, 1, data_dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&i, idx, sizeof(idx))); ga_assert_ok(GpuArray_empty(&r, ctx, GA_UINT, 1, out_dims, GA_C_ORDER)); /* Fake subtensor for offset */ i.offset += 8; i.dimensions[0] = 2; GpuArray_fix_flags(&i); ga_assert_ok(GpuArray_take1(&r, &v, &i, 1)); /* The actual results are not important, this is just to check that we don't trigger the out of bounds check */ } END_TEST START_TEST(test_reshape_0) { /* This tests that we don't segfault when reshaping 0-sized arrays */ const size_t odims[3] = {24, 0, 33}; const size_t ndims1[3] = {0, 24, 33}; const size_t ndims2[3] = {24, 33, 0}; GpuArray v; ga_assert_ok(GpuArray_empty(&v, ctx, GA_FLOAT, 3, odims, GA_C_ORDER)); ga_assert_ok(GpuArray_reshape_inplace(&v, 3, ndims1, GA_ANY_ORDER)); ga_assert_ok(GpuArray_reshape_inplace(&v, 3, odims, GA_ANY_ORDER)); ga_assert_ok(GpuArray_reshape_inplace(&v, 3, ndims2, GA_ANY_ORDER)); } END_TEST Suite *get_suite(void) { Suite *s = suite_create("array"); TCase *tc = tcase_create("take1"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 8.0); tcase_add_test(tc, test_take1_ok); tcase_add_test(tc, test_take1_offset); tcase_add_test(tc, test_reshape_0); suite_add_tcase(s, tc); return s; } libgpuarray-0.7.6/tests/check_blas.c000066400000000000000000000071361326743622600174530ustar00rootroot00000000000000#include #include #include "gpuarray/array.h" #include "gpuarray/blas.h" #include "gpuarray/error.h" #include "gpuarray/types.h" extern void *ctx; void setup(void); void teardown(void); #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) static inline void ck_assert_fbuf_eq(const float *b, const float *r, unsigned int n) { unsigned int i; for (i = 0; i < n; i++) { ck_assert_msg(b[i] == r[i], "Difference at %u: %f != %f(ref)", i, b[i], r[i]); } } START_TEST(test_gemmBatch_3d_C) { GpuArray A; GpuArray B; GpuArray C; size_t dims[3] = {2, 3, 3}; float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9}; const float res[] = {30, 36, 42, 66, 81, 96, 102, 126, 150, 30, 36, 42, 66, 81, 96, 102, 126, 150}; ga_assert_ok(GpuArray_empty(&A, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&B, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&C, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&A, data, sizeof(data))); ga_assert_ok(GpuArray_write(&B, data, sizeof(data))); ga_assert_ok(GpuArray_rgemmBatch_3d(cb_no_trans, cb_no_trans, 1, &A, &B, 0, &C, 1)); ga_assert_ok(GpuArray_read(data, sizeof(data), &C)); ck_assert_fbuf_eq(data, res, sizeof(res)/sizeof(float)); } END_TEST START_TEST(test_gemmBatch_3d_F) { GpuArray A; GpuArray B; GpuArray C; size_t dims[3] = {2, 3, 3}; float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9}; const float res[] = {42, 78, 78, 60, 114, 114, 51, 69, 96, 66, 39, 111, 54, 54, 90, 78, 78, 132}; ga_assert_ok(GpuArray_empty(&A, ctx, GA_FLOAT, 3, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_empty(&B, ctx, GA_FLOAT, 3, dims, 
GA_F_ORDER)); ga_assert_ok(GpuArray_empty(&C, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&A, data, sizeof(data))); ga_assert_ok(GpuArray_write(&B, data, sizeof(data))); ga_assert_ok(GpuArray_rgemmBatch_3d(cb_no_trans, cb_no_trans, 1, &A, &B, 0, &C, 0)); ga_assert_ok(GpuArray_read(data, sizeof(data), &C)); ck_assert_fbuf_eq(data, res, sizeof(res)/sizeof(float)); } END_TEST START_TEST(test_gemmBatch_3d_S) { GpuArray A; GpuArray B; GpuArray C; ssize_t t; size_t dims[3] = {2, 3, 3}; float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9}; const float res[] = {14, 32, 50, 50, 122, 194, 32, 77, 122, 26, 62, 98, 17, 53, 89, 44, 107, 170}; ga_assert_ok(GpuArray_empty(&A, ctx, GA_FLOAT, 3, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_empty(&B, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&C, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&A, data, sizeof(data))); ga_assert_ok(GpuArray_write(&B, data, sizeof(data))); A.strides[0] = 8; A.strides[1] = 24; A.strides[2] = 4; GpuArray_fix_flags(&A); t = B.strides[1]; B.strides[1] = B.strides[2]; B.strides[2] = t; GpuArray_fix_flags(&B); ga_assert_ok(GpuArray_rgemmBatch_3d(cb_no_trans, cb_no_trans, 1, &A, &B, 0, &C, 1)); ga_assert_ok(GpuArray_read(data, sizeof(data), &C)); ck_assert_fbuf_eq(data, res, sizeof(res)/sizeof(float)); } END_TEST Suite *get_suite(void) { Suite *s = suite_create("blas"); TCase *tc = tcase_create("all"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 16.0); tcase_add_test(tc, test_gemmBatch_3d_C); tcase_add_test(tc, test_gemmBatch_3d_F); tcase_add_test(tc, test_gemmBatch_3d_S); suite_add_tcase(s, tc); return s; } libgpuarray-0.7.6/tests/check_buffer.c000066400000000000000000000107751326743622600200060ustar00rootroot00000000000000#include #include "gpuarray/buffer.h" #include "gpuarray/error.h" #include "private.h" extern void *ctx; void setup(void); void teardown(void); static unsigned int refcnt(gpudata *b) { unsigned int res; int err; err = gpudata_property(b, GA_BUFFER_PROP_REFCNT, &res); ck_assert(err == GA_NO_ERROR); return res; } START_TEST(test_buffer_alloc) { gpudata *d; d = gpudata_alloc(ctx, 0, NULL, 0, NULL); ck_assert(d != NULL); ck_assert_int_eq(refcnt(d), 1); gpudata_release(d); d = gpudata_alloc(ctx, 1, NULL, 0, NULL); ck_assert(d != NULL); ck_assert_int_eq(refcnt(d), 1); gpudata_release(d); d = gpudata_alloc(ctx, 1024, NULL, 0, NULL); ck_assert(d != NULL); ck_assert_int_eq(refcnt(d), 1); gpudata_release(d); } END_TEST START_TEST(test_buffer_retain_release) { gpudata *d; gpudata *d2; d = gpudata_alloc(ctx, 1024, NULL, 0, NULL); ck_assert(d != NULL); ck_assert_int_eq(refcnt(d), 1); d2 = gpudata_alloc(ctx, 1024, NULL, 0, NULL); ck_assert(d2 != NULL); ck_assert_int_eq(refcnt(d2), 1); gpudata_retain(d); ck_assert_int_eq(refcnt(d), 2); gpudata_release(d); ck_assert_int_eq(refcnt(d), 1); gpudata_retain(d); gpudata_retain(d2); gpudata_retain(d); ck_assert_int_eq(refcnt(d), 3); ck_assert_int_eq(refcnt(d2), 2); gpudata_release(d); ck_assert_int_eq(refcnt(d), 2); ck_assert_int_eq(refcnt(d2), 2); gpudata_release(d); gpudata_release(d2); ck_assert_int_eq(refcnt(d), 1); ck_assert_int_eq(refcnt(d2), 1); gpudata_release(d); ck_assert_int_eq(refcnt(d2), 1); gpudata_release(d2); } END_TEST START_TEST(test_buffer_share) { gpudata *d; gpudata *d2; d = gpudata_alloc(ctx, 1024, NULL, 0, NULL); ck_assert(d != NULL); d2 = gpudata_alloc(ctx, 1024, NULL, 0, NULL); ck_assert(d2 != NULL); ck_assert_int_eq(gpudata_share(d, d2, NULL), 0); 
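  /* A buffer always shares memory with itself. */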
ck_assert_int_eq(gpudata_share(d, d, NULL), 1); } END_TEST START_TEST(test_buffer_read_write) { const int32_t data[] = {0, 1, 2, 3, 4, 5, 6, 7}; int32_t buf[nelems(data)]; gpudata *d; int err; unsigned int i; d = gpudata_alloc(ctx, sizeof(data), NULL, 0, NULL); ck_assert(d != NULL); err = gpudata_write(d, 0, data, sizeof(data)); ck_assert_int_eq(err, GA_NO_ERROR); memset(buf, 0, sizeof(data)); err = gpudata_read(buf, d, 0, sizeof(data)); ck_assert_int_eq(err, GA_NO_ERROR); for (i = 0; i < nelems(data); i++) { ck_assert_int_eq(data[i], buf[i]); } memset(buf, 0, sizeof(data)); err = gpudata_read(buf, d, sizeof(int32_t), sizeof(data) - sizeof(int32_t)); ck_assert_int_eq(err, GA_NO_ERROR); for (i = 0; i < nelems(data) - 1; i++) { ck_assert_int_eq(data[i + 1], buf[i]); } err = gpudata_write(d, sizeof(int32_t) * 2, data, sizeof(data) - (sizeof(int32_t) * 2)); ck_assert_int_eq(err, GA_NO_ERROR); memset(buf, 0, sizeof(data)); err = gpudata_read(buf, d, 0, sizeof(data)); ck_assert_int_eq(err, GA_NO_ERROR); for (i = 0; i < nelems(data) - 2; i++) { ck_assert_int_eq(data[i], buf[i + 2]); } for (i = 0; i < 2; i++) { ck_assert_int_eq(data[i], buf[i]); } gpudata_release(d); } END_TEST START_TEST(test_buffer_move) { const int32_t data[] = {0, 1, 2, 3, 4, 5, 6, 7}; int32_t buf[nelems(data)]; gpudata *d; gpudata *d2; int err; unsigned int i; d = gpudata_alloc(ctx, sizeof(data), NULL, 0, NULL); ck_assert(d != NULL); d2 = gpudata_alloc(ctx, sizeof(data) * 2, NULL, 0, NULL); ck_assert(d2 != NULL); err = gpudata_write(d, 0, data, sizeof(data)); ck_assert(err == GA_NO_ERROR); err = gpudata_move(d2, sizeof(data), d, 0, sizeof(data)); ck_assert(err == GA_NO_ERROR); err = gpudata_read(buf, d2, sizeof(data), sizeof(data)); ck_assert(err == GA_NO_ERROR); for (i = 0; i < nelems(data); i++) { ck_assert_int_eq(buf[i], data[i]); } err = gpudata_move(d2, 0, d, sizeof(uint32_t), sizeof(data) - sizeof(uint32_t)); ck_assert(err == GA_NO_ERROR); err = gpudata_read(buf, d2, 0, sizeof(data)); ck_assert(err == GA_NO_ERROR); for (i = 0; i < nelems(data) - 1; i++) { ck_assert_int_eq(buf[i], data[i + 1]); } gpudata_release(d); gpudata_release(d2); } END_TEST Suite *get_suite(void) { Suite *s = suite_create("buffer"); TCase *tc = tcase_create("API"); tcase_add_checked_fixture(tc, setup, teardown); tcase_add_test(tc, test_buffer_alloc); tcase_add_test(tc, test_buffer_retain_release); tcase_add_test(tc, test_buffer_share); tcase_add_test(tc, test_buffer_read_write); tcase_add_test(tc, test_buffer_move); suite_add_tcase(s, tc); return s; } libgpuarray-0.7.6/tests/check_buffer_collectives.c000066400000000000000000001151621326743622600223760ustar00rootroot00000000000000#include #include #include #include #include #include #include "gpuarray/buffer.h" #include "gpuarray/buffer_collectives.h" #include "gpuarray/error.h" #include "gpuarray/types.h" #define SIZE 128 #define ROOT_RANK 0 #define EPS 1.0e-9 extern gpucontext* ctx; extern gpucomm* comm; extern int comm_ndev; extern int comm_rank; extern void setup_comm(void); extern void teardown_comm(void); #define STR(x) _STR(x) #define _STR(x) #x #define ABS_DIFF(a, b) fabs((double)(b - a)) #define MAX_ABS_DIFF(A, B, N, res) \ do { \ double locdelta; \ int loci; \ res = 0; \ for (loci = 0; loci < N; ++loci) { \ locdelta = ABS_DIFF(A[loci], B[loci]); \ if (locdelta > res) \ res = locdelta; \ } \ } while (0) typedef unsigned long ulong; #define PRINTV(ar, N, t) \ do { \ int li; \ printf("%s\n", STR(ar)); \ for (li = 0; li < (N); ++li) { \ printf(STR(t) " ", ar[li]); \ } \ printf("\n"); \ 
printf("\n"); \ } while (0) #define PRINTVF(ar, N) PRINTV(ar, N, %.2f) #define PRINTVI(ar, N) PRINTV(ar, N, %i) #define PRINTVL(ar, N) PRINTV(ar, N, %li) #define PRINTVUL(ar, N) PRINTV(ar, N, %lu) /******************************************************************************* * Test helper buffer functions for collectives * *******************************************************************************/ START_TEST(test_gpucomm_get_count) { int gpucount = 0, err = 0; err = gpucomm_get_count(comm, &gpucount); ck_assert_int_eq(err, GA_NO_ERROR); ck_assert_int_eq(gpucount, comm_ndev); } END_TEST START_TEST(test_gpucomm_get_rank) { int rank = 0, err = 0; err = gpucomm_get_rank(comm, &rank); ck_assert_int_eq(err, GA_NO_ERROR); ck_assert_int_eq(rank, comm_rank); } END_TEST /******************************************************************************* * Test buffer collective functions * *******************************************************************************/ #define INIT_ARRAYS(insize, outsize) \ int err; \ void* Av, * RESv, * EXPv; \ gpudata* Adev, *RESdev; \ \ Av = calloc((insize), sizeof(char)); \ if (Av == NULL) \ ck_abort_msg("system memory allocation failed"); \ RESv = calloc((outsize), sizeof(char)); \ if (RESv == NULL) \ ck_abort_msg("system memory allocation failed"); \ EXPv = calloc((outsize), sizeof(char)); \ if (EXPv == NULL) \ ck_abort_msg("system memory allocation failed"); \ Adev = gpudata_alloc(ctx, (insize), NULL, 0, &err); \ ck_assert_ptr_ne(Adev, NULL); \ RESdev = gpudata_alloc(ctx, (outsize), NULL, 0, &err); \ ck_assert_ptr_ne(RESdev, NULL); #define DESTROY_ARRAYS() \ free(Av); \ free(RESv); \ free(EXPv); \ gpudata_release(Adev); \ gpudata_release(RESdev); #define TEST_REDUCE(systype, gatype, mpitype, coloptype, epsilon, print) \ START_TEST(test_gpucomm_reduce_##gatype##_##coloptype) { \ systype* A, * RES, * EXP; \ int i, count; \ INIT_ARRAYS(SIZE, SIZE) \ \ A = (systype*)Av; \ RES = (systype*)RESv; \ EXP = (systype*)EXPv; \ \ count = SIZE / sizeof(systype); \ for (i = 0; i < count; ++i) \ A[i] = comm_rank + 2; \ err = gpudata_write(Adev, 0, A, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ \ err = gpucomm_reduce(Adev, 0, RESdev, 0, count, GA_##gatype, \ GA_##coloptype, ROOT_RANK, comm); \ ck_assert_int_eq(err, GA_NO_ERROR); \ gpudata_sync(RESdev); \ gpudata_sync(Adev); \ \ err = MPI_Reduce(A, EXP, count, MPI_##mpitype, MPI_##coloptype, ROOT_RANK, \ MPI_COMM_WORLD); \ ck_assert_msg(err == MPI_SUCCESS, \ "openmpi error: cannot produced expected"); \ \ if (comm_rank == ROOT_RANK) { \ systype res; \ err = gpudata_read(RES, RESdev, 0, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ MAX_ABS_DIFF(RES, EXP, count, res); \ if (!(res <= epsilon)) { \ print(RES, count); \ print(EXP, count); \ ck_abort_msg( \ "gpudata_reduce with %s type and %s op produced max abs err %.1f", \ STR(GA_##gatype), STR(GA_##coloptype), (double)res); \ } \ } \ \ DESTROY_ARRAYS() \ } \ END_TEST #define TEST_REDUCE_FAIL(tname, count, gatype, gaoptype, offsrc, experror) \ START_TEST(test_gpucomm_reduce_fail_##tname) { \ INIT_ARRAYS(SIZE, SIZE) \ err = gpucomm_reduce(Adev, (offsrc), RESdev, 0, (count), gatype, gaoptype, \ ROOT_RANK, comm); \ ck_assert_int_eq(err, (experror)); \ gpudata_sync(RESdev); \ gpudata_sync(Adev); \ DESTROY_ARRAYS() \ } \ END_TEST /** * \note Untested for: half datatype, `same context`, `dest offset`. 
* (because root has different behaviour than non root ranks) */ // Success tests TEST_REDUCE(int, INT, INT, SUM, 0, PRINTVI) TEST_REDUCE(int, INT, INT, PROD, 0, PRINTVI) TEST_REDUCE(int, INT, INT, MAX, 0, PRINTVI) TEST_REDUCE(int, INT, INT, MIN, 0, PRINTVI) TEST_REDUCE(float, FLOAT, FLOAT, SUM, EPS, PRINTVF) TEST_REDUCE(float, FLOAT, FLOAT, PROD, EPS, PRINTVF) TEST_REDUCE(float, FLOAT, FLOAT, MAX, EPS, PRINTVF) TEST_REDUCE(float, FLOAT, FLOAT, MIN, EPS, PRINTVF) TEST_REDUCE(double, DOUBLE, DOUBLE, SUM, EPS, PRINTVF) TEST_REDUCE(double, DOUBLE, DOUBLE, PROD, EPS, PRINTVF) TEST_REDUCE(double, DOUBLE, DOUBLE, MAX, EPS, PRINTVF) TEST_REDUCE(double, DOUBLE, DOUBLE, MIN, EPS, PRINTVF) TEST_REDUCE(long, LONG, LONG, SUM, 0, PRINTVL) TEST_REDUCE(long, LONG, LONG, PROD, 0, PRINTVL) TEST_REDUCE(long, LONG, LONG, MAX, 0, PRINTVL) TEST_REDUCE(long, LONG, LONG, MIN, 0, PRINTVL) TEST_REDUCE(ulong, ULONG, UNSIGNED_LONG, SUM, 0, PRINTVUL) TEST_REDUCE(ulong, ULONG, UNSIGNED_LONG, PROD, 0, PRINTVUL) TEST_REDUCE(ulong, ULONG, UNSIGNED_LONG, MAX, 0, PRINTVUL) TEST_REDUCE(ulong, ULONG, UNSIGNED_LONG, MIN, 0, PRINTVUL) // Failure tests TEST_REDUCE_FAIL(datatype, SIZE / sizeof(int), -1, GA_SUM, 0, GA_INVALID_ERROR) TEST_REDUCE_FAIL(optype, SIZE / sizeof(int), GA_INT, -1, 0, GA_INVALID_ERROR) TEST_REDUCE_FAIL(src_offset, SIZE / sizeof(int), GA_INT, GA_SUM, SIZE - sizeof(int), GA_VALUE_ERROR) TEST_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, GA_XLARGE_ERROR) #define TEST_ALL_REDUCE(systype, gatype, mpitype, coloptype, epsilon, print) \ START_TEST(test_gpucomm_all_reduce_##gatype##_##coloptype) { \ systype* A, * RES, * EXP; \ systype res; \ int i, count; \ INIT_ARRAYS(SIZE, SIZE) \ \ A = (systype*)Av; \ RES = (systype*)RESv; \ EXP = (systype*)EXPv; \ \ count = SIZE / sizeof(systype); \ for (i = 0; i < count; ++i) \ A[i] = comm_rank + 2; \ err = gpudata_write(Adev, 0, A, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ \ err = gpucomm_all_reduce(Adev, 0, RESdev, 0, count, GA_##gatype, \ GA_##coloptype, comm); \ ck_assert_int_eq(err, GA_NO_ERROR); \ gpudata_sync(RESdev); \ gpudata_sync(Adev); \ \ err = MPI_Allreduce(A, EXP, count, MPI_##mpitype, MPI_##coloptype, \ MPI_COMM_WORLD); \ ck_assert_msg(err == MPI_SUCCESS, \ "openmpi error: cannot produced expected"); \ \ err = gpudata_read(RES, RESdev, 0, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ MAX_ABS_DIFF(RES, EXP, count, res); \ if (!(res <= epsilon)) { \ print(RES, count); \ print(EXP, count); \ ck_abort_msg( \ "gpudata_all_reduce with %s type and %s op produced max abs err " \ "%.1f", \ STR(GA_##gatype), STR(GA_##coloptype), (double)res); \ } \ \ DESTROY_ARRAYS() \ } \ END_TEST #define TEST_ALL_REDUCE_FAIL(tname, count, gatype, gaoptype, offsrc, offdest, \ experror) \ START_TEST(test_gpucomm_all_reduce_fail_##tname) { \ INIT_ARRAYS(SIZE, SIZE) \ err = gpucomm_all_reduce(Adev, (offsrc), RESdev, (offdest), (count), \ gatype, gaoptype, comm); \ ck_assert_int_eq(err, (experror)); \ gpudata_sync(RESdev); \ gpudata_sync(Adev); \ DESTROY_ARRAYS() \ } \ END_TEST /** * \note Untested for: half datatype, `same context` */ // Success tests TEST_ALL_REDUCE(int, INT, INT, SUM, 0, PRINTVI) TEST_ALL_REDUCE(int, INT, INT, PROD, 0, PRINTVI) TEST_ALL_REDUCE(int, INT, INT, MAX, 0, PRINTVI) TEST_ALL_REDUCE(int, INT, INT, MIN, 0, PRINTVI) TEST_ALL_REDUCE(float, FLOAT, FLOAT, SUM, EPS, PRINTVF) TEST_ALL_REDUCE(float, FLOAT, FLOAT, PROD, EPS, PRINTVF) TEST_ALL_REDUCE(float, FLOAT, FLOAT, MAX, EPS, PRINTVF) TEST_ALL_REDUCE(float, FLOAT, FLOAT, MIN, EPS, PRINTVF) 
TEST_ALL_REDUCE(double, DOUBLE, DOUBLE, SUM, EPS, PRINTVF) TEST_ALL_REDUCE(double, DOUBLE, DOUBLE, PROD, EPS, PRINTVF) TEST_ALL_REDUCE(double, DOUBLE, DOUBLE, MAX, EPS, PRINTVF) TEST_ALL_REDUCE(double, DOUBLE, DOUBLE, MIN, EPS, PRINTVF) TEST_ALL_REDUCE(long, LONG, LONG, SUM, 0, PRINTVL) TEST_ALL_REDUCE(long, LONG, LONG, PROD, 0, PRINTVL) TEST_ALL_REDUCE(long, LONG, LONG, MAX, 0, PRINTVL) TEST_ALL_REDUCE(long, LONG, LONG, MIN, 0, PRINTVL) TEST_ALL_REDUCE(ulong, ULONG, UNSIGNED_LONG, SUM, 0, PRINTVUL) TEST_ALL_REDUCE(ulong, ULONG, UNSIGNED_LONG, PROD, 0, PRINTVUL) TEST_ALL_REDUCE(ulong, ULONG, UNSIGNED_LONG, MAX, 0, PRINTVUL) TEST_ALL_REDUCE(ulong, ULONG, UNSIGNED_LONG, MIN, 0, PRINTVUL) // Failure tests TEST_ALL_REDUCE_FAIL(datatype, SIZE / sizeof(int), -1, GA_SUM, 0, 0, GA_INVALID_ERROR) TEST_ALL_REDUCE_FAIL(optype, SIZE / sizeof(int), GA_INT, -1, 0, 0, GA_INVALID_ERROR) TEST_ALL_REDUCE_FAIL(src_offset, SIZE / sizeof(int), GA_INT, GA_SUM, SIZE - sizeof(int), 0, GA_VALUE_ERROR) TEST_ALL_REDUCE_FAIL(dest_offset, SIZE / sizeof(int), GA_INT, GA_SUM, 0, SIZE - sizeof(int), GA_VALUE_ERROR) TEST_ALL_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0, GA_XLARGE_ERROR) #define TEST_REDUCE_SCATTER(systype, gatype, mpitype, coloptype, epsilon, \ print) \ START_TEST(test_gpucomm_reduce_scatter_##gatype##_##coloptype) { \ systype* A, * RES, * EXP; \ systype res; \ int i, count; \ int recvcount; \ int* recvcounts; \ INIT_ARRAYS(SIZE, SIZE / comm_ndev) \ \ A = (systype*)Av; \ RES = (systype*)RESv; \ EXP = (systype*)EXPv; \ \ count = SIZE / sizeof(systype); \ for (i = 0; i < count; ++i) \ A[i] = comm_rank + 2; \ err = gpudata_write(Adev, 0, A, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ \ recvcount = count / comm_ndev; \ err = gpucomm_reduce_scatter(Adev, 0, RESdev, 0, recvcount, GA_##gatype, \ GA_##coloptype, comm); \ ck_assert_int_eq(err, GA_NO_ERROR); \ gpudata_sync(RESdev); \ gpudata_sync(Adev); \ \ recvcounts = (int*)malloc(comm_ndev * sizeof(int)); \ if (recvcounts == NULL) \ ck_abort_msg("system memory allocation failed"); \ for (i = 0; i < comm_ndev; ++i) \ recvcounts[i] = recvcount; \ err = MPI_Reduce_scatter(A, EXP, recvcounts, MPI_##mpitype, \ MPI_##coloptype, MPI_COMM_WORLD); \ free(recvcounts); \ ck_assert_msg(err == MPI_SUCCESS, \ "openmpi error: cannot produced expected"); \ \ err = gpudata_read(RES, RESdev, 0, SIZE / comm_ndev); \ ck_assert_int_eq(err, GA_NO_ERROR); \ MAX_ABS_DIFF(RES, EXP, recvcount, res); \ if (!(res <= epsilon)) { \ print(RES, recvcount); \ print(EXP, recvcount); \ ck_abort_msg( \ "gpudata_reduce_scatter with %s type and %s op produced " \ "max abs err %f", \ STR(GA_##gatype), STR(GA_##coloptype), (double)res); \ } \ \ DESTROY_ARRAYS() \ } \ END_TEST #define TEST_REDUCE_SCATTER_FAIL(tname, count, gatype, gaoptype, offsrc, \ offdest, experror) \ START_TEST(test_gpucomm_reduce_scatter_fail_##tname) { \ INIT_ARRAYS(SIZE, SIZE / comm_ndev) \ err = gpucomm_reduce_scatter(Adev, (offsrc), RESdev, (offdest), (count), \ gatype, gaoptype, comm); \ ck_assert_int_eq(err, (experror)); \ gpudata_sync(RESdev); \ gpudata_sync(Adev); \ DESTROY_ARRAYS() \ } \ END_TEST /** * \note Untested for: half datatype, `same context` */ // Success tests TEST_REDUCE_SCATTER(int, INT, INT, SUM, 0, PRINTVI) TEST_REDUCE_SCATTER(int, INT, INT, PROD, 0, PRINTVI) TEST_REDUCE_SCATTER(int, INT, INT, MAX, 0, PRINTVI) TEST_REDUCE_SCATTER(int, INT, INT, MIN, 0, PRINTVI) TEST_REDUCE_SCATTER(float, FLOAT, FLOAT, SUM, EPS, PRINTVF) TEST_REDUCE_SCATTER(float, FLOAT, FLOAT, PROD, EPS, PRINTVF) 
TEST_REDUCE_SCATTER(float, FLOAT, FLOAT, MAX, EPS, PRINTVF) TEST_REDUCE_SCATTER(float, FLOAT, FLOAT, MIN, EPS, PRINTVF) TEST_REDUCE_SCATTER(double, DOUBLE, DOUBLE, SUM, EPS, PRINTVF) TEST_REDUCE_SCATTER(double, DOUBLE, DOUBLE, PROD, EPS, PRINTVF) TEST_REDUCE_SCATTER(double, DOUBLE, DOUBLE, MAX, EPS, PRINTVF) TEST_REDUCE_SCATTER(double, DOUBLE, DOUBLE, MIN, EPS, PRINTVF) TEST_REDUCE_SCATTER(long, LONG, LONG, SUM, 0, PRINTVL) TEST_REDUCE_SCATTER(long, LONG, LONG, PROD, 0, PRINTVL) TEST_REDUCE_SCATTER(long, LONG, LONG, MAX, 0, PRINTVL) TEST_REDUCE_SCATTER(long, LONG, LONG, MIN, 0, PRINTVL) TEST_REDUCE_SCATTER(ulong, ULONG, UNSIGNED_LONG, SUM, 0, PRINTVUL) TEST_REDUCE_SCATTER(ulong, ULONG, UNSIGNED_LONG, PROD, 0, PRINTVUL) TEST_REDUCE_SCATTER(ulong, ULONG, UNSIGNED_LONG, MAX, 0, PRINTVUL) TEST_REDUCE_SCATTER(ulong, ULONG, UNSIGNED_LONG, MIN, 0, PRINTVUL) // Failure tests #define outcount SIZE / sizeof(int) / comm_ndev TEST_REDUCE_SCATTER_FAIL(datatype, outcount, -1, GA_SUM, 0, 0, GA_INVALID_ERROR) TEST_REDUCE_SCATTER_FAIL(optype, outcount, GA_INT, -1, 0, 0, GA_INVALID_ERROR) TEST_REDUCE_SCATTER_FAIL(src_offset, outcount, GA_INT, GA_SUM, SIZE - sizeof(int), 0, GA_VALUE_ERROR) TEST_REDUCE_SCATTER_FAIL(dest_offset, outcount, GA_INT, GA_SUM, 0, SIZE / comm_ndev - sizeof(int), GA_VALUE_ERROR) TEST_REDUCE_SCATTER_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0, GA_XLARGE_ERROR) #define TEST_BROADCAST(systype, gatype, mpitype, epsilon, print) \ START_TEST(test_gpucomm_broadcast_##gatype) { \ systype* RES, * EXP; \ systype res; \ int i, count; \ INIT_ARRAYS(SIZE, SIZE) \ \ RES = (systype*)RESv; \ EXP = (systype*)EXPv; \ \ count = SIZE / sizeof(systype); \ for (i = 0; i < count; ++i) { \ RES[i] = comm_rank + 1; \ EXP[i] = RES[i]; \ } \ err = gpudata_write(RESdev, 0, RES, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ \ err = gpucomm_broadcast(RESdev, 0, count, GA_##gatype, ROOT_RANK, comm); \ ck_assert_int_eq(err, GA_NO_ERROR); \ gpudata_sync(RESdev); \ \ err = MPI_Bcast(EXP, count, MPI_##mpitype, ROOT_RANK, MPI_COMM_WORLD); \ ck_assert_msg(err == MPI_SUCCESS, \ "openmpi error: cannot produced expected"); \ \ err = gpudata_read(RES, RESdev, 0, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ MAX_ABS_DIFF(RES, EXP, count, res); \ if (!(res <= epsilon)) { \ print(RES, count); \ print(EXP, count); \ ck_abort_msg("gpudata_broadcast with %s type produced max abs err %f", \ STR(GA_##gatype), (double)res); \ } \ \ DESTROY_ARRAYS() \ } \ END_TEST #define TEST_BROADCAST_FAIL(tname, count, gatype, offsrc, experror) \ START_TEST(test_gpucomm_broadcast_fail_##tname) { \ INIT_ARRAYS(SIZE, SIZE) \ err = \ gpucomm_broadcast(RESdev, (offsrc), (count), gatype, ROOT_RANK, comm); \ ck_assert_int_eq(err, (experror)); \ gpudata_sync(RESdev); \ DESTROY_ARRAYS() \ } \ END_TEST /** * \note Untested for: half datatype, `same context` */ // Success tests TEST_BROADCAST(int, INT, INT, 0, PRINTVI) TEST_BROADCAST(char, BYTE, BYTE, 0, PRINTVI) TEST_BROADCAST(float, FLOAT, FLOAT, EPS, PRINTVF) TEST_BROADCAST(double, DOUBLE, DOUBLE, EPS, PRINTVF) TEST_BROADCAST(long, LONG, LONG, 0, PRINTVL) TEST_BROADCAST(ulong, ULONG, UNSIGNED_LONG, 0, PRINTVUL) // Failure tests TEST_BROADCAST_FAIL(datatype, SIZE / sizeof(int), -1, 0, GA_INVALID_ERROR) TEST_BROADCAST_FAIL(src_offset, SIZE / sizeof(int), GA_INT, SIZE - sizeof(int), GA_VALUE_ERROR) TEST_BROADCAST_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, 0, GA_XLARGE_ERROR) #define TEST_ALL_GATHER(systype, gatype, mpitype, epsilon, print) \ START_TEST(test_gpucomm_all_gather_##gatype) { \ 
systype* A, * RES, * EXP; \ systype res; \ int i, count, sendcount; \ INIT_ARRAYS(SIZE / comm_ndev, SIZE) \ \ A = (systype*)Av; \ RES = (systype*)RESv; \ EXP = (systype*)EXPv; \ \ count = SIZE / sizeof(systype); \ sendcount = count / comm_ndev; \ for (i = 0; i < sendcount; ++i) \ A[i] = comm_rank + 1; \ err = gpudata_write(Adev, 0, A, SIZE / comm_ndev); \ ck_assert_int_eq(err, GA_NO_ERROR); \ \ err = \ gpucomm_all_gather(Adev, 0, RESdev, 0, sendcount, GA_##gatype, comm); \ ck_assert_int_eq(err, GA_NO_ERROR); \ gpudata_sync(RESdev); \ gpudata_sync(Adev); \ \ err = MPI_Allgather(A, sendcount, MPI_##mpitype, EXP, sendcount, \ MPI_##mpitype, MPI_COMM_WORLD); \ ck_assert_msg(err == MPI_SUCCESS, \ "openmpi error: cannot produced expected"); \ \ err = gpudata_read(RES, RESdev, 0, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ MAX_ABS_DIFF(RES, EXP, count, res); \ if (!(res <= epsilon)) { \ print(RES, count); \ print(EXP, count); \ ck_abort_msg("gpudata_all_gather with %s type produced max abs err %f", \ STR(GA_##gatype), (double)res); \ } \ \ DESTROY_ARRAYS() \ } \ END_TEST #define TEST_ALL_GATHER_FAIL(tname, count, gatype, offsrc, offdest, experror) \ START_TEST(test_gpucomm_all_gather_fail_##tname) { \ INIT_ARRAYS(SIZE / comm_ndev, SIZE) \ err = gpucomm_all_gather(Adev, (offsrc), RESdev, (offdest), (count), \ gatype, comm); \ ck_assert_int_eq(err, (experror)); \ gpudata_sync(RESdev); \ gpudata_sync(Adev); \ DESTROY_ARRAYS() \ } \ END_TEST /** * \note Untested for: half datatype, `same context` */ // Success tests TEST_ALL_GATHER(int, INT, INT, 0, PRINTVI) TEST_ALL_GATHER(char, BYTE, BYTE, 0, PRINTVI) TEST_ALL_GATHER(float, FLOAT, FLOAT, EPS, PRINTVF) TEST_ALL_GATHER(double, DOUBLE, DOUBLE, EPS, PRINTVF) TEST_ALL_GATHER(long, LONG, LONG, 0, PRINTVL) TEST_ALL_GATHER(ulong, ULONG, UNSIGNED_LONG, 0, PRINTVUL) // Failure tests #define incount SIZE / sizeof(int) / comm_ndev TEST_ALL_GATHER_FAIL(datatype, incount, -1, 0, 0, GA_INVALID_ERROR) TEST_ALL_GATHER_FAIL(src_offset, incount, GA_INT, SIZE / comm_ndev - sizeof(int), 0, GA_VALUE_ERROR) TEST_ALL_GATHER_FAIL(dest_offset, incount, GA_INT, 0, SIZE - sizeof(int), GA_VALUE_ERROR) TEST_ALL_GATHER_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, 0, 0, GA_XLARGE_ERROR) Suite* get_suite(void) { Suite* s; TCase* helps; TCase* reds; TCase* redf; TCase* areds; TCase* aredf; TCase* redscs; TCase* redscf; TCase* bcasts; TCase* bcastf; TCase* agats; TCase* agatf; s = suite_create("buffer_collectives_API"); helps = tcase_create("test_helpers"); tcase_add_unchecked_fixture(helps, setup_comm, teardown_comm); tcase_add_test(helps, test_gpucomm_get_count); tcase_add_test(helps, test_gpucomm_get_rank); reds = tcase_create("test_reduce"); tcase_add_unchecked_fixture(reds, setup_comm, teardown_comm); tcase_add_test(reds, test_gpucomm_reduce_INT_SUM); tcase_add_test(reds, test_gpucomm_reduce_INT_PROD); tcase_add_test(reds, test_gpucomm_reduce_INT_MAX); tcase_add_test(reds, test_gpucomm_reduce_INT_MIN); tcase_add_test(reds, test_gpucomm_reduce_FLOAT_SUM); tcase_add_test(reds, test_gpucomm_reduce_FLOAT_PROD); tcase_add_test(reds, test_gpucomm_reduce_FLOAT_MAX); tcase_add_test(reds, test_gpucomm_reduce_FLOAT_MIN); tcase_add_test(reds, test_gpucomm_reduce_DOUBLE_SUM); tcase_add_test(reds, test_gpucomm_reduce_DOUBLE_PROD); tcase_add_test(reds, test_gpucomm_reduce_DOUBLE_MAX); tcase_add_test(reds, test_gpucomm_reduce_DOUBLE_MIN); tcase_add_test(reds, test_gpucomm_reduce_LONG_SUM); tcase_add_test(reds, test_gpucomm_reduce_LONG_PROD); tcase_add_test(reds, 
test_gpucomm_reduce_LONG_MAX); tcase_add_test(reds, test_gpucomm_reduce_LONG_MIN); tcase_add_test(reds, test_gpucomm_reduce_ULONG_SUM); tcase_add_test(reds, test_gpucomm_reduce_ULONG_PROD); tcase_add_test(reds, test_gpucomm_reduce_ULONG_MAX); tcase_add_test(reds, test_gpucomm_reduce_ULONG_MIN); redf = tcase_create("test_reduce_fail"); tcase_add_unchecked_fixture(redf, setup_comm, teardown_comm); tcase_add_test(redf, test_gpucomm_reduce_fail_datatype); tcase_add_test(redf, test_gpucomm_reduce_fail_optype); tcase_add_test(redf, test_gpucomm_reduce_fail_src_offset); tcase_add_test(redf, test_gpucomm_reduce_fail_elemcount); areds = tcase_create("test_all_reduce"); tcase_add_unchecked_fixture(areds, setup_comm, teardown_comm); tcase_add_test(areds, test_gpucomm_all_reduce_INT_SUM); tcase_add_test(areds, test_gpucomm_all_reduce_INT_PROD); tcase_add_test(areds, test_gpucomm_all_reduce_INT_MAX); tcase_add_test(areds, test_gpucomm_all_reduce_INT_MIN); tcase_add_test(areds, test_gpucomm_all_reduce_FLOAT_SUM); tcase_add_test(areds, test_gpucomm_all_reduce_FLOAT_PROD); tcase_add_test(areds, test_gpucomm_all_reduce_FLOAT_MAX); tcase_add_test(areds, test_gpucomm_all_reduce_FLOAT_MIN); tcase_add_test(areds, test_gpucomm_all_reduce_DOUBLE_SUM); tcase_add_test(areds, test_gpucomm_all_reduce_DOUBLE_PROD); tcase_add_test(areds, test_gpucomm_all_reduce_DOUBLE_MAX); tcase_add_test(areds, test_gpucomm_all_reduce_DOUBLE_MIN); tcase_add_test(areds, test_gpucomm_all_reduce_LONG_SUM); tcase_add_test(areds, test_gpucomm_all_reduce_LONG_PROD); tcase_add_test(areds, test_gpucomm_all_reduce_LONG_MAX); tcase_add_test(areds, test_gpucomm_all_reduce_LONG_MIN); tcase_add_test(areds, test_gpucomm_all_reduce_ULONG_SUM); tcase_add_test(areds, test_gpucomm_all_reduce_ULONG_PROD); tcase_add_test(areds, test_gpucomm_all_reduce_ULONG_MAX); tcase_add_test(areds, test_gpucomm_all_reduce_ULONG_MIN); aredf = tcase_create("test_all_reduce_fail"); tcase_add_unchecked_fixture(aredf, setup_comm, teardown_comm); tcase_add_test(aredf, test_gpucomm_all_reduce_fail_datatype); tcase_add_test(aredf, test_gpucomm_all_reduce_fail_optype); tcase_add_test(aredf, test_gpucomm_all_reduce_fail_src_offset); tcase_add_test(aredf, test_gpucomm_all_reduce_fail_dest_offset); tcase_add_test(aredf, test_gpucomm_all_reduce_fail_elemcount); redscs = tcase_create("test_reduce_scatter"); tcase_add_unchecked_fixture(redscs, setup_comm, teardown_comm); tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_SUM); tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_PROD); tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_MAX); tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_MIN); tcase_add_test(redscs, test_gpucomm_reduce_scatter_FLOAT_SUM); tcase_add_test(redscs, test_gpucomm_reduce_scatter_FLOAT_PROD); tcase_add_test(redscs, test_gpucomm_reduce_scatter_FLOAT_MAX); tcase_add_test(redscs, test_gpucomm_reduce_scatter_FLOAT_MIN); tcase_add_test(redscs, test_gpucomm_reduce_scatter_DOUBLE_SUM); tcase_add_test(redscs, test_gpucomm_reduce_scatter_DOUBLE_PROD); tcase_add_test(redscs, test_gpucomm_reduce_scatter_DOUBLE_MAX); tcase_add_test(redscs, test_gpucomm_reduce_scatter_DOUBLE_MIN); tcase_add_test(redscs, test_gpucomm_reduce_scatter_LONG_SUM); tcase_add_test(redscs, test_gpucomm_reduce_scatter_LONG_PROD); tcase_add_test(redscs, test_gpucomm_reduce_scatter_LONG_MAX); tcase_add_test(redscs, test_gpucomm_reduce_scatter_LONG_MIN); tcase_add_test(redscs, test_gpucomm_reduce_scatter_ULONG_SUM); tcase_add_test(redscs, test_gpucomm_reduce_scatter_ULONG_PROD); 
tcase_add_test(redscs, test_gpucomm_reduce_scatter_ULONG_MAX); tcase_add_test(redscs, test_gpucomm_reduce_scatter_ULONG_MIN); redscf = tcase_create("test_reduce_scatter_fail"); tcase_add_unchecked_fixture(redscf, setup_comm, teardown_comm); tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_datatype); tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_optype); tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_src_offset); tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_dest_offset); tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_elemcount); bcasts = tcase_create("test_broadcast"); tcase_add_unchecked_fixture(bcasts, setup_comm, teardown_comm); tcase_add_test(bcasts, test_gpucomm_broadcast_INT); tcase_add_test(bcasts, test_gpucomm_broadcast_BYTE); tcase_add_test(bcasts, test_gpucomm_broadcast_FLOAT); tcase_add_test(bcasts, test_gpucomm_broadcast_DOUBLE); tcase_add_test(bcasts, test_gpucomm_broadcast_LONG); tcase_add_test(bcasts, test_gpucomm_broadcast_ULONG); bcastf = tcase_create("test_broadcast_fail"); tcase_add_unchecked_fixture(bcastf, setup_comm, teardown_comm); tcase_add_test(bcastf, test_gpucomm_broadcast_fail_datatype); tcase_add_test(bcastf, test_gpucomm_broadcast_fail_src_offset); tcase_add_test(bcastf, test_gpucomm_broadcast_fail_elemcount); agats = tcase_create("test_all_gather"); tcase_add_unchecked_fixture(agats, setup_comm, teardown_comm); tcase_add_test(agats, test_gpucomm_all_gather_INT); tcase_add_test(agats, test_gpucomm_all_gather_BYTE); tcase_add_test(agats, test_gpucomm_all_gather_FLOAT); tcase_add_test(agats, test_gpucomm_all_gather_DOUBLE); tcase_add_test(agats, test_gpucomm_all_gather_LONG); tcase_add_test(agats, test_gpucomm_all_gather_ULONG); agatf = tcase_create("test_all_gather_fail"); tcase_add_unchecked_fixture(agatf, setup_comm, teardown_comm); tcase_add_test(agatf, test_gpucomm_all_gather_fail_datatype); tcase_add_test(agatf, test_gpucomm_all_gather_fail_src_offset); tcase_add_test(agatf, test_gpucomm_all_gather_fail_dest_offset); tcase_add_test(agatf, test_gpucomm_all_gather_fail_elemcount); suite_add_tcase(s, helps); suite_add_tcase(s, reds); suite_add_tcase(s, redf); suite_add_tcase(s, areds); suite_add_tcase(s, aredf); suite_add_tcase(s, redscs); suite_add_tcase(s, redscf); suite_add_tcase(s, bcasts); suite_add_tcase(s, bcastf); suite_add_tcase(s, agats); suite_add_tcase(s, agatf); return s; } libgpuarray-0.7.6/tests/check_collectives.c000066400000000000000000000226451326743622600210500ustar00rootroot00000000000000#include #include #include #include #include #include "gpuarray/array.h" #include "gpuarray/buffer.h" #include "gpuarray/buffer_collectives.h" #include "gpuarray/collectives.h" #include "gpuarray/error.h" #include "gpuarray/types.h" #define ROOT_RANK 0 #define ND 2 #define ROWS 32 #define COLS 16 extern gpucontext* ctx; extern gpucomm* comm; extern int comm_ndev; extern int comm_rank; extern void setup_comm(void); extern void teardown_comm(void); #define STR(x) _STR(x) #define _STR(x) #x #define COUNT_ERRORS(A, B, M, N, res) \ do { \ int loci, locj; \ res = 0; \ for (loci = 0; loci < (M); ++loci) { \ for (locj = 0; locj < (N); ++locj) { \ if ((A)[loci][locj] != (B)[loci][locj]) \ res++; \ } \ } \ } while (0) /******************************************************************************* * Test array functions for collective operations * *******************************************************************************/ #define INIT_ARRAYS(inrows, incols, outrows, outcols) \ int(*A)[(incols)]; \ 
int(*RES)[(outcols)]; \ int(*EXP)[(outcols)]; \ size_t indims[ND]; \ size_t outdims[ND]; \ const ssize_t outstrds[ND] = {sizeof(*RES), sizeof(int)}; \ int err; \ size_t i, j, outsize; \ GpuArray Adev; \ GpuArray RESdev; \ \ A = (int(*)[(incols)])calloc((inrows), sizeof(*A)); \ if (A == NULL) \ ck_abort_msg("system memory allocation failed"); \ RES = (int(*)[(outcols)])calloc((outrows), sizeof(*RES)); \ if (RES == NULL) \ ck_abort_msg("system memory allocation failed"); \ EXP = (int(*)[(outcols)])calloc((outrows), sizeof(*EXP)); \ if (EXP == NULL) \ ck_abort_msg("system memory allocation failed"); \ indims[0] = (inrows); \ indims[1] = (incols); \ outdims[0] = (outrows); \ outdims[1] = (outcols); \ outsize = outdims[0] * outstrds[0]; \ \ for (i = 0; i < indims[0]; ++i) \ for (j = 0; j < indims[1]; ++j) \ A[i][j] = comm_rank + 2; \ \ err = GpuArray_empty(&Adev, ctx, GA_INT, ND, indims, GA_C_ORDER); \ ck_assert_int_eq(err, GA_NO_ERROR); \ err = GpuArray_write(&Adev, A, sizeof(*A) * inrows); \ ck_assert_int_eq(err, GA_NO_ERROR); \ err = GpuArray_empty(&RESdev, ctx, GA_INT, ND, outdims, GA_C_ORDER); \ ck_assert_int_eq(err, GA_NO_ERROR); #define DESTROY_ARRAYS() \ GpuArray_clear(&RESdev); \ GpuArray_clear(&Adev); \ free(A); \ free(RES); \ free(EXP); /** * \note Untested for `not proper element count` , `not agreeing typecode`, `not * aligned`. */ START_TEST(test_GpuArray_reduce) { int res; INIT_ARRAYS(ROWS, COLS, ROWS, COLS); if (comm_rank == ROOT_RANK) { err = GpuArray_reduce(&Adev, &RESdev, GA_SUM, ROOT_RANK, comm); ck_assert_int_eq(err, GA_NO_ERROR); GpuArray_sync(&RESdev); GpuArray_sync(&Adev); } else { err = GpuArray_reduce_from(&Adev, GA_SUM, ROOT_RANK, comm); ck_assert_int_eq(err, GA_NO_ERROR); GpuArray_sync(&Adev); } err = MPI_Reduce(A, EXP, ROWS * COLS, MPI_INT, MPI_SUM, ROOT_RANK, MPI_COMM_WORLD); ck_assert_msg(err == MPI_SUCCESS, "openmpi error: cannot produced expected"); if (comm_rank == ROOT_RANK) { err = GpuArray_read(RES, outsize, &RESdev); ck_assert_int_eq(err, GA_NO_ERROR); COUNT_ERRORS(RES, EXP, ROWS, COLS, res); ck_assert_msg(res == 0, "GpuArray_reduce with %s op produced errors in %d places", STR(GA_SUM), res); } DESTROY_ARRAYS(); } END_TEST /** * \note Untested for: `not proper element count` , `not agreeing typecode`, * `not * aligned`. */ START_TEST(test_GpuArray_all_reduce) { int res; INIT_ARRAYS(ROWS, COLS, ROWS, COLS); err = GpuArray_all_reduce(&Adev, &RESdev, GA_SUM, comm); ck_assert_int_eq(err, GA_NO_ERROR); GpuArray_sync(&RESdev); GpuArray_sync(&Adev); err = MPI_Allreduce(A, EXP, ROWS * COLS, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ck_assert_msg(err == MPI_SUCCESS, "openmpi error: cannot produced expected"); err = GpuArray_read(RES, outsize, &RESdev); ck_assert_int_eq(err, GA_NO_ERROR); COUNT_ERRORS(RES, EXP, ROWS, COLS, res); ck_assert_msg(res == 0, "GpuArray_all_reduce with %s op produced errors in %d places", STR(GA_SUM), res); DESTROY_ARRAYS(); } END_TEST /** * \note Untested for `not proper element count` , `not agreeing typecode`, `not * aligned`. */ START_TEST(test_GpuArray_reduce_scatter) { int res; int* recvcounts; // In order for C contiguous arrays to be combined/split successfully they // should // split along the smallest axis (the one with the bigger stride). 
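  // Each rank therefore receives one contiguous (ROWS / comm_ndev, COLS)
  // slab of the row-major result.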
INIT_ARRAYS(ROWS, COLS, ROWS / comm_ndev, COLS); err = GpuArray_reduce_scatter(&Adev, &RESdev, GA_SUM, comm); ck_assert_int_eq(err, GA_NO_ERROR); GpuArray_sync(&RESdev); GpuArray_sync(&Adev); recvcounts = (int*)malloc(comm_ndev * sizeof(int)); if (recvcounts == NULL) ck_abort_msg("system memory allocation failed"); for (i = 0; i < (size_t)comm_ndev; ++i) recvcounts[i] = ROWS * COLS / comm_ndev; err = MPI_Reduce_scatter(A, EXP, recvcounts, MPI_INT, MPI_SUM, MPI_COMM_WORLD); free(recvcounts); ck_assert_msg(err == MPI_SUCCESS, "openmpi error: cannot produced expected"); err = GpuArray_read(RES, outsize, &RESdev); ck_assert_int_eq(err, GA_NO_ERROR); COUNT_ERRORS(RES, EXP, ROWS / comm_ndev, COLS, res); ck_assert_msg( res == 0, "GpuArray_reduce_scatter with %s op produced errors in %d places", STR(GA_SUM), res); DESTROY_ARRAYS(); } END_TEST /** * \note Untested for `not aligned`. */ START_TEST(test_GpuArray_broadcast) { int res; INIT_ARRAYS(ROWS, COLS, ROWS, COLS); for (i = 0; i < indims[0]; ++i) for (j = 0; j < indims[1]; ++j) EXP[i][j] = A[i][j]; err = GpuArray_broadcast(&Adev, ROOT_RANK, comm); ck_assert_int_eq(err, GA_NO_ERROR); GpuArray_sync(&Adev); err = MPI_Bcast(EXP, ROWS * COLS, MPI_INT, ROOT_RANK, MPI_COMM_WORLD); ck_assert_msg(err == MPI_SUCCESS, "openmpi error: cannot produced expected"); err = GpuArray_read(RES, outsize, &Adev); ck_assert_int_eq(err, GA_NO_ERROR); COUNT_ERRORS(RES, EXP, ROWS, COLS, res); ck_assert_msg(res == 0, "GpuArray_broadcast produced errors in %d places", res); DESTROY_ARRAYS(); } END_TEST /** * \note Untested for `not proper element count` , `not agreeing typecode`, `not * aligned`. */ START_TEST(test_GpuArray_all_gather) { int res; // In order for C contiguous arrays to be combined/split successfully they // should // split along the smallest axis (the one with the bigger stride). 
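  // Each rank contributes a contiguous (ROWS / comm_ndev, COLS) slab, so
  // gathering along the first axis yields a C-contiguous (ROWS, COLS) result.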
INIT_ARRAYS(ROWS / comm_ndev, COLS, ROWS, COLS); err = GpuArray_all_gather(&Adev, &RESdev, comm); ck_assert_int_eq(err, GA_NO_ERROR); GpuArray_sync(&RESdev); GpuArray_sync(&Adev); err = MPI_Allgather(A, ROWS * COLS / comm_ndev, MPI_INT, EXP, ROWS * COLS / comm_ndev, MPI_INT, MPI_COMM_WORLD); ck_assert_msg(err == MPI_SUCCESS, "openmpi error: cannot produced expected"); err = GpuArray_read(RES, outsize, &RESdev); ck_assert_int_eq(err, GA_NO_ERROR); COUNT_ERRORS(RES, EXP, ROWS, COLS, res); ck_assert_msg(res == 0, "GpuArray_all_gather produced errors in %d places", res); DESTROY_ARRAYS(); } END_TEST Suite* get_suite(void) { Suite* s = suite_create("collectives"); TCase* tc = tcase_create("API"); tcase_add_checked_fixture(tc, setup_comm, teardown_comm); tcase_add_test(tc, test_GpuArray_reduce); tcase_add_test(tc, test_GpuArray_all_reduce); tcase_add_test(tc, test_GpuArray_reduce_scatter); tcase_add_test(tc, test_GpuArray_broadcast); tcase_add_test(tc, test_GpuArray_all_gather); suite_add_tcase(s, tc); return s; } libgpuarray-0.7.6/tests/check_elemwise.c000066400000000000000000000502671326743622600203470ustar00rootroot00000000000000#include #include "gpuarray/array.h" #include "gpuarray/buffer.h" #include "gpuarray/elemwise.h" #include "gpuarray/error.h" #include "gpuarray/types.h" #if CHECK_MINOR_VERSION < 11 #ifndef CK_FLOATING_DIG # define CK_FLOATING_DIG 6 #endif /* CK_FLOATING_DIG */ #define _ck_assert_floating(X, OP, Y, TP, TM) do { \ TP _ck_x = (X); \ TP _ck_y = (Y); \ ck_assert_msg(_ck_x OP _ck_y, \ "Assertion '%s' failed: %s == %.*"TM"g, %s == %.*"TM"g", \ #X" "#OP" "#Y, \ #X, (int)CK_FLOATING_DIG, _ck_x, \ #Y, (int)CK_FLOATING_DIG, _ck_y); \ } while (0) #define ck_assert_float_eq(X, Y) _ck_assert_floating(X, ==, Y, float, "") #endif extern void *ctx; void setup(void); void teardown(void); #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) /* float 16 table (0 through 10) */ static const uint16_t F16[10] = {0x0000, 0x3c00, 0x4000, 0x4200, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800, 0x4880}; START_TEST(test_contig_simple) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; static const uint32_t data1[3] = {1, 2, 3}; static const uint32_t data2[3] = {4, 5, 6}; uint32_t data3[3] = {0}; size_t dims[1]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; dims[0] = 3; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 1, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 1, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 1, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_UINT; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_UINT; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 1, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], 5); ck_assert_int_eq(data3[1], 7); ck_assert_int_eq(data3[2], 9); } END_TEST START_TEST(test_contig_f16) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; static uint16_t data1[3]; static uint16_t data2[3]; uint16_t data3[3] = {0}; size_t dims[1]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; data1[0] = F16[1]; data1[1] = F16[2]; data1[2] = F16[3]; data2[0] = F16[4]; data2[1] = F16[5]; data2[2] = F16[6]; dims[0] = 3; 
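  /* GE_CONVERT_F16 below is expected to let the kernel operate on the half
     inputs via float; the sums 1+4, 2+5 and 3+6 are exactly representable in
     half precision, so the outputs should read back as F16[5], F16[7] and
     F16[9]. */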
ga_assert_ok(GpuArray_empty(&a, ctx, GA_HALF, 1, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); ga_assert_ok(GpuArray_empty(&b, ctx, GA_HALF, 1, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); ga_assert_ok(GpuArray_empty(&c, ctx, GA_HALF, 1, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_HALF; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_HALF; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_HALF; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 1, GE_CONVERT_F16); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], F16[5]); ck_assert_int_eq(data3[1], F16[7]); ck_assert_int_eq(data3[2], F16[9]); } END_TEST START_TEST(test_contig_0) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; size_t dims[1]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; dims[0] = 0; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 1, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 1, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 1, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_UINT; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_UINT; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 1, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE)); } END_TEST START_TEST(test_basic_simple) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; static const uint32_t data1[3] = {1, 2, 3}; static const uint32_t data2[3] = {4, 5, 6}; uint32_t data3[3] = {0}; size_t dims[2]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; dims[0] = 1; dims[1] = 3; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 2, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_UINT; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_UINT; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 2, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], 5); ck_assert_int_eq(data3[1], 7); ck_assert_int_eq(data3[2], 9); } END_TEST START_TEST(test_basic_f16) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; static uint16_t data1[3]; static uint16_t data2[3]; uint16_t data3[3] = {0}; size_t dims[2]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; data1[0] = F16[1]; data1[1] = F16[2]; data1[2] = F16[3]; data2[0] = F16[4]; data2[1] = F16[5]; data2[2] = F16[6]; dims[0] = 1; dims[1] = 3; ga_assert_ok(GpuArray_empty(&a, ctx, GA_HALF, 2, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); ga_assert_ok(GpuArray_empty(&b, ctx, GA_HALF, 2, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); ga_assert_ok(GpuArray_empty(&c, ctx, GA_HALF, 2, dims, 
GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_HALF; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_HALF; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_HALF; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 2, GE_CONVERT_F16); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], F16[5]); ck_assert_int_eq(data3[1], F16[7]); ck_assert_int_eq(data3[2], F16[9]); } END_TEST START_TEST(test_basic_offset) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; static const uint32_t data1[3] = {1, 2, 3}; static const uint32_t data2[3] = {4, 5, 6}; uint32_t data3[3] = {0}; size_t dims[2]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; dims[0] = 1; dims[1] = 6; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 2, dims, GA_C_ORDER)); /* Simulate indexing */ a.offset = 12; a.dimensions[1] = 3; GpuArray_fix_flags(&a); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); dims[1] = 3; ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_UINT; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_UINT; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 2, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], 5); ck_assert_int_eq(data3[1], 7); ck_assert_int_eq(data3[2], 9); } END_TEST START_TEST(test_basic_scalar) { GpuArray a; GpuArray b; GpuArray c; uint32_t x = 2; GpuElemwise *ge; static const uint32_t data1[3] = {1, 2, 3}; static const uint32_t data2[2] = {4, 5}; uint32_t data3[6] = {0}; size_t dims[2]; gpuelemwise_arg args[4] = {{0}}; void *rargs[4]; dims[0] = 1; dims[1] = 3; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 2, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); dims[0] = 2; dims[1] = 1; ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); dims[1] = 3; ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "x"; args[1].typecode = GA_UINT; args[1].flags = GE_SCALAR; args[2].name = "b"; args[2].typecode = GA_UINT; args[2].flags = GE_READ; args[3].name = "c"; args[3].typecode = GA_UINT; args[3].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + x * b", 4, args, 2, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &x; rargs[2] = &b; rargs[3] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, GE_BROADCAST)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], 9); ck_assert_int_eq(data3[1], 10); ck_assert_int_eq(data3[2], 11); ck_assert_int_eq(data3[3], 11); ck_assert_int_eq(data3[4], 12); ck_assert_int_eq(data3[5], 13); } END_TEST START_TEST(test_basic_scalar_dtype) { GpuArray x; GpuArray y; float a = 1.1f; GpuElemwise *ge; static const int32_t data1[4] = {0, 1, 2, 3}; static const float data2[4] = {2.0, 2.0, 2.0, 2.0}; float data3[4]; size_t dims[2] = {2, 2}; gpuelemwise_arg 
args[3] = {{0}}; void *rargs[3]; ga_assert_ok(GpuArray_empty(&x, ctx, GA_INT, 2, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&x, data1, sizeof(data1))); ga_assert_ok(GpuArray_empty(&y, ctx, GA_FLOAT, 2, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_write(&y, data2, sizeof(data2))); args[0].name = "a"; args[0].typecode = GA_FLOAT; args[0].flags = GE_SCALAR; args[1].name = "x"; args[1].typecode = GA_INT; args[1].flags = GE_READ; args[2].name = "y"; args[2].typecode = GA_FLOAT; args[2].flags = GE_READ|GE_WRITE; ge = GpuElemwise_new(ctx, "", "y = a * x + y", 3, args, 2, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &x; rargs[2] = &y; ga_assert_ok(GpuElemwise_call(ge, rargs, 0)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &y)); ck_assert_float_eq(data3[0], 2.0f); ck_assert_float_eq(data3[1], 4.2f); ck_assert_float_eq(data3[2], 3.1f); ck_assert_float_eq(data3[3], 5.3f); } END_TEST START_TEST(test_basic_remove1) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; static const uint32_t data1[6] = {1, 2, 3, 4, 5, 6}; static const uint32_t data2[6] = {7, 8, 9, 10, 11, 12}; uint32_t data3[6] = {0}; size_t dims[4]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; dims[0] = 1; dims[1] = 3; dims[2] = 2; dims[3] = 1; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 4, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 4, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 4, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_UINT; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_UINT; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 0, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, 0)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], 8); ck_assert_int_eq(data3[1], 12); ck_assert_int_eq(data3[2], 11); ck_assert_int_eq(data3[3], 15); ck_assert_int_eq(data3[4], 14); ck_assert_int_eq(data3[5], 18); } END_TEST START_TEST(test_basic_broadcast) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; static const uint32_t data1[3] = {1, 2, 3}; static const uint32_t data2[2] = {4, 5}; uint32_t data3[6] = {0}; size_t dims[2]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; dims[0] = 1; dims[1] = 3; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 2, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); dims[0] = 2; dims[1] = 1; ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); dims[0] = 2; dims[1] = 3; ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_UINT; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_UINT; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 2, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ck_assert_int_eq(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE), GA_VALUE_ERROR); ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE | GE_BROADCAST)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], 5); ck_assert_int_eq(data3[1], 6); ck_assert_int_eq(data3[2], 7); ck_assert_int_eq(data3[3], 
6); ck_assert_int_eq(data3[4], 7); ck_assert_int_eq(data3[5], 8); } END_TEST START_TEST(test_basic_padshape) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; static const uint32_t data1[3] = {1, 2, 3}; static const uint32_t data2[2] = {4, 5}; uint32_t data3[6] = {0}; size_t dims[2]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; dims[0] = 3; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 1, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); dims[0] = 2; dims[1] = 1; ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); dims[0] = 2; dims[1] = 3; ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_UINT; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_UINT; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 2, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ck_assert_int_eq(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE), GA_VALUE_ERROR); ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE | GE_BROADCAST | GE_PADSHAPE)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], 5); ck_assert_int_eq(data3[1], 6); ck_assert_int_eq(data3[2], 7); ck_assert_int_eq(data3[3], 6); ck_assert_int_eq(data3[4], 7); ck_assert_int_eq(data3[5], 8); } END_TEST START_TEST(test_basic_collapse) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; static const uint32_t data1[6] = {1, 2, 3, 4, 5, 6}; static const uint32_t data2[6] = {7, 8, 9, 10, 11, 12}; uint32_t data3[6] = {0}; size_t dims[2]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; dims[0] = 2; dims[1] = 3; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 2, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_UINT; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_UINT; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 2, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, 0)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], 8); ck_assert_int_eq(data3[1], 10); ck_assert_int_eq(data3[2], 12); ck_assert_int_eq(data3[3], 14); ck_assert_int_eq(data3[4], 16); ck_assert_int_eq(data3[5], 18); } END_TEST START_TEST(test_basic_neg_strides) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; static const uint32_t data1[6] = {1, 2, 3, 4, 5, 6}; static const uint32_t data2[6] = {7, 8, 9, 10, 11, 12}; uint32_t data3[6] = {0}; size_t dims[1]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; ssize_t starts[1]; ssize_t stops[1]; ssize_t steps[1]; dims[0] = 6; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 1, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 1, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); starts[0] = 5; stops[0] = -1; steps[0] = -1; ga_assert_ok(GpuArray_index_inplace(&b, starts, stops, steps)); ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 1, dims, 
GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_UINT; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_UINT; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 1, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, 0)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); ck_assert_int_eq(data3[0], 13); ck_assert_int_eq(data3[1], 13); ck_assert_int_eq(data3[2], 13); ck_assert_int_eq(data3[3], 13); ck_assert_int_eq(data3[4], 13); ck_assert_int_eq(data3[5], 13); } END_TEST START_TEST(test_basic_0) { GpuArray a; GpuArray b; GpuArray c; GpuElemwise *ge; size_t dims[2]; gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; dims[0] = 0; dims[1] = 3; ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 2, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); args[0].name = "a"; args[0].typecode = GA_UINT; args[0].flags = GE_READ; args[1].name = "b"; args[1].typecode = GA_UINT; args[1].flags = GE_READ; args[2].name = "c"; args[2].typecode = GA_UINT; args[2].flags = GE_WRITE; ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 2, 0); ck_assert_ptr_ne(ge, NULL); rargs[0] = &a; rargs[1] = &b; rargs[2] = &c; ga_assert_ok(GpuElemwise_call(ge, rargs, 0)); } END_TEST Suite *get_suite(void) { Suite *s = suite_create("elemwise"); TCase *tc = tcase_create("contig"); tcase_set_timeout(tc, 8.0); tcase_add_checked_fixture(tc, setup, teardown); tcase_add_test(tc, test_contig_simple); tcase_add_test(tc, test_contig_f16); tcase_add_test(tc, test_contig_0); suite_add_tcase(s, tc); tc = tcase_create("basic"); tcase_set_timeout(tc, 8.0); tcase_add_checked_fixture(tc, setup, teardown); tcase_add_test(tc, test_basic_simple); tcase_add_test(tc, test_basic_f16); tcase_add_test(tc, test_basic_scalar); tcase_add_test(tc, test_basic_scalar_dtype); tcase_add_test(tc, test_basic_offset); tcase_add_test(tc, test_basic_remove1); tcase_add_test(tc, test_basic_broadcast); tcase_add_test(tc, test_basic_padshape); tcase_add_test(tc, test_basic_collapse); tcase_add_test(tc, test_basic_neg_strides); tcase_add_test(tc, test_basic_0); suite_add_tcase(s, tc); return s; } libgpuarray-0.7.6/tests/check_error.c000066400000000000000000000010041326743622600176470ustar00rootroot00000000000000#include #include "gpuarray/error.h" START_TEST(test_error_str) { const char *msg; msg = gpuarray_error_str(-1); ck_assert_str_eq(msg, "Unknown GA error"); msg = gpuarray_error_str(99); ck_assert_str_eq(msg, "Unknown GA error"); msg = gpuarray_error_str(GA_NO_ERROR); ck_assert_str_eq(msg, "No error"); } END_TEST Suite *get_suite(void) { Suite *s = suite_create("error"); TCase *tc = tcase_create("All"); tcase_add_test(tc, test_error_str); suite_add_tcase(s, tc); return s; } libgpuarray-0.7.6/tests/check_reduction.c000066400000000000000000000252561326743622600205310ustar00rootroot00000000000000#include #include #include #include #include #include #include #include extern void *ctx; void setup(void); void teardown(void); /* Defines */ #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) /** * PRNG based on PCG XSH RR 64/32 (LCG) * * Used to generate random data for the kernel tests. 
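 *
 * pcgRand01() concatenates two successive 32-bit outputs into a 64-bit
 * integer and divides by 2^64 to obtain a uniform double in [0, 1).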
*/ /* Forward Declarations */ static uint32_t pcgRor32 (uint32_t x, uint32_t n); static void pcgSeed (uint64_t seed); static uint32_t pcgRand (void); static double pcgRand01(void); /* Definitions */ static uint64_t pcgS = 1;/* State */ static const uint64_t pcgM = 6364136223846793005;/* Multiplier */ static const uint64_t pcgA = 1442695040888963407;/* Addend */ static uint32_t pcgRor32 (uint32_t x, uint32_t n){ return (n &= 0x1F) ? x>>n | x<<(32-n) : x; } static void pcgSeed (uint64_t seed){ pcgS = seed; } static uint32_t pcgRand (void){ pcgS = pcgS*pcgM + pcgA; /** * PCG does something akin to an unbalanced Feistel round to blind the LCG * state: * * The rightmost 59 bits are involved in an xorshift by 18. * The leftmost 5 bits select a rotation of the 32 bits 58:27. */ return pcgRor32((pcgS^(pcgS>>18))>>27, pcgS>>59); } static double pcgRand01(void){ uint64_t u = pcgRand(), l = pcgRand(); uint64_t x = u<<32 | l; return x /18446744073709551616.0; } /** * Test cases. */ START_TEST(test_reduction){ /** * We test here a reduction of some random 3D tensor on the first and * third dimensions. */ GpuArray gaSrc; GpuArray gaMax; GpuArray gaArgmax; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const unsigned reduxList[] = {0,2}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = calloc(sizeof(*pMax), dims[1]); unsigned long *pArgmax = calloc(sizeof(*pArgmax), dims[1]); ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); ck_assert_ptr_ne(pArgmax, NULL); /** * Initialize source data. */ pcgSeed(1); for(i=0;i gtMax){ gtMax = v; gtArgmax = i*dims[2] + k; } } } ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } /** * Deallocate. */ free(pSrc); free(pMax); free(pArgmax); GpuArray_clear(&gaSrc); GpuArray_clear(&gaMax); GpuArray_clear(&gaArgmax); }END_TEST START_TEST(test_idxtranspose){ /** * We test here the same reduction as test_reduction, except with a * reversed reduxList {2,0} instead of {0,2}. That should lead to a * transposition of the argmax "coordinates" and thus a change in its * "flattened" output version. */ GpuArray gaSrc; GpuArray gaMax; GpuArray gaArgmax; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; size_t rdxDims[1] = {50}; size_t rdxProdDims = rdxDims[0]; const unsigned reduxList[] = {2,0}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = calloc(sizeof(*pMax), rdxProdDims); unsigned long *pArgmax = calloc(sizeof(*pArgmax), rdxProdDims); ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); ck_assert_ptr_ne(pArgmax, NULL); /** * Initialize source data. */ pcgSeed(1); for(i=0;i gtMax){ gtMax = v; gtArgmax = k*dims[0] + i; } } } ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } /** * Deallocate. */ free(pSrc); free(pMax); free(pArgmax); GpuArray_clear(&gaSrc); GpuArray_clear(&gaMax); GpuArray_clear(&gaArgmax); }END_TEST START_TEST(test_veryhighrank){ /** * Here we test a reduction of a random 8D tensor on four dimensions. 
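 *
 * The source shape is {1171,373,2,1,2,1,2,1} and the reduced axes in
 * reduxList are {2,4,7,5}, so the kept axes {0,1,3,6} give the destination
 * shape {1171,373,1,2} (rdxDims below).  The reference argmax is flattened in
 * reduxList order, i.e. (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n, and that
 * is the value the device result is compared against.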
*/ GpuArray gaSrc; GpuArray gaMax; GpuArray gaArgmax; size_t dstIdx; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const unsigned reduxList[] = {2,4,7,5}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = calloc(sizeof(*pMax), rdxProdDims); unsigned long *pArgmax = calloc(sizeof(*pArgmax), rdxProdDims); ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); ck_assert_ptr_ne(pArgmax, NULL); /** * Initialize source data. */ pcgSeed(1); for(i=0;i gtMax){ gtMax = v; gtArgmax = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; } } } } } dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; ck_assert_msg(gtMax == pMax[dstIdx], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[dstIdx], "Argmax value mismatch!"); } } } } /** * Deallocate. */ free(pSrc); free(pMax); free(pArgmax); GpuArray_clear(&gaSrc); GpuArray_clear(&gaMax); GpuArray_clear(&gaArgmax); }END_TEST START_TEST(test_alldimsreduced){ /** * We test here a reduction of some random 3D tensor on all dimensions. */ GpuArray gaSrc; GpuArray gaMax; GpuArray gaArgmax; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const unsigned reduxList[] = {0,1,2}; size_t gtArgmax; float gtMax; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = calloc(1, sizeof(*pMax)); unsigned long *pArgmax = calloc(1, sizeof(*pArgmax)); ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); ck_assert_ptr_ne(pArgmax, NULL); /** * Initialize source data. */ pcgSeed(1); for(i=0;i gtMax){ gtMax = v; gtArgmax = (i*dims[1] + j)*dims[2] + k; } } } } ck_assert_msg(gtMax == pMax[0], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!"); /** * Deallocate. 
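 * Since reduxList = {0,1,2} reduces every axis, pMax and pArgmax are
 * single-element host buffers; they are released with free() below and the
 * GpuArrays are dropped with GpuArray_clear(), as in the other reduction
 * tests.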
*/ free(pSrc); free(pMax); free(pArgmax); GpuArray_clear(&gaSrc); GpuArray_clear(&gaMax); GpuArray_clear(&gaArgmax); }END_TEST Suite *get_suite(void) { Suite *s = suite_create("reduction"); TCase *tc = tcase_create("basic"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 15.0); tcase_add_test(tc, test_reduction); tcase_add_test(tc, test_idxtranspose); tcase_add_test(tc, test_veryhighrank); tcase_add_test(tc, test_alldimsreduced); suite_add_tcase(s, tc); return s; } libgpuarray-0.7.6/tests/check_types.c000066400000000000000000000036511326743622600176740ustar00rootroot00000000000000#include #include "gpuarray/error.h" #include "gpuarray/types.h" #include "gpuarray/util.h" static gpuarray_type t; static gpuarray_type t2; START_TEST(test_register_type) { int typecode; const gpuarray_type *pt, *pt2; /* Check that registration works */ t.cluda_name = "void"; t.size = 0xf0f0; t.align = 0xabcd; typecode = gpuarray_register_type(&t, NULL); ck_assert(typecode != -1); ck_assert(t.typecode == typecode); pt = gpuarray_get_type(typecode); ck_assert(pt != NULL); ck_assert(pt == &t); /* Check that a second type does not overwrite the first */ t2.cluda_name = "potato"; t2.size = 0x0f0f; t2.align = 0xdcba; typecode = gpuarray_register_type(&t2, NULL); ck_assert(typecode != -1); ck_assert(t2.typecode == typecode); ck_assert(t.typecode != typecode); /* Check that the first type did not move */ pt2 = gpuarray_get_type(t.typecode); ck_assert(pt2 == pt); } END_TEST START_TEST(test_get_type) { const gpuarray_type *pt; pt = gpuarray_get_type(0); ck_assert(pt->typecode == 0); pt = gpuarray_get_type(GA_FLOAT); ck_assert(pt->typecode == GA_FLOAT); pt = gpuarray_get_type(GA_NBASE); ck_assert(pt->typecode == -1); pt = gpuarray_get_type(GA_DELIM); ck_assert(pt->typecode == -1); pt = gpuarray_get_type(GA_DOUBLE2); ck_assert(pt->typecode == GA_DOUBLE2); pt = gpuarray_get_type(GA_ENDVEC); ck_assert(pt->typecode == -1); pt = gpuarray_get_type(512); ck_assert(pt->typecode == -1); pt = gpuarray_get_type(513); ck_assert(pt->typecode == -1); } END_TEST START_TEST(test_get_elsize) { ck_assert(gpuarray_get_elsize(GA_INT) == 4); ck_assert(gpuarray_get_elsize(GA_DELIM) == 0); } END_TEST Suite *get_suite(void) { Suite *s = suite_create("types"); TCase *tc = tcase_create("All"); tcase_add_test(tc, test_register_type); tcase_add_test(tc, test_get_type); tcase_add_test(tc, test_get_elsize); suite_add_tcase(s, tc); return s; } libgpuarray-0.7.6/tests/check_util.c000066400000000000000000000074271326743622600175120ustar00rootroot00000000000000#include #include #include "gpuarray/buffer.h" #include "gpuarray/util.h" START_TEST(test_register_type) { int tcode; gpuarray_type *t = malloc(sizeof(*t)); ck_assert(t != NULL); t->cluda_name = "ga_test"; t->size = 5; t->align = 1; t->typecode = 1; /* Normally you don't fill this */ tcode = gpuarray_register_type(t, NULL); ck_assert(tcode != -1); ck_assert(tcode == t->typecode); ck_assert(gpuarray_get_type(tcode) != NULL); ck_assert_str_eq(gpuarray_get_type(tcode)->cluda_name, "ga_test"); } END_TEST START_TEST(test_type_flags) { ck_assert_int_eq(gpuarray_type_flags(-1), 0); ck_assert_int_eq(gpuarray_type_flags(GA_FLOAT, -1), 0); ck_assert_int_eq(gpuarray_type_flags(GA_DOUBLE, -1), GA_USE_DOUBLE); ck_assert_int_eq(gpuarray_type_flags(GA_CFLOAT, -1), GA_USE_COMPLEX); ck_assert_int_eq(gpuarray_type_flags(GA_CDOUBLE, -1), GA_USE_DOUBLE|GA_USE_COMPLEX); ck_assert_int_eq(gpuarray_type_flags(GA_HALF, -1), GA_USE_HALF|GA_USE_SMALL); ck_assert_int_eq(gpuarray_type_flags(GA_BYTE, -1), 
GA_USE_SMALL); ck_assert_int_eq(gpuarray_type_flags(GA_SHORT, GA_DOUBLE, -1), GA_USE_SMALL|GA_USE_DOUBLE); ck_assert_int_eq(gpuarray_type_flags(GA_DOUBLE, GA_DOUBLE, -1), GA_USE_DOUBLE); } END_TEST START_TEST(test_elemwise_collapse) { size_t dims[3]; ssize_t *strs[2]; ssize_t _strs0[3]; ssize_t _strs1[3]; unsigned int nd; strs[0] = _strs0; strs[1] = _strs1; nd = 3; dims[0] = 50; dims[1] = 1; dims[2] = 20; strs[0][0] = 80; strs[0][1] = 80; strs[0][2] = 4; strs[1][0] = 80; strs[1][1] = 80; strs[1][2] = 4; gpuarray_elemwise_collapse(2, &nd, dims, strs); ck_assert_uint_eq(nd, 1); ck_assert_uint_eq(dims[0], 1000); ck_assert_int_eq(strs[0][0], 4); ck_assert_int_eq(strs[1][0], 4); nd = 3; dims[0] = 50; dims[1] = 1; dims[2] = 20; strs[0][0] = 168; strs[0][1] = 80; strs[0][2] = 4; strs[1][0] = 80; strs[1][1] = 80; strs[1][2] = 4; gpuarray_elemwise_collapse(2, &nd, dims, strs); ck_assert_uint_eq(nd, 2); ck_assert_uint_eq(dims[0], 50); ck_assert_uint_eq(dims[1], 20); ck_assert_int_eq(strs[0][0], 168); ck_assert_int_eq(strs[0][1], 4); ck_assert_int_eq(strs[1][0], 80); ck_assert_int_eq(strs[1][1], 4); nd = 3; dims[0] = 20; dims[1] = 1; dims[2] = 50; strs[0][0] = 4; strs[0][1] = 80; strs[0][2] = 168; strs[1][0] = 4; strs[1][1] = 80; strs[1][2] = 80; gpuarray_elemwise_collapse(2, &nd, dims, strs); ck_assert_uint_eq(nd, 2); ck_assert_uint_eq(dims[0], 20); ck_assert_uint_eq(dims[1], 50); ck_assert_int_eq(strs[0][0], 4); ck_assert_int_eq(strs[0][1], 168); ck_assert_int_eq(strs[1][0], 4); ck_assert_int_eq(strs[1][1], 80); nd = 2; dims[0] = 1; dims[1] = 1; strs[0][0] = 4; strs[0][1] = 4; gpuarray_elemwise_collapse(1, &nd, dims, strs); ck_assert_uint_eq(nd, 1); ck_assert_uint_eq(dims[0], 1); ck_assert_int_eq(strs[0][0], 4); } END_TEST START_TEST(test_float2half) { const float f[] = { 2.9831426e-08f, 2e-25f, 2e-26f, 1.0005035f, 1.0002441f, 65519.f, 65520.f, }; const ga_half_t h[] = { {0x0001u}, /* 2e-24 */ {0x0000u}, /* 0 */ {0x0000u}, /* 0 */ {0x3c01u}, /* 1.0 + 2e-10 */ {0x3c00u}, /* 1.0 */ {0x7bffu}, /* 65504 */ {0x7c00u}, /* Inf */ }; unsigned int i; ga_half_t hr; for (i = 0; i < sizeof(f)/sizeof(f[0]); i++) { hr = ga_float2half(f[i]); ck_assert_int_eq(hr.h, h[i].h); } } END_TEST Suite *get_suite(void) { Suite *s = suite_create("util"); TCase *tc = tcase_create("All"); tcase_add_test(tc, test_register_type); tcase_add_test(tc, test_type_flags); tcase_add_test(tc, test_elemwise_collapse); tcase_add_test(tc, test_float2half); suite_add_tcase(s, tc); return s; } libgpuarray-0.7.6/tests/check_util_integerfactoring.c000066400000000000000000000437051326743622600231230ustar00rootroot00000000000000/* Includes */ #include #include #include #include #include #include #include "util/integerfactoring.h" /** * Primality Checker */ START_TEST(test_primalitychecker){ /* Tiny numbers */ ck_assert(!gaIIsPrime( 0ULL)); ck_assert(!gaIIsPrime( 1ULL)); ck_assert( gaIIsPrime( 2ULL)); ck_assert( gaIIsPrime( 3ULL)); ck_assert(!gaIIsPrime( 4ULL)); ck_assert( gaIIsPrime( 5ULL)); ck_assert(!gaIIsPrime( 6ULL)); ck_assert( gaIIsPrime( 7ULL)); ck_assert(!gaIIsPrime( 8ULL)); ck_assert(!gaIIsPrime( 9ULL)); ck_assert(!gaIIsPrime( 10ULL)); ck_assert( gaIIsPrime( 11ULL)); ck_assert(!gaIIsPrime( 12ULL)); ck_assert( gaIIsPrime( 13ULL)); ck_assert(!gaIIsPrime( 14ULL)); ck_assert(!gaIIsPrime( 15ULL)); ck_assert(!gaIIsPrime( 16ULL)); ck_assert( gaIIsPrime( 17ULL)); ck_assert(!gaIIsPrime( 18ULL)); ck_assert( gaIIsPrime( 19ULL)); ck_assert(!gaIIsPrime( 20ULL)); /* Small primes */ ck_assert( gaIIsPrime( 4987ULL)); ck_assert( gaIIsPrime( 4993ULL)); 
ck_assert( gaIIsPrime( 4999ULL)); /* Squares of primes */ ck_assert(!gaIIsPrime( 24870169ULL)); ck_assert(!gaIIsPrime( 24930049ULL)); ck_assert(!gaIIsPrime( 24990001ULL)); /* Catalan pseudoprimes */ ck_assert(!gaIIsPrime( 5907ULL)); ck_assert(!gaIIsPrime( 1194649ULL)); ck_assert(!gaIIsPrime( 12327121ULL)); /* Fermat base-2 pseudoprimes */ ck_assert(!gaIIsPrime( 341ULL)); ck_assert(!gaIIsPrime( 561ULL)); ck_assert(!gaIIsPrime( 645ULL)); ck_assert(!gaIIsPrime( 1105ULL)); ck_assert(!gaIIsPrime( 1387ULL)); ck_assert(!gaIIsPrime( 1729ULL)); ck_assert(!gaIIsPrime( 1905ULL)); ck_assert(!gaIIsPrime( 2047ULL)); ck_assert(!gaIIsPrime( 2465ULL)); ck_assert(!gaIIsPrime( 486737ULL)); /* Strong Lucas pseudoprimes */ ck_assert(!gaIIsPrime( 5459ULL)); ck_assert(!gaIIsPrime( 5459ULL)); ck_assert(!gaIIsPrime( 5459ULL)); ck_assert(!gaIIsPrime( 5777ULL)); ck_assert(!gaIIsPrime( 10877ULL)); ck_assert(!gaIIsPrime( 16109ULL)); ck_assert(!gaIIsPrime( 18971ULL)); ck_assert(!gaIIsPrime( 22499ULL)); ck_assert(!gaIIsPrime( 24569ULL)); ck_assert(!gaIIsPrime( 25199ULL)); ck_assert(!gaIIsPrime( 40309ULL)); ck_assert(!gaIIsPrime( 58519ULL)); ck_assert(!gaIIsPrime( 75077ULL)); ck_assert(!gaIIsPrime( 97439ULL)); ck_assert(!gaIIsPrime( 100127ULL)); ck_assert(!gaIIsPrime( 113573ULL)); ck_assert(!gaIIsPrime( 115639ULL)); ck_assert(!gaIIsPrime( 130139ULL)); /* Medium, prime. */ ck_assert( gaIIsPrime( 2100000011ULL)); ck_assert( gaIIsPrime( 2100000017ULL)); /* Large, non-smooth, composite */ ck_assert(!gaIIsPrime( 2196095973992233039ULL)); /* Largest prime < 2**64: */ ck_assert( gaIIsPrime(18446744073709551557ULL)); /* Largest integers */ ck_assert(!gaIIsPrime(18446744073709551613ULL)); ck_assert(!gaIIsPrime(18446744073709551614ULL)); ck_assert(!gaIIsPrime(18446744073709551615ULL)); }END_TEST /** * Integer Factorization test */ START_TEST(test_integerfactorization){ ga_factor_list fl; uint64_t n; /** * Attempt exact factorization for 2^64-1, no k-smoothness constraint. * Expected PASS with 3*5*17*257*641*65537*6700417 */ n = 18446744073709551615ULL; ck_assert_int_ne (gaIFactorize(n, 0, 0, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 1); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 1); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 17ULL), 1); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 257ULL), 1); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 641ULL), 1); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 65537ULL), 1); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 6700417ULL), 1); ck_assert_uint_eq(gaIFLGetProduct(&fl), n); /** * Attempt exact factorization for 2^64-1, 4096-smooth constraint. * Expected FAIL, because 2^64-1 possesses prime factors in excess of 4096. */ n = 18446744073709551615ULL; ck_assert_int_eq (gaIFactorize(n, 0, 4096, &fl), 0); /** * Attempt approximate factorization for 2^64-1, no k-smoothness constraint. * Unlimited growth permitted. * Expected PASS, since 2^64-1 rounds up to 2^64 and 2^64 trivially factorizes. */ n = 18446744073709551615ULL; ck_assert_int_ne (gaIFactorize(n, -1, 0, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 64); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 2); ck_assert_int_ne (gaIFLIsOverflowed(&fl), 0); /** * Attempt exact factorization for 2196095973992233039, no k-smoothness constraint. * 2196095973992233039 is a large, highly non-smooth number, with three enormous * factors. 
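 * (Judging from the other gaIFactorize() calls in this test, the 0 passed as
 * the second argument requests an exact product, with no growth allowed, and
 * the 0 third argument imposes no smoothness bound, so each of the three
 * primes must be recovered with multiplicity 1, as asserted below.)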
* Expected PASS *very quickly*, since it factorizes as 1299817*1299821*1299827 */ n = 2196095973992233039ULL; ck_assert_int_ne (gaIFactorize(n, 0, 0, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 1299817ULL), 1); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 1299821ULL), 1); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 1299827ULL), 1); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 1299827); ck_assert_uint_eq(gaIFLGetProduct(&fl), n); /** * Attempt approximate factorization for 2196095973992233039, 16-smooth constraint. * 2196095973992233039 is a large, highly non-smooth number, with three enormous * factors. It is not 64-smooth, so code paths that attempt approximate * factorization within the growth limits (.005%) are exercised. * * Expected PASS *relatively quickly*. */ n = 2196095973992233039ULL; ck_assert_int_ne (gaIFactorize(n, n*1.00005, 16, &fl), 0); ck_assert_uint_ge(gaIFLGetProduct(&fl), n); ck_assert_uint_le(gaIFLGetProduct(&fl), n*1.00005); /** * Attempt exact factorization of 7438473388800000000, 5-smooth constraint. * It is a large, 5-smooth number. This should exercise the 5-smooth * factorization path. */ n = 7438473388800000000ULL; ck_assert_int_ne (gaIFactorize(n, 0, 5, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 14); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 19); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 8); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 5); ck_assert_uint_eq(gaIFLGetProduct(&fl), n); /** * Attempt approximate factorization of 7438473388799999997, 2-smooth constraint. * It is a large, non-smooth number. This should exercise the optimal 2-smooth * factorizer in spite of the available, unlimited slack. */ n = 7438473388799999997ULL; ck_assert_int_ne (gaIFactorize(n, -1, 2, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 63); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 0); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 2); ck_assert_uint_eq(gaIFLGetProduct(&fl), 9223372036854775808ULL); /** * Attempt approximate factorization of 7438473388799999997, 3-smooth constraint. * It is a large, non-smooth number. This should exercise the optimal 3-smooth * factorizer in spite of the available, unlimited slack. */ n = 7438473388799999997ULL; ck_assert_int_ne (gaIFactorize(n, -1, 3, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 31); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 20); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 0); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 3); ck_assert_uint_eq(gaIFLGetProduct(&fl), 7487812485248974848ULL); /** * Attempt approximate factorization of 7438473388799999997, 5-smooth constraint. * It is a large, non-smooth number, but 3 integers above it is a 5-smooth * integer, 7438473388800000000. This should exercise the optimal 5-smooth * factorizer in spite of the available, unlimited slack. */ n = 7438473388799999997ULL; ck_assert_int_ne (gaIFactorize(n, -1, 5, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 14); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 19); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 8); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 5); ck_assert_uint_eq(gaIFLGetProduct(&fl), 7438473388800000000ULL); /** * Toughest challenge: Attempt very tight approximate factorization of * 9876543210987654321 with .01% slack and 43-smooth constraint. 
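 * ("Approximate" means the factored product may exceed n, but by at most the
 *  stated 0.01%: the assertions below require n <= product <= n*1.0001 and a
 *  greatest factor of at most 43.)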
* * This forces a bypass of the optimal 5-smooth factorizers and heavily * exercises the nextI:, subfactorize:, primetest: and newX jumps and * calculations. * * Expected PASS, "reasonably fast". */ n = 9876543210987654321ULL; ck_assert_int_ne (gaIFactorize(n, n*1.0001, 43, &fl), 0); ck_assert_uint_ge(gaIFLGetProduct(&fl), n); ck_assert_uint_le(gaIFLGetProduct(&fl), n*1.0001); ck_assert_uint_le(gaIFLGetGreatestFactor(&fl), 43); }END_TEST START_TEST(test_scheduler){ /* We use here the CUDA limits of a CC 3.0 GPU as an example. */ uint64_t maxBTot = 1024, maxBInd[] = { 1024, 1024, 64}, maxGTot = 0xFFFFFFFF, maxGInd[] = {2147483647, 65535, 65535}, warpSize = 32; int warpAxis; uint64_t dims[3]; ga_factor_list factBS[3], factGS[3], factCS[3]; unsigned long long intbBS[3], intbGS[3], intbCS[3]; unsigned long long intaBS[3], intaGS[3], intaCS[3]; /** * NOTE: If you want to view befores-and-afters of scheduling, #define PRINT * to something non-0. */ #define PRINT 0 /** * * Testcase: (895,1147,923) job, warpSize on axis 0. * */ { warpAxis = 0; dims[0] = 895; dims[1] = 1141; dims[2] = 923; dims[warpAxis] = (dims[warpAxis]+warpSize-1) / warpSize; /** * Factorization job must be successful. */ ck_assert(gaIFactorize(warpAxis==0?warpSize:1, 0, maxBInd[0], factBS+0)); ck_assert(gaIFactorize(warpAxis==1?warpSize:1, 0, maxBInd[1], factBS+1)); ck_assert(gaIFactorize(warpAxis==2?warpSize:1, 0, maxBInd[2], factBS+2)); ck_assert(gaIFactorize( 1, 0, maxBInd[0], factGS+0)); ck_assert(gaIFactorize( 1, 0, maxBInd[1], factGS+1)); ck_assert(gaIFactorize( 1, 0, maxBInd[2], factGS+2)); ck_assert(gaIFactorize( dims[0], dims[0]*1.1, maxBInd[0], factCS+0)); ck_assert(gaIFactorize( dims[1], dims[1]*1.1, maxBInd[1], factCS+1)); ck_assert(gaIFactorize( dims[2], dims[2]*1.1, maxBInd[2], factCS+2)); intbBS[0] = gaIFLGetProduct(factBS+0); intbBS[1] = gaIFLGetProduct(factBS+1); intbBS[2] = gaIFLGetProduct(factBS+2); intbGS[0] = gaIFLGetProduct(factGS+0); intbGS[1] = gaIFLGetProduct(factGS+1); intbGS[2] = gaIFLGetProduct(factGS+2); intbCS[0] = gaIFLGetProduct(factCS+0); intbCS[1] = gaIFLGetProduct(factCS+1); intbCS[2] = gaIFLGetProduct(factCS+2); /** * Ensure that factorization only *increases* the size of the problem. */ ck_assert_uint_ge(intbCS[0], dims[0]); ck_assert_uint_ge(intbCS[1], dims[1]); ck_assert_uint_ge(intbCS[2], dims[2]); /** * Run scheduler. */ #if PRINT printf("Before:\n"); printf("BS: (%6llu, %6llu, %6llu)\n", intbBS[0], intbBS[1], intbBS[2]); printf("GS: (%6llu, %6llu, %6llu)\n", intbGS[0], intbGS[1], intbGS[2]); printf("CS: (%6llu, %6llu, %6llu)\n", intbCS[0], intbCS[1], intbCS[2]); #endif gaIFLSchedule(3, maxBTot, maxBInd, maxGTot, maxGInd, factBS, factGS, factCS); intaBS[0] = gaIFLGetProduct(factBS+0); intaBS[1] = gaIFLGetProduct(factBS+1); intaBS[2] = gaIFLGetProduct(factBS+2); intaGS[0] = gaIFLGetProduct(factGS+0); intaGS[1] = gaIFLGetProduct(factGS+1); intaGS[2] = gaIFLGetProduct(factGS+2); intaCS[0] = gaIFLGetProduct(factCS+0); intaCS[1] = gaIFLGetProduct(factCS+1); intaCS[2] = gaIFLGetProduct(factCS+2); #if PRINT printf("After:\n"); printf("BS: (%6llu, %6llu, %6llu)\n", intaBS[0], intaBS[1], intaBS[2]); printf("GS: (%6llu, %6llu, %6llu)\n", intaGS[0], intaGS[1], intaGS[2]); printf("CS: (%6llu, %6llu, %6llu)\n", intaCS[0], intaCS[1], intaCS[2]); #endif /** * Scheduling is only about moving factors between block/grid/chunk factor * lists. Therefore, the three dimensions must not have changed size. 
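 * Concretely, for each axis i the product blockSize*gridSize*chunkSize, i.e.
 * intbBS[i]*intbGS[i]*intbCS[i], must equal intaBS[i]*intaGS[i]*intaCS[i]
 * after gaIFLSchedule() has run, which is exactly what the three equality
 * assertions below check.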
*/ ck_assert_uint_eq(intbBS[0]*intbGS[0]*intbCS[0], intaBS[0]*intaGS[0]*intaCS[0]); ck_assert_uint_eq(intbBS[1]*intbGS[1]*intbCS[1], intaBS[1]*intaGS[1]*intaCS[1]); ck_assert_uint_eq(intbBS[2]*intbGS[2]*intbCS[2], intaBS[2]*intaGS[2]*intaCS[2]); /** * Verify that the individual limits and global limits on threads in a * block and blocks in a grid are met. */ ck_assert_uint_le(intaBS[0], maxBInd[0]); ck_assert_uint_le(intaBS[1], maxBInd[1]); ck_assert_uint_le(intaBS[2], maxBInd[2]); ck_assert_uint_le(intaGS[0], maxGInd[0]); ck_assert_uint_le(intaGS[1], maxGInd[1]); ck_assert_uint_le(intaGS[2], maxGInd[2]); ck_assert_uint_le(intaBS[0]*intaBS[1]*intaBS[2], maxBTot); ck_assert_uint_le(intaGS[0]*intaGS[1]*intaGS[2], maxGTot); } /** * * Testcase: (1,1,121632959) job, warpSize on axis 2. * */ { warpAxis = 2; dims[0] = 1; dims[1] = 1; dims[2] = 121632959; dims[warpAxis] = (dims[warpAxis]+warpSize-1) / warpSize; /** * Factorization job must be successful. */ ck_assert(gaIFactorize(warpAxis==0?warpSize:1, 0, maxBInd[0], factBS+0)); ck_assert(gaIFactorize(warpAxis==1?warpSize:1, 0, maxBInd[1], factBS+1)); ck_assert(gaIFactorize(warpAxis==2?warpSize:1, 0, maxBInd[2], factBS+2)); ck_assert(gaIFactorize( 1, 0, maxBInd[0], factGS+0)); ck_assert(gaIFactorize( 1, 0, maxBInd[1], factGS+1)); ck_assert(gaIFactorize( 1, 0, maxBInd[2], factGS+2)); ck_assert(gaIFactorize( dims[0], dims[0]*1.1, maxBInd[0], factCS+0)); ck_assert(gaIFactorize( dims[1], dims[1]*1.1, maxBInd[1], factCS+1)); ck_assert(gaIFactorize( dims[2], dims[2]*1.1, maxBInd[2], factCS+2)); intbBS[0] = gaIFLGetProduct(factBS+0); intbBS[1] = gaIFLGetProduct(factBS+1); intbBS[2] = gaIFLGetProduct(factBS+2); intbGS[0] = gaIFLGetProduct(factGS+0); intbGS[1] = gaIFLGetProduct(factGS+1); intbGS[2] = gaIFLGetProduct(factGS+2); intbCS[0] = gaIFLGetProduct(factCS+0); intbCS[1] = gaIFLGetProduct(factCS+1); intbCS[2] = gaIFLGetProduct(factCS+2); /** * Ensure that factorization only *increases* the size of the problem. */ ck_assert_uint_ge(intbCS[0], dims[0]); ck_assert_uint_ge(intbCS[1], dims[1]); ck_assert_uint_ge(intbCS[2], dims[2]); /** * Run scheduler. */ #if PRINT printf("Before:\n"); printf("BS: (%6llu, %6llu, %6llu)\n", intbBS[0], intbBS[1], intbBS[2]); printf("GS: (%6llu, %6llu, %6llu)\n", intbGS[0], intbGS[1], intbGS[2]); printf("CS: (%6llu, %6llu, %6llu)\n", intbCS[0], intbCS[1], intbCS[2]); #endif gaIFLSchedule(3, maxBTot, maxBInd, maxGTot, maxGInd, factBS, factGS, factCS); intaBS[0] = gaIFLGetProduct(factBS+0); intaBS[1] = gaIFLGetProduct(factBS+1); intaBS[2] = gaIFLGetProduct(factBS+2); intaGS[0] = gaIFLGetProduct(factGS+0); intaGS[1] = gaIFLGetProduct(factGS+1); intaGS[2] = gaIFLGetProduct(factGS+2); intaCS[0] = gaIFLGetProduct(factCS+0); intaCS[1] = gaIFLGetProduct(factCS+1); intaCS[2] = gaIFLGetProduct(factCS+2); #if PRINT printf("After:\n"); printf("BS: (%6llu, %6llu, %6llu)\n", intaBS[0], intaBS[1], intaBS[2]); printf("GS: (%6llu, %6llu, %6llu)\n", intaGS[0], intaGS[1], intaGS[2]); printf("CS: (%6llu, %6llu, %6llu)\n", intaCS[0], intaCS[1], intaCS[2]); #endif /** * Scheduling is only about moving factors between block/grid/chunk factor * lists. Therefore, the three dimensions must not have changed size. 
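 * The same per-axis product invariant as in the first test case is checked
 * here; this time essentially all of the work sits on axis 2, where roughly
 * 3.8 million warp-sized chunks (121632959 / 32, rounded up) have to be
 * redistributed between block, grid and chunk factors without changing the
 * product.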
*/ ck_assert_uint_eq(intbBS[0]*intbGS[0]*intbCS[0], intaBS[0]*intaGS[0]*intaCS[0]); ck_assert_uint_eq(intbBS[1]*intbGS[1]*intbCS[1], intaBS[1]*intaGS[1]*intaCS[1]); ck_assert_uint_eq(intbBS[2]*intbGS[2]*intbCS[2], intaBS[2]*intaGS[2]*intaCS[2]); /** * Verify that the individual limits and global limits on threads in a * block and blocks in a grid are met. */ ck_assert_uint_le(intaBS[0], maxBInd[0]); ck_assert_uint_le(intaBS[1], maxBInd[1]); ck_assert_uint_le(intaBS[2], maxBInd[2]); ck_assert_uint_le(intaGS[0], maxGInd[0]); ck_assert_uint_le(intaGS[1], maxGInd[1]); ck_assert_uint_le(intaGS[2], maxGInd[2]); ck_assert_uint_le(intaBS[0]*intaBS[1]*intaBS[2], maxBTot); ck_assert_uint_le(intaGS[0]*intaGS[1]*intaGS[2], maxGTot); } }END_TEST Suite *get_suite(void){ Suite *s = suite_create("util_integerfactoring"); TCase *tc = tcase_create("All"); tcase_set_timeout(tc, 10.0); tcase_add_test(tc, test_primalitychecker); tcase_add_test(tc, test_integerfactorization); tcase_add_test(tc, test_scheduler); suite_add_tcase(s, tc); return s; } libgpuarray-0.7.6/tests/communicator.c000066400000000000000000000021261326743622600200670ustar00rootroot00000000000000#include #include #include "gpuarray/buffer.h" #include "gpuarray/buffer_collectives.h" #include "gpuarray/error.h" extern gpucontext* ctx; int comm_ndev; //!< number of devices in the comm int comm_rank; //!< comm's rank in the world // (for the tests it's the same as process rank in MPI_COMM_WORLD) gpucomm* comm; extern void setup(void); extern void teardown(void); /** * \brief Setup for `check_buffer_collectives.c` and `check_collectives.c`. * * Includes tests for `gpucomm_new` and `gpucomm_gen_clique_id` */ void setup_comm(void) { int err; gpucommCliqueId comm_id; setup(); MPI_Barrier(MPI_COMM_WORLD); err = gpucomm_gen_clique_id(ctx, &comm_id); // Has successfully got a unique comm id. ck_assert_int_eq(err, GA_NO_ERROR); MPI_Bcast(&comm_id, GA_COMM_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD); err = gpucomm_new(&comm, ctx, comm_id, comm_ndev, comm_rank % comm_ndev); // Has successfully created a new gpucomm. 
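  // (That is: every rank generates an id, but MPI_Bcast overwrites it with
  // rank 0's id so that all processes join the same clique, and gpucomm_new
  // then registers this process under its own rank; both the error code and
  // the returned handle are checked below.)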
ck_assert_int_eq(err, GA_NO_ERROR); ck_assert_ptr_ne(comm, NULL); } void teardown_comm(void) { gpucomm_free(comm); teardown(); } libgpuarray-0.7.6/tests/device.c000066400000000000000000000031351326743622600166270ustar00rootroot00000000000000#include #include #include #include #include "gpuarray/buffer.h" #include "gpuarray/error.h" char* dev_name = NULL; int get_env_dev(const char **name, gpucontext_props *p) { char *dev = NULL; char *end; long no; int pl; dev = dev_name; if (dev == NULL) { if ((dev = getenv("GPUARRAY_TEST_DEVICE")) == NULL) { if ((dev = getenv("DEVICE")) == NULL) { fprintf(stderr, "No device specified for testing, specify a device with DEVICE or GPUARRAY_TEST_DEVICE"); return -1; } } } if (strncmp(dev, "cuda", 4) == 0) { *name = "cuda"; no = strtol(dev + 4, &end, 10); if (end == dev || *end != '\0') return -1; if (no < 0 || no > INT_MAX) return -1; gpucontext_props_cuda_dev(p, (int)no); return 0; } if (strncmp(dev, "opencl", 6) == 0) { *name = "opencl"; no = strtol(dev + 6, &end, 10); if (end == dev || *end != ':') return -1; if (no < 0 || no > 32768) return -1; pl = (int)no; dev = end; no = strtol(dev + 1, &end, 10); if (end == dev || *end != '\0') return -1; if (no < 0 || no > 32768) return -1; gpucontext_props_opencl_dev(p, pl, (int)no); return 0; } return -1; } gpucontext *ctx; void setup(void) { const char *name = NULL; gpucontext_props *p; ck_assert_int_eq(gpucontext_props_new(&p), GA_NO_ERROR); ck_assert_int_eq(get_env_dev(&name, p), 0); ck_assert_int_eq(gpucontext_init(&ctx, name, p), GA_NO_ERROR); ck_assert_ptr_ne(ctx, NULL); } void teardown(void) { gpucontext_deref(ctx); ctx = NULL; } libgpuarray-0.7.6/tests/main.c000066400000000000000000000023351326743622600163150ustar00rootroot00000000000000#include #ifdef TEST_COLLECTIVES #include #endif // TEST_COLLECTIVES #include #ifdef TEST_COLLECTIVES #include extern int comm_ndev; extern int comm_rank; extern char *dev_name; #endif // TEST_COLLECTIVES extern Suite *get_suite(void); int main(int argc, char *argv[]) { int number_failed; Suite *s; SRunner *sr; #ifdef TEST_COLLECTIVES MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &comm_ndev); MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank); if (argc < comm_ndev) { if (comm_rank == 0) printf("Usage : %s \n", argv[0]); exit(1); } dev_name = argv[comm_rank + 1]; // Set a gpu for this process. #endif // TEST_COLLECTIVES s = get_suite(); sr = srunner_create(s); #ifdef TEST_COLLECTIVES // Check by default forks to another (non mpi registered) process in order to // run tests. Using MPI inside tests means we must disable this. srunner_set_fork_status(sr, CK_NOFORK); #endif // TEST_COLLECTIVES srunner_run_all(sr, CK_VERBOSE); number_failed = srunner_ntests_failed(sr); srunner_free(sr); #ifdef TEST_COLLECTIVES MPI_Finalize(); #endif // TEST_COLLECTIVES return number_failed == 0 ? EXIT_SUCCESS : EXIT_FAILURE; } libgpuarray-0.7.6/versioneer.py000066400000000000000000002060021326743622600166130ustar00rootroot00000000000000 # Version: 0.18 """The Versioneer - like a rocketeer, but for versions. The Versioneer ============== * like a rocketeer, but for versions! 
* https://github.com/warner/python-versioneer * Brian Warner * License: Public Domain * Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy * [![Latest Version] (https://pypip.in/version/versioneer/badge.svg?style=flat) ](https://pypi.python.org/pypi/versioneer/) * [![Build Status] (https://travis-ci.org/warner/python-versioneer.png?branch=master) ](https://travis-ci.org/warner/python-versioneer) This is a tool for managing a recorded version number in distutils-based python projects. The goal is to remove the tedious and error-prone "update the embedded version string" step from your release process. Making a new release should be as easy as recording a new tag in your version-control system, and maybe making new tarballs. ## Quick Install * `pip install versioneer` to somewhere to your $PATH * add a `[versioneer]` section to your setup.cfg (see below) * run `versioneer install` in your source tree, commit the results ## Version Identifiers Source trees come from a variety of places: * a version-control system checkout (mostly used by developers) * a nightly tarball, produced by build automation * a snapshot tarball, produced by a web-based VCS browser, like github's "tarball from tag" feature * a release tarball, produced by "setup.py sdist", distributed through PyPI Within each source tree, the version identifier (either a string or a number, this tool is format-agnostic) can come from a variety of places: * ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows about recent "tags" and an absolute revision-id * the name of the directory into which the tarball was unpacked * an expanded VCS keyword ($Id$, etc) * a `_version.py` created by some earlier build step For released software, the version identifier is closely related to a VCS tag. Some projects use tag names that include more than just the version string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool needs to strip the tag prefix to extract the version identifier. For unreleased software (between tags), the version identifier should provide enough information to help developers recreate the same tree, while also giving them an idea of roughly how old the tree is (after version 1.2, before version 1.3). Many VCS systems can report a description that captures this, for example `git describe --tags --dirty --always` reports things like "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has uncommitted changes. The version identifier is used for multiple purposes: * to allow the module to self-identify its version: `myproject.__version__` * to choose a name and prefix for a 'setup.py sdist' tarball ## Theory of Operation Versioneer works by adding a special `_version.py` file into your source tree, where your `__init__.py` can import it. This `_version.py` knows how to dynamically ask the VCS tool for version information at import time. `_version.py` also contains `$Revision$` markers, and the installation process marks `_version.py` to have this marker rewritten with a tag name during the `git archive` command. As a result, generated tarballs will contain enough information to get the proper version. To allow `setup.py` to compute a version too, a `versioneer.py` is added to the top level of your source tree, next to `setup.py` and the `setup.cfg` that configures it. 
This overrides several distutils/setuptools commands to compute the version when invoked, and changes `setup.py build` and `setup.py sdist` to replace `_version.py` with a small static file that contains just the generated version data. ## Installation See [INSTALL.md](./INSTALL.md) for detailed installation instructions. ## Version-String Flavors Code which uses Versioneer can learn about its version string at runtime by importing `_version` from your main `__init__.py` file and running the `get_versions()` function. From the "outside" (e.g. in `setup.py`), you can import the top-level `versioneer.py` and run `get_versions()`. Both functions return a dictionary with different flavors of version information: * `['version']`: A condensed version string, rendered using the selected style. This is the most commonly used value for the project's version string. The default "pep440" style yields strings like `0.11`, `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section below for alternative styles. * `['full-revisionid']`: detailed revision identifier. For Git, this is the full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". * `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the commit date in ISO 8601 format. This will be None if the date is not available. * `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that this is only accurate if run in a VCS checkout, otherwise it is likely to be False or None * `['error']`: if the version string could not be computed, this will be set to a string describing the problem, otherwise it will be None. It may be useful to throw an exception in setup.py if this is set, to avoid e.g. creating tarballs with a version string of "unknown". Some variants are more useful than others. Including `full-revisionid` in a bug report should allow developers to reconstruct the exact code being tested (or indicate the presence of local changes that should be shared with the developers). `version` is suitable for display in an "about" box or a CLI `--version` output: it can be easily compared against release notes and lists of bugs fixed in various releases. The installer adds the following text to your `__init__.py` to place a basic version in `YOURPROJECT.__version__`: from ._version import get_versions __version__ = get_versions()['version'] del get_versions ## Styles The setup.cfg `style=` configuration controls how the VCS information is rendered into a version string. The default style, "pep440", produces a PEP440-compliant string, equal to the un-prefixed tag name for actual releases, and containing an additional "local version" section with more detail for in-between builds. For Git, this is TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and that this commit is two revisions ("+2") beyond the "0.11" tag. For released software (exactly equal to a known tag), the identifier will only contain the stripped tag, e.g. "0.11". Other styles are available. See [details.md](details.md) in the Versioneer source tree for descriptions. ## Debugging Versioneer tries to avoid fatal errors: if something goes wrong, it will tend to return a version of "0+unknown". 
To investigate the problem, run `setup.py version`, which will run the version-lookup code in a verbose mode, and will display the full contents of `get_versions()` (including the `error` string, which may help identify what went wrong). ## Known Limitations Some situations are known to cause problems for Versioneer. This details the most significant ones. More can be found on Github [issues page](https://github.com/warner/python-versioneer/issues). ### Subprojects Versioneer has limited support for source trees in which `setup.py` is not in the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are two common reasons why `setup.py` might not be in the root: * Source trees which contain multiple subprojects, such as [Buildbot](https://github.com/buildbot/buildbot), which contains both "master" and "slave" subprojects, each with their own `setup.py`, `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI distributions (and upload multiple independently-installable tarballs). * Source trees whose main purpose is to contain a C library, but which also provide bindings to Python (and perhaps other langauges) in subdirectories. Versioneer will look for `.git` in parent directories, and most operations should get the right version string. However `pip` and `setuptools` have bugs and implementation details which frequently cause `pip install .` from a subproject directory to fail to find a correct version string (so it usually defaults to `0+unknown`). `pip install --editable .` should work correctly. `setup.py install` might work too. Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in some later version. [Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking this issue. The discussion in [PR #61](https://github.com/warner/python-versioneer/pull/61) describes the issue from the Versioneer side in more detail. [pip PR#3176](https://github.com/pypa/pip/pull/3176) and [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve pip to let Versioneer work correctly. Versioneer-0.16 and earlier only looked for a `.git` directory next to the `setup.cfg`, so subprojects were completely unsupported with those releases. ### Editable installs with setuptools <= 18.5 `setup.py develop` and `pip install --editable .` allow you to install a project into a virtualenv once, then continue editing the source code (and test) without re-installing after every change. "Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a convenient way to specify executable scripts that should be installed along with the python package. These both work as expected when using modern setuptools. When using setuptools-18.5 or earlier, however, certain operations will cause `pkg_resources.DistributionNotFound` errors when running the entrypoint script, which must be resolved by re-installing the package. This happens when the install happens with one version, then the egg_info data is regenerated while a different version is checked out. Many setup.py commands cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into a different virtualenv), so this can be surprising. [Bug #83](https://github.com/warner/python-versioneer/issues/83) describes this one, but upgrading to a newer version of setuptools should probably resolve it. ### Unicode version strings While Versioneer works (and is continually tested) with both Python 2 and Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. 
Newer releases probably generate unicode version strings on py2. It's not clear that this is wrong, but it may be surprising for applications when then write these strings to a network connection or include them in bytes-oriented APIs like cryptographic checksums. [Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates this question. ## Updating Versioneer To upgrade your project to a new release of Versioneer, do the following: * install the new Versioneer (`pip install -U versioneer` or equivalent) * edit `setup.cfg`, if necessary, to include any new configuration settings indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. * re-run `versioneer install` in your source tree, to replace `SRC/_version.py` * commit any changed files ## Future Directions This tool is designed to make it easily extended to other version-control systems: all VCS-specific components are in separate directories like src/git/ . The top-level `versioneer.py` script is assembled from these components by running make-versioneer.py . In the future, make-versioneer.py will take a VCS name as an argument, and will construct a version of `versioneer.py` that is specific to the given VCS. It might also take the configuration arguments that are currently provided manually during installation by editing setup.py . Alternatively, it might go the other direction and include code from all supported VCS systems, reducing the number of intermediate scripts. ## License To make Versioneer easier to embed, all its code is dedicated to the public domain. The `_version.py` that it creates is also in the public domain. Specifically, both are released under the Creative Commons "Public Domain Dedication" license (CC0-1.0), as described in https://creativecommons.org/publicdomain/zero/1.0/ . """ from __future__ import print_function try: import configparser except ImportError: import ConfigParser as configparser import errno import json import os import re import subprocess import sys class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_root(): """Get the project root directory. We require that all commands are run from the project root, i.e. the directory that contains setup.py, setup.cfg, and versioneer.py . """ root = os.path.realpath(os.path.abspath(os.getcwd())) setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): # allow 'python path/to/setup.py COMMAND' root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): err = ("Versioneer was unable to run the project root directory. " "Versioneer requires setup.py to be executed from " "its immediate directory (like 'python setup.py COMMAND'), " "or in a way that lets it use sys.argv[0] to find the root " "(like 'python path/to/setup.py COMMAND').") raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools # tree) execute all dependencies in a single python process, so # "versioneer" may be imported multiple times, and python's shared # module-import table will cache the first one. So we can't use # os.path.dirname(__file__), as that will find whichever # versioneer.py was first imported, even in later projects. 
me = os.path.realpath(os.path.abspath(__file__)) me_dir = os.path.normcase(os.path.splitext(me)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir: print("Warning: build in %s is using versioneer.py from %s" % (os.path.dirname(me), versioneer_py)) except NameError: pass return root def get_config_from_root(root): """Read the project setup.cfg file to determine Versioneer config.""" # This might raise EnvironmentError (if setup.cfg is missing), or # configparser.NoSectionError (if it lacks a [versioneer] section), or # configparser.NoOptionError (if it lacks "VCS="). See the docstring at # the top of versioneer.py for instructions on writing your setup.cfg . setup_cfg = os.path.join(root, "setup.cfg") parser = configparser.SafeConfigParser() with open(setup_cfg, "r") as f: parser.readfp(f) VCS = parser.get("versioneer", "VCS") # mandatory def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" cfg.versionfile_source = get(parser, "versionfile_source") cfg.versionfile_build = get(parser, "versionfile_build") cfg.tag_prefix = get(parser, "tag_prefix") if cfg.tag_prefix in ("''", '""'): cfg.tag_prefix = "" cfg.parentdir_prefix = get(parser, "parentdir_prefix") cfg.verbose = get(parser, "verbose") return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" # these dictionaries contain VCS-specific tools LONG_VERSION_PY = {} HANDLERS = {} def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = subprocess.Popen([c] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None)) break except EnvironmentError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None, None stdout = p.communicate()[0].strip() if sys.version_info[0] >= 3: stdout = stdout.decode() if p.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) return None, p.returncode return stdout, p.returncode LONG_VERSION_PY['git'] = ''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. Generated by # versioneer-0.18 (https://github.com/warner/python-versioneer) """Git implementation of _version.py.""" import errno import os import re import subprocess import sys def get_keywords(): """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. 
# setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_config(): """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "%(STYLE)s" cfg.tag_prefix = "%(TAG_PREFIX)s" cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY = {} HANDLERS = {} def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = subprocess.Popen([c] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None)) break except EnvironmentError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %%s" %% dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %%s" %% (commands,)) return None, None stdout = p.communicate()[0].strip() if sys.version_info[0] >= 3: stdout = stdout.decode() if p.returncode != 0: if verbose: print("unable to run %%s (error)" %% dispcmd) print("stdout was %%s" %% stdout) return None, p.returncode return stdout, p.returncode def versions_from_parentdir(parentdir_prefix, root, verbose): """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %%s but none started with prefix %%s" %% (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
keywords = {} try: f = open(versionfile_abs, "r") for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) f.close() except EnvironmentError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") date = keywords.get("date") if date is not None: # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = set([r.strip() for r in refnames.strip("()").split(",")]) # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %%d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = set([r for r in refs if re.search(r'\d', r)]) if verbose: print("discarding '%%s', no digits" %% ",".join(refs - tags)) if verbose: print("likely tags: %%s" %% ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] if verbose: print("picking %%s" %% r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. 
""" GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %%s not under git control" %% root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%%s*" %% tag_prefix], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%%s'" %% describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%%s' doesn't start with prefix '%%s'" print(fmt %% (full_tag, tag_prefix)) pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" %% (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def plus_or_dot(pieces): """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_pre(pieces): """TAG[.post.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 
0.post.devDISTANCE """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += ".post.dev%%d" %% pieces["distance"] else: # exception #1 rendered = "0.post.dev%%d" %% pieces["distance"] return rendered def render_pep440_post(pieces): """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] return rendered def render_pep440_old(pieces): """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Eexceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces): """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces): """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%%s'" %% style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} def get_versions(): """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which # case we can only use expanded keywords. 
cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. for i in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", "date": None} try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) return render(pieces, cfg.style) except NotThisMethod: pass try: if cfg.parentdir_prefix: return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) except NotThisMethod: pass return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None} ''' @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords = {} try: f = open(versionfile_abs, "r") for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) f.close() except EnvironmentError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") date = keywords.get("date") if date is not None: # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = set([r.strip() for r in refnames.strip("()").split(",")]) # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". 
tags = set([r for r in refs if re.search(r'\d', r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] if verbose: print("picking %s" % r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%s*" % tag_prefix], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
pieces["error"] = ("unable to parse git-describe output: '%s'" % describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" % (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def do_vcs_install(manifest_in, versionfile_source, ipy): """Git-specific installation logic for Versioneer. For Git, this means creating/changing .gitattributes to mark _version.py for export-subst keyword substitution. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] files = [manifest_in, versionfile_source] if ipy: files.append(ipy) try: me = __file__ if me.endswith(".pyc") or me.endswith(".pyo"): me = os.path.splitext(me)[0] + ".py" versioneer_file = os.path.relpath(me) except NameError: versioneer_file = "versioneer.py" files.append(versioneer_file) present = False try: f = open(".gitattributes", "r") for line in f.readlines(): if line.strip().startswith(versionfile_source): if "export-subst" in line.strip().split()[1:]: present = True f.close() except EnvironmentError: pass if not present: f = open(".gitattributes", "a+") f.write("%s export-subst\n" % versionfile_source) f.close() files.append(".gitattributes") run_command(GITS, ["add", "--"] + files) def versions_from_parentdir(parentdir_prefix, root, verbose): """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %s but none started with prefix %s" % (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") SHORT_VERSION_PY = """ # This file was generated by 'versioneer.py' (0.18) from # revision-control system data, or from the parent directory name of an # unpacked source archive. Distribution tarballs contain a pre-generated copy # of this file. 
import json version_json = ''' %s ''' # END VERSION_JSON def get_versions(): return json.loads(version_json) """ def versions_from_file(filename): """Try to determine the version from _version.py if present.""" try: with open(filename) as f: contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S) if not mo: mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) def write_to_version_file(filename, versions): """Write the given version number to the given _version.py file.""" os.unlink(filename) contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) print("set %s to '%s'" % (filename, versions["version"])) def plus_or_dot(pieces): """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_pre(pieces): """TAG[.post.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 0.post.devDISTANCE """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += ".post.dev%d" % pieces["distance"] else: # exception #1 rendered = "0.post.dev%d" % pieces["distance"] return rendered def render_pep440_post(pieces): """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%s" % pieces["short"] else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%s" % pieces["short"] return rendered def render_pep440_old(pieces): """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Eexceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces): """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces): """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%s'" % style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} class VersioneerBadRootError(Exception): """The project root directory is unknown or missing key files.""" def get_versions(verbose=False): """Get the project version from whatever source is available. Returns dict with two keys: 'version' and 'full'. """ if "versioneer" in sys.modules: # see the discussion in cmdclass.py:get_cmdclass() del sys.modules["versioneer"] root = get_root() cfg = get_config_from_root(root) assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose assert cfg.versionfile_source is not None, \ "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) # extract version from first of: _version.py, VCS command (e.g. 'git # describe'), parentdir. This is meant to work for developers using a # source checkout, for users of a tarball created by 'setup.py sdist', # and for users of a tarball/zipball created by 'git archive' or github's # download-from-tag feature or the equivalent in other VCSes. 
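    # Illustrative summary (not part of upstream Versioneer) of which
    # handler serves which situation described in the comment above:
    #   git keywords       -> 'git archive' / GitHub tarball ($Format$ expanded)
    #   _version.py file   -> tarball produced by 'setup.py sdist'
    #   pieces_from_vcs    -> a real git checkout (uses 'git describe')
    #   parentdir prefix   -> an unpacked tarball named like PROJECT-VERSION/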
get_keywords_f = handlers.get("get_keywords") from_keywords_f = handlers.get("keywords") if get_keywords_f and from_keywords_f: try: keywords = get_keywords_f(versionfile_abs) ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) if verbose: print("got version from expanded keyword %s" % ver) return ver except NotThisMethod: pass try: ver = versions_from_file(versionfile_abs) if verbose: print("got version from file %s %s" % (versionfile_abs, ver)) return ver except NotThisMethod: pass from_vcs_f = handlers.get("pieces_from_vcs") if from_vcs_f: try: pieces = from_vcs_f(cfg.tag_prefix, root, verbose) ver = render(pieces, cfg.style) if verbose: print("got version from VCS %s" % ver) return ver except NotThisMethod: pass try: if cfg.parentdir_prefix: ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) if verbose: print("got version from parentdir %s" % ver) return ver except NotThisMethod: pass if verbose: print("unable to compute version") return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None} def get_version(): """Get the short version string for this project.""" return get_versions()["version"] def get_cmdclass(): """Get the custom setuptools/distutils subclasses used by Versioneer.""" if "versioneer" in sys.modules: del sys.modules["versioneer"] # this fixes the "python setup.py develop" case (also 'install' and # 'easy_install .'), in which subdependencies of the main project are # built (using setup.py bdist_egg) in the same python process. Assume # a main project A and a dependency B, which use different versions # of Versioneer. A's setup.py imports A's Versioneer, leaving it in # sys.modules by the time B's setup.py is executed, causing B to run # with the wrong versioneer. Setuptools wraps the sub-dep builds in a # sandbox that restores sys.modules to it's pre-build state, so the # parent is protected against the child's "import versioneer". By # removing ourselves from sys.modules here, before the child build # happens, we protect the child from the parent's versioneer too. # Also see https://github.com/warner/python-versioneer/issues/52 cmds = {} # we add "version" to both distutils and setuptools from distutils.core import Command class cmd_version(Command): description = "report generated version string" user_options = [] boolean_options = [] def initialize_options(self): pass def finalize_options(self): pass def run(self): vers = get_versions(verbose=True) print("Version: %s" % vers["version"]) print(" full-revisionid: %s" % vers.get("full-revisionid")) print(" dirty: %s" % vers.get("dirty")) print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools # # most invocation pathways end up running build_py: # distutils/build -> build_py # distutils/install -> distutils/build ->.. # setuptools/bdist_wheel -> distutils/install ->.. # setuptools/bdist_egg -> distutils/install_lib -> build_py # setuptools/install -> bdist_egg ->.. # setuptools/develop -> ? # pip install: # copies source tree to a tempdir before running egg_info/etc # if .git isn't copied too, 'git describe' will fail # then does setup.py bdist_wheel, or sometimes setup.py install # setup.py egg_info -> ? 
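    # Illustrative note (not part of upstream Versioneer): the commands built
    # below only take effect if the project's setup.py wires them in, e.g.
    #     import versioneer
    #     setup(version=versioneer.get_version(),
    #           cmdclass=versioneer.get_cmdclass(), ...)
    # which is the same wiring that scan_setup_py() checks for.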
# we override different "build_py" commands for both environments if "setuptools" in sys.modules: from setuptools.command.build_py import build_py as _build_py else: from distutils.command.build_py import build_py as _build_py class cmd_build_py(_build_py): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_py.run(self) # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_py"] = cmd_build_py if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION # "product_version": versioneer.get_version(), # ... class cmd_build_exe(_build_exe): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _build_exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) cmds["build_exe"] = cmd_build_exe del cmds["build_py"] if 'py2exe' in sys.modules: # py2exe enabled? try: from py2exe.distutils_buildexe import py2exe as _py2exe # py3 except ImportError: from py2exe.build_exe import py2exe as _py2exe # py2 class cmd_py2exe(_py2exe): def run(self): root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _py2exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) cmds["py2exe"] = cmd_py2exe # we override different "sdist" commands for both environments if "setuptools" in sys.modules: from setuptools.command.sdist import sdist as _sdist else: from distutils.command.sdist import sdist as _sdist class cmd_sdist(_sdist): def run(self): versions = get_versions() self._versioneer_generated_versions = versions # unless we update this, the command will keep using the old # version self.distribution.metadata.version = versions["version"] return _sdist.run(self) def make_release_tree(self, base_dir, files): root = get_root() cfg = get_config_from_root(root) _sdist.make_release_tree(self, base_dir, files) # now locate _version.py in the new base_dir directory # (remembering that it may be a hardlink) and replace it with an # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist return cmds CONFIG_ERROR = """ setup.cfg is missing the necessary Versioneer configuration. 
You need a section like: [versioneer] VCS = git style = pep440 versionfile_source = src/myproject/_version.py versionfile_build = myproject/_version.py tag_prefix = parentdir_prefix = myproject- You will also need to edit your setup.py to use the results: import versioneer setup(version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), ...) Please read the docstring in ./versioneer.py for configuration instructions, edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. """ SAMPLE_CONFIG = """ # See the docstring in versioneer.py for instructions. Note that you must # re-run 'versioneer.py setup' after changing this section, and commit the # resulting files. [versioneer] #VCS = git #style = pep440 #versionfile_source = #versionfile_build = #tag_prefix = #parentdir_prefix = """ INIT_PY_SNIPPET = """ from ._version import get_versions __version__ = get_versions()['version'] del get_versions """ def do_setup(): """Main VCS-independent setup function for installing Versioneer.""" root = get_root() try: cfg = get_config_from_root(root) except (EnvironmentError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (EnvironmentError, configparser.NoSectionError)): print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) return 1 print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write(LONG % {"DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, }) ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: old = f.read() except EnvironmentError: old = "" if INIT_PY_SNIPPET not in old: print(" appending to %s" % ipy) with open(ipy, "a") as f: f.write(INIT_PY_SNIPPET) else: print(" %s unmodified" % ipy) else: print(" %s doesn't exist, ok" % ipy) ipy = None # Make sure both the top-level "versioneer.py" and versionfile_source # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so # they'll be copied into source distributions. Pip won't be able to # install the package without this. manifest_in = os.path.join(root, "MANIFEST.in") simple_includes = set() try: with open(manifest_in, "r") as f: for line in f: if line.startswith("include "): for include in line.split()[1:]: simple_includes.add(include) except EnvironmentError: pass # That doesn't cover everything MANIFEST.in can do # (http://docs.python.org/2/distutils/sourcedist.html#commands), so # it might give some false negatives. Appending redundant 'include' # lines is safe, though. if "versioneer.py" not in simple_includes: print(" appending 'versioneer.py' to MANIFEST.in") with open(manifest_in, "a") as f: f.write("include versioneer.py\n") else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: print(" appending versionfile_source ('%s') to MANIFEST.in" % cfg.versionfile_source) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: print(" versionfile_source already in MANIFEST.in") # Make VCS-specific changes. For git, this means creating/changing # .gitattributes to mark _version.py for export-subst keyword # substitution. 
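    # Illustrative note (not part of upstream Versioneer): with a made-up
    # versionfile_source of "mypkg/_version.py", do_vcs_install() below ends
    # up appending a line of the form
    #     mypkg/_version.py export-subst
    # to .gitattributes, so that 'git archive' expands the version keywords.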
    do_vcs_install(manifest_in, cfg.versionfile_source, ipy)
    return 0


def scan_setup_py():
    """Validate the contents of setup.py against Versioneer's expectations."""
    found = set()
    setters = False
    errors = 0
    with open("setup.py", "r") as f:
        for line in f.readlines():
            if "import versioneer" in line:
                found.add("import")
            if "versioneer.get_cmdclass()" in line:
                found.add("cmdclass")
            if "versioneer.get_version()" in line:
                found.add("get_version")
            if "versioneer.VCS" in line:
                setters = True
            if "versioneer.versionfile_source" in line:
                setters = True
    if len(found) != 3:
        print("")
        print("Your setup.py appears to be missing some important items")
        print("(but I might be wrong). Please make sure it has something")
        print("roughly like the following:")
        print("")
        print(" import versioneer")
        print(" setup( version=versioneer.get_version(),")
        print(" cmdclass=versioneer.get_cmdclass(), ...)")
        print("")
        errors += 1
    if setters:
        print("You should remove lines like 'versioneer.VCS = ' and")
        print("'versioneer.versionfile_source = ' . This configuration")
        print("now lives in setup.cfg, and should be removed from setup.py")
        print("")
        errors += 1
    return errors


if __name__ == "__main__":
    cmd = sys.argv[1]
    if cmd == "setup":
        errors = do_setup()
        errors += scan_setup_py()
        if errors:
            sys.exit(1)
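
# --- Illustrative addendum (not part of upstream Versioneer) ---------------
# A minimal sketch showing how the "pieces" dict produced by
# git_pieces_from_vcs() maps onto the rendering styles defined above.
# The tag, distance, hash and date values are made up, and the
# "--demo-render" flag is hypothetical (it is not a real Versioneer
# command); the guard only keeps this from running on import.
# For these pieces the "pep440" style yields "0.7.6+3.gabc1234.dirty".
if __name__ == "__main__" and len(sys.argv) > 1 and sys.argv[1] == "--demo-render":
    _demo_pieces = {"closest-tag": "0.7.6", "distance": 3,
                    "short": "abc1234", "long": "abc1234abc1234abc1234",
                    "dirty": True, "error": None,
                    "date": "2018-01-01T00:00:00+0000"}
    for _style in ("pep440", "pep440-pre", "pep440-post",
                   "pep440-old", "git-describe", "git-describe-long"):
        # render() returns a dict; print only its "version" field per style
        print("%-18s -> %s" % (_style, render(_demo_pieces, _style)["version"]))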