pax_global_header00006660000000000000000000000064141522126010014504gustar00rootroot0000000000000052 comment=d8171e74c535b6117f699e5d7728eab9eb953f4c ROCm-Device-Libs-rocm-5.0.0/000077500000000000000000000000001415221260100153305ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/.clang-format000066400000000000000000000002361415221260100177040ustar00rootroot00000000000000AlwaysBreakAfterReturnType: All BraceWrapping: AfterFunction: true BreakBeforeBraces: Custom IndentWidth: 4 PenaltyBreakBeforeFirstCallParameter: 300 ROCm-Device-Libs-rocm-5.0.0/.gitignore000066400000000000000000000000071415221260100173150ustar00rootroot00000000000000/build ROCm-Device-Libs-rocm-5.0.0/AMDDeviceLibsConfig.cmake.in000066400000000000000000000007011415221260100224160ustar00rootroot00000000000000if(COMMAND include_guard) include_guard(DIRECTORY) else() string(MAKE_C_IDENTIFIER "${CMAKE_CURRENT_LIST_FILE}" _PACKAGE_ID) if(DEFINED ${_GUARD_FILE_${_PACKAGE_ID}}) return() endif() set(${_GUARD_FILE_${_PACKAGE_ID}} On) endif() @AMD_DEVICE_LIBS_PREFIX_CODE@ @AMD_DEVICE_LIBS_TARGET_CODE@ set_property(GLOBAL PROPERTY AMD_DEVICE_LIBS "@AMDGCN_LIB_LIST@") # List of exported target names. set(AMD_DEVICE_LIBS_TARGETS "@AMDGCN_LIB_LIST@") ROCm-Device-Libs-rocm-5.0.0/CMakeLists.txt000066400000000000000000000114171415221260100200740ustar00rootroot00000000000000##===-------------------------------------------------------------------------- ## ROCm Device Libraries ## ## This file is distributed under the University of Illinois Open Source ## License. See LICENSE.TXT for details. 
##===-------------------------------------------------------------------------- cmake_minimum_required(VERSION 3.13.4) project(ROCm-Device-Libs VERSION "1.0.0") cmake_policy(SET CMP0011 NEW) if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) include(CMakePackageConfigHelpers) include(GNUInstallDirs) find_package(ROCM) if (ROCM_FOUND) include(ROCMSetupVersion) rocm_setup_version(VERSION "${PROJECT_VERSION}") endif() endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # Optionally, build Device Libs with ccache. set(ROCM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build") if (ROCM_CCACHE_BUILD) find_program(CCACHE_PROGRAM ccache) if (CCACHE_PROGRAM) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PROGRAM}) else() message(WARNING "Unable to find ccache. Falling back to real compiler") endif() # if (CCACHE_PROGRAM) endif() # if (ROCM_CCACHE_BUILD) if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) find_package(LLVM REQUIRED) find_package(Clang HINTS ${LLVM_DIR}/../clang) list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR}) if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/dist CACHE INTERNAL "Prefix prepended to install directories") endif() set(ROCM_DEVICELIB_STANDALONE_BUILD ON) endif(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) if (NOT DEFINED AMDGPU_TARGET_TRIPLE) set(AMDGPU_TARGET_TRIPLE "amdgcn-amd-amdhsa") endif() if (NOT PREPARE_BUILTINS) add_subdirectory(utils/prepare-builtins) set (PREPARE_BUILTINS $) endif() include(OCL) set(AMDGCN_LIB_LIST) set(AMDGCN_DEP_LIST) add_subdirectory(irif) add_subdirectory(oclc) add_subdirectory(ocml) add_subdirectory(ockl) add_subdirectory(opencl) add_subdirectory(hip) add_subdirectory(asanrtl) enable_testing() add_subdirectory(test/constant_folding) include(Packages) if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) ## CPack standard variables set ( CPACK_PACKAGE_NAME "rocm-device-libs" ) set ( CPACK_PACKAGE_VERSION_MAJOR 
"${PROJECT_VERSION_MAJOR}" ) set ( CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}" ) set ( CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}" ) set ( CPACK_PACKAGE_VERSION "${PROJECT_VERSION}" ) set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." ) set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "Radeon Open Compute - device libraries" ) set ( CPACK_PACKAGE_DESCRIPTION "This package includes LLVM bitcode libraries." ) set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCm-Device-Libs" ) set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Default packaging generators." ) ## ROCM version updates as per naming convention set ( ROCM_VERSION_FOR_PACKAGE "99999" ) if( DEFINED ENV{ROCM_LIBPATCH_VERSION} ) set ( ROCM_VERSION_FOR_PACKAGE $ENV{ROCM_LIBPATCH_VERSION} ) endif() ## Debian package values set ( CPACK_DEBIAN_PACKAGE_MAINTAINER "ROCm Compiler Support " ) set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" ) if( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) endif() ## RPM package variables set ( CPACK_RPM_PACKAGE_RELEASE "local" ) if( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} ) set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} ) endif() ## get distro for RPM package using dist message("device-libs CPACK_RPM_PACKAGE_RELEASE now is ${CPACK_RPM_PACKAGE_RELEASE}") execute_process( COMMAND rpm --eval %{?dist} RESULT_VARIABLE _result_var OUTPUT_VARIABLE _output_var OUTPUT_STRIP_TRAILING_WHITESPACE ) if( _result_var EQUAL "0" AND NOT _output_var STREQUAL "" ) string (APPEND CPACK_RPM_PACKAGE_RELEASE ${_output_var}) endif() # set package name as per standard set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.${ROCM_VERSION_FOR_PACKAGE}" ) set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" ) set ( CPACK_RPM_PACKAGE_REQUIRES "rocm-core" ) set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" ) set ( CPACK_DEBIAN_PACKAGE_DEPENDS 
"rocm-core" ) # Remove dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON not given to cmake if(NOT ROCM_DEP_ROCMCORE) string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_PACKAGE_REQUIRES ${CPACK_RPM_PACKAGE_REQUIRES}) string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PACKAGE_DEPENDS ${CPACK_DEBIAN_PACKAGE_DEPENDS}) endif() include( CPack ) endif() ROCm-Device-Libs-rocm-5.0.0/LICENSE000066400000000000000000000036401415221260100163400ustar00rootroot00000000000000============================================================================== ROCm-Device-Libs Release License ============================================================================== University of Illinois/NCSA Open Source License Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved. Developed by: AMD Research and AMD HSA Software Development Advanced Micro Devices, Inc. www.amd.com Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. * Neither the names of the LLVM Team, University of Illinois at Urbana-Champaign, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. ROCm-Device-Libs-rocm-5.0.0/README.md000066400000000000000000000113461415221260100166140ustar00rootroot00000000000000## OVERVIEW ROCm Device libraries. This repository contains the sources and CMake build system for a set of AMD specific device-side language runtime libraries. Specifically: | **Name** | **Comments** | **Dependencies** | | --- | --- | --- | | oclc* | Open Compute library controls ([documentation](doc/OCML.md#controls)) | | | ocml | Open Compute Math library ([documentation](doc/OCML.md)) | oclc* | | ockl | Open Compute Kernel library ([documentation](doc/OCKL.md)) | oclc* | | opencl | OpenCL built-in library | ocml, ockl, oclc* | | hip | HIP built-in library | ocml, ockl, oclc* | | hc | Heterogeneous Compute built-in library | ocml, ockl, oclc* | Refer to [LICENSE.TXT](LICENSE.TXT) for license information. ## BUILDING The build requires clang and several llvm development tools. This can be built using the amd-stg-open branch of the RadeonOpenCompute modified llvm-project repository, but the upstream llvm-project should also work. There are two different methods to build the device libraries; as a standalone project or as an llvm external subproject. For a standalone build, this will find a preexisting clang and llvm tools using the standard cmake search mechanisms. 
If you wish to use a specific build, you can specify this with the CMAKE_PREFIX_PATH variable: git clone https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git -b amd-stg-open and from its top level run the following commands: mkdir -p build cd build export LLVM_BUILD=... (path to LLVM build directory created previously) cmake -DCMAKE_PREFIX_PATH=$LLVM_BUILD .. make To build as an llvm external project: LLVM_PROJECT_ROOT=llvm-project-rocm git clone https://github.com/RadeonOpenCompute/llvm-project.git -b amd-stg-open ${LLVM_PROJECT_ROOT} cd ${LLVM_PROJECT_ROOT} mkdir -p build cd build cmake ${LLVM_PROJECT_ROOT}/llvm -DCMAKE_BUILD_TYPE=Release \ -DLLVM_ENABLE_PROJECTS="clang;lld" \ -DLLVM_EXTERNAL_PROJECTS="device-libs" \ -DLLVM_EXTERNAL_DEVICE_LIBS_SOURCE_DIR=/path/to/ROCm-Device-Libs Testing requires the amdhsacod utility from ROCm Runtime. To install artifacts: make install To create packages for the library: make package ## USING BITCODE LIBRARIES The ROCm language compilers and runtimes automatically link the required bitcode files invoked during the process of creating a code object. clang will search for these libraries by default when targeting amdhsa, in the default ROCm install location. To specify a specific set of libraries, the --rocm-path argument can point to the root directory where the bitcode libraries are installed, which is the recommended way to link the libraries. $LLVM_BUILD/bin/clang -x cl -Xclang -finclude-default-header \ -target amdgcn-amd-amdhsa -mcpu=gfx900 \ --rocm-path=/srv/git/ROCm-Device-Libs/build/dist These can be manually linked, but is generally not recommended. The set of libraries linked should be in sync with the corresponding compiler flags and target options. 
The default library linking can be disabled with -nogpulib, and a manual linking invocation might look like as follows: $LLVM_BUILD/bin/clang -x cl -Xclang -finclude-default-header \ -nogpulib -target amdgcn-amd-amdhsa -mcpu=gfx900 \ -Xclang -mlink-bitcode-file -Xclang /srv/git/ROCm-Device-Libs/build/dist/amdgcn/bitcode/opencl/opencl.bc \ -Xclang -mlink-bitcode-file -Xclang /srv/git/ROCm-Device-Libs/build/dist/amdgcn/bitcode/ocml/ocml.bc \ -Xclang -mlink-bitcode-file -Xclang /srv/git/ROCm-Device-Libs/build/dist/amdgcn/bitcode/ockl/ockl.bc \ -Xclang -mlink-bitcode-file -Xclang /srv/git/ROCm-Device-Libs/build/dist/amdgcn/bitcode/oclc/oclc_correctly_rounded_sqrt_off.bc \ -Xclang -mlink-bitcode-file -Xclang /srv/git/ROCm-Device-Libs/build/dist/amdgcn/bitcode/oclc/oclc_daz_opt_off.bc \ -Xclang -mlink-bitcode-file -Xclang /srv/git/ROCm-Device-Libs/build/dist/amdgcn/bitcode/oclc/oclc_finite_only_off.bc \ -Xclang -mlink-bitcode-file -Xclang /srv/git/ROCm-Device-Libs/build/dist/amdgcn/bitcode/oclc/oclc_unsafe_math_off.bc \ -Xclang -mlink-bitcode-file -Xclang /srv/git/ROCm-Device-Libs/build/dist/amdgcn/bitcode/oclc/oclc_wavefrontsize64_on.bc \ -Xclang -mlink-bitcode-file -Xclang /srv/git/ROCm-Device-Libs/build/dist/amdgcn/bitcode/oclc/oclc_isa_version_900.bc \ test.cl -o test.so ### USING FROM CMAKE The bitcode libraries are exported as CMake targets, organized in a CMake package. You can depend on this package using `find_package(AMDDeviceLibs REQUIRED CONFIG)` after ensuring the `CMAKE_PREFIX_PATH` includes either the build directory or install prefix of the bitcode libraries. The package defines a variable `AMD_DEVICE_LIBS_TARGETS` containing a list of the exported CMake targets. 
ROCm-Device-Libs-rocm-5.0.0/asanrtl/000077500000000000000000000000001415221260100167745ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/asanrtl/CMakeLists.txt000066400000000000000000000013161415221260100215350ustar00rootroot00000000000000##===-------------------------------------------------------------------------- ## ROCm Device Libraries ## ## This file is distributed under the University of Illinois Open Source ## License. See LICENSE.TXT for details. ##===-------------------------------------------------------------------------- file(GLOB sources ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cl ) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../irif/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../oclc/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../ockl/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) opencl_bc_lib(NAME asanrtl SOURCES ${sources}) ROCm-Device-Libs-rocm-5.0.0/asanrtl/inc/000077500000000000000000000000001415221260100175455ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/asanrtl/inc/asan_util.h000066400000000000000000000035241415221260100217010ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #pragma once #include "ockl.h" typedef ulong uptr; typedef unsigned char u8; typedef signed char s8; typedef unsigned short u16; typedef short s16; typedef unsigned long u64; #define ASAN_SHADOW 3 #define SHADOW_GRANULARITY (1ULL << ASAN_SHADOW) #define GET_CALLER_PC() (uptr) __builtin_return_address(0) #define WORKGROUP_ID(dim) __builtin_amdgcn_workgroup_id_##dim() #define OPT_NONE __attribute__((optnone)) #define NO_SANITIZE_ADDR __attribute__((no_sanitize("address"))) #define REPORT_IMPL(caller_pc, addr, is_write, size, no_abort) \ uptr read = is_write; \ if (no_abort) \ read |= 0xFFFFFFFF00000000; \ \ __ockl_sanitizer_report(addr, caller_pc, WORKGROUP_ID(x), WORKGROUP_ID(y), \ WORKGROUP_ID(z), __ockl_get_local_linear_id(), \ read, size); NO_SANITIZE_ADDR static bool is_aligned_by_granularity(uptr addr) { return (addr & (SHADOW_GRANULARITY - 1)) == 0; } // round up size to the nearest multiple of boundary. NO_SANITIZE_ADDR static uptr round_upto(uptr size, uptr boundary) { return (size + boundary - 1) & ~(boundary - 1); } // round down size to the nearest multiple of boundary. NO_SANITIZE_ADDR static uptr round_downto(uptr size, uptr boundary) { return size & ~(boundary - 1); } ROCm-Device-Libs-rocm-5.0.0/asanrtl/inc/globals.h000066400000000000000000000030061415221260100213400ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #pragma once #include "asan_util.h" // The strucutures semantics and layout must match the host instrumented // global variable as defined in // llvm-project/compiler-rt/lib/asan/asan_interface_internal.h // This structure used to describe the source location of a place // where global was defined. struct global_source_location { const char *filename; int line_no; int column_no; }; // This structure describes an instrumented global variable. struct device_global { uptr beg; // The address of the global. uptr size; // The original size of the global. uptr size_with_redzone; // The size with the redzone. const char *name; // Name as a C string. const char *module_name; // Module name as a C string. This pointer is a // unique identifier of a module. uptr has_dynamic_init; // Non-zero if the global has dynamic initializer. struct global_source_location *location; // Source location of a global, // or NULL if it is unknown. uptr odr_indicator; // The address of the ODR indicator symbol. }; static const __constant s8 kAsanGlobalRedzoneMagic = 0xf9; ROCm-Device-Libs-rocm-5.0.0/asanrtl/inc/shadow_mapping.h000066400000000000000000000021371415221260100227210ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #pragma once #include "asan_util.h" //offset from llvm/compiler-rt/lib/asan/asan_mapping.h static const u64 kh_Linux64bit_ShadowOffset = 0x7FFFFFFF & (~0xFFFULL << ASAN_SHADOW); #define MEM_TO_SHADOW(mem_addr) (((mem_addr) >> ASAN_SHADOW) + kh_Linux64bit_ShadowOffset) // Addresses are atleast SHADOW_GRANULARITY aligned. // True, when given byte is accessible false otherwise. 
NO_SANITIZE_ADDR static bool is_address_poisoned(uptr addr) { uptr shadow_addr = MEM_TO_SHADOW(addr); s8 shadow_value = *(__global s8 *)shadow_addr; if (shadow_value) { //compute index of the given address within 8-byte range return (s8)(addr & (SHADOW_GRANULARITY - 1)) >= shadow_value; } return false; } NO_SANITIZE_ADDR uptr __asan_region_is_poisoned(uptr beg, uptr size); ROCm-Device-Libs-rocm-5.0.0/asanrtl/src/000077500000000000000000000000001415221260100175635ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/asanrtl/src/globals.cl000066400000000000000000000052161415221260100215320ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "asan_util.h" #include "globals.h" #include "shadow_mapping.h" // fill shadow bytes of range [aligned_beg, aligned_beg+aligned_size) // with value. NO_SANITIZE_ADDR static void fill_shadowof(uptr aligned_beg, uptr aligned_size, s8 value) { u64 nbytes = aligned_size / SHADOW_GRANULARITY; __global s8 *shadow_beg = (__global s8*)MEM_TO_SHADOW(aligned_beg); for (; nbytes; nbytes--, shadow_beg++) *shadow_beg = value; } // poison the redzones around the global only if global is shadow granularity aligned. NO_SANITIZE_ADDR static void poison_redzones(__global const struct device_global *g) { if (!is_aligned_by_granularity(g->beg)) return; if (!is_aligned_by_granularity(g->size_with_redzone)) return; uptr aligned_size = round_upto(g->size, SHADOW_GRANULARITY); uptr redzone_beg = g->beg + aligned_size; uptr redzone_size = g->size_with_redzone - aligned_size; fill_shadowof(redzone_beg, redzone_size, kAsanGlobalRedzoneMagic); // poison partial redzones if any. 
// since SHADOW_GRANULARITY is 8 bytes we require only one shadow byte // to keep partially addressable bytes information. if (g->size != aligned_size) { uptr aligned_addr = g->beg + round_downto(g->size, SHADOW_GRANULARITY); __global s8 *shadow_addr = (__global s8*)MEM_TO_SHADOW(aligned_addr); *shadow_addr = (s8) (g->size % SHADOW_GRANULARITY); } } // This function is called by one-workitem constructor kernel. NO_SANITIZE_ADDR void __asan_register_globals(uptr globals, uptr n) { __global struct device_global *dglobals = (__global struct device_global*) globals; for (uptr i = 0; i < n; i++) poison_redzones(&dglobals[i]); } // unpoison global and redzones around it only if global is shadow granularity aligned. NO_SANITIZE_ADDR static void unpoison_global(__global const struct device_global *g) { if (!is_aligned_by_granularity(g->beg)) return; if (!is_aligned_by_granularity(g->size_with_redzone)) return; fill_shadowof(g->beg, g->size_with_redzone, 0); } // This function is called by one-workitem destructor kernel. NO_SANITIZE_ADDR void __asan_unregister_globals(uptr globals, uptr n) { __global struct device_global* dglobals = (__global struct device_global*) globals; for (uptr i = 0; i < n; i++) unpoison_global(&dglobals[i]); } ROCm-Device-Libs-rocm-5.0.0/asanrtl/src/memintrinsics.cl000066400000000000000000000032671415221260100227770ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "asan_util.h" #include "shadow_mapping.h" OPT_NONE NO_SANITIZE_ADDR static void check_memory_range_accessible(void* dest, const void* src, uptr size, uptr pc) { if (size == 0) return; uptr invalid_addr = 0; uptr src_addr = (uptr)src; invalid_addr = __asan_region_is_poisoned(src_addr, size); if (invalid_addr) { REPORT_IMPL(pc, invalid_addr, false, size, false) } uptr dest_addr = (uptr)dest; invalid_addr = __asan_region_is_poisoned(dest_addr, size); if (invalid_addr) { REPORT_IMPL(pc, invalid_addr, true, size, false) } } OPT_NONE NO_SANITIZE_ADDR void* __asan_memcpy(void* to, const void* from, uptr size) { uptr pc = GET_CALLER_PC(); check_memory_range_accessible(to, from, size, pc); return __builtin_memcpy(to, from, size); } OPT_NONE NO_SANITIZE_ADDR void* __asan_memmove(void* to, const void* from, uptr size) { uptr pc = GET_CALLER_PC(); check_memory_range_accessible(to, from, size, pc); return __builtin_memmove(to, from, size); } OPT_NONE NO_SANITIZE_ADDR void* __asan_memset(void* s, int c, uptr n) { uptr pc = GET_CALLER_PC(); uptr src_addr = (uptr)s; uptr invalid_addr = 0; invalid_addr = __asan_region_is_poisoned(src_addr, n); if (invalid_addr) { REPORT_IMPL(pc, invalid_addr, true, n, false) } return __builtin_memset(s, c, n); } ROCm-Device-Libs-rocm-5.0.0/asanrtl/src/preserve.cl000066400000000000000000000131221415221260100217350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ typedef ulong uptr; extern void __asan_report_load1 (uptr addr); extern void __asan_report_load1_noabort (uptr addr); extern void __asan_report_load2 (uptr addr); extern void __asan_report_load2_noabort (uptr addr); extern void __asan_report_load4 (uptr addr); extern void __asan_report_load4_noabort (uptr addr); extern void __asan_report_load8 (uptr addr); extern void __asan_report_load8_noabort (uptr addr); extern void __asan_report_load16 (uptr addr); extern void __asan_report_load16_noabort (uptr addr); extern void __asan_report_store1 (uptr addr); extern void __asan_report_store1_noabort (uptr addr); extern void __asan_report_store2 (uptr addr); extern void __asan_report_store2_noabort (uptr addr); extern void __asan_report_store4 (uptr addr); extern void __asan_report_store4_noabort (uptr addr); extern void __asan_report_store8 (uptr addr); extern void __asan_report_store8_noabort (uptr addr); extern void __asan_report_store16 (uptr addr); extern void __asan_report_store16_noabort (uptr addr); extern void __asan_report_store_n (uptr addr, uptr size); extern void __asan_report_store_n_noabort (uptr addr, uptr size); extern void __asan_report_load_n (uptr addr, uptr size); extern void __asan_report_load_n_noabort (uptr addr, uptr size); extern void __asan_load1 (uptr addr); extern void __asan_load1_noabort (uptr addr); extern void __asan_load2 (uptr addr); extern void __asan_load2_noabort (uptr addr); extern void __asan_load4 (uptr addr); extern void __asan_load4_noabort (uptr addr); extern void __asan_load8 (uptr addr); extern void __asan_load8_noabort (uptr addr); extern void __asan_load16 (uptr addr); extern void __asan_load16_noabort (uptr addr); extern void __asan_store1 (uptr addr); extern void __asan_store1_noabort (uptr addr); extern void __asan_store2 (uptr addr); extern void __asan_store2_noabort (uptr addr); extern void __asan_store4 (uptr addr); extern void 
__asan_store4_noabort (uptr addr); extern void __asan_store8 (uptr addr); extern void __asan_store8_noabort (uptr addr); extern void __asan_store16 (uptr addr); extern void __asan_store16_noabort (uptr addr); extern void __asan_store_n (uptr addr, uptr size); extern void __asan_store_n_noabort (uptr addr, uptr size); extern void __asan_load_n (uptr addr, uptr size); extern void __asan_load_n_noabort (uptr addr, uptr size); extern uptr __asan_region_is_poisoned(uptr beg, uptr size); extern void* __asan_memmove(void* to, void* from, uptr size); extern void* __asan_memcpy(void* to, void* from, uptr size); extern void* __asan_memset(void* s, int c, uptr n); extern void __asan_handle_no_return(void); extern void __sanitizer_ptr_cmp(uptr a, uptr b); extern void __sanitizer_ptr_sub(uptr a, uptr b); extern void __asan_before_dynamic_init(uptr addr); extern void __asan_after_dynamic_init(void); extern void __asan_register_globals(uptr start, uptr n); extern void __asan_unregister_globals(uptr start, uptr n); extern void __asan_register_image_globals(uptr flag); extern void __asan_unregister_image_globals(uptr flag); extern void __asan_register_elf_globals(uptr flag, uptr start, uptr stop); extern void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop); extern void __asan_init(void); extern void __asan_version_mismatch_check_v8(void); void __amdgpu_device_library_preserve_asan_functions(void) { __asan_report_load1(0); __asan_report_load1_noabort(0); __asan_report_load2(0); __asan_report_load2_noabort(0); __asan_report_load4(0); __asan_report_load4_noabort(0); __asan_report_load8(0); __asan_report_load8_noabort(0); __asan_report_load16(0); __asan_report_load16_noabort(0); __asan_report_store1(0); __asan_report_store1_noabort(0); __asan_report_store2(0); __asan_report_store2_noabort(0); __asan_report_store4(0); __asan_report_store4_noabort(0); __asan_report_store8(0); __asan_report_store8_noabort(0); __asan_report_store16(0); __asan_report_store16_noabort(0); 
__asan_report_store_n(0, 0); __asan_report_store_n_noabort(0, 0); __asan_report_load_n(0, 0); __asan_report_load_n_noabort(0, 0); __asan_load1(0); __asan_load1_noabort(0); __asan_load2(0); __asan_load2_noabort(0); __asan_load4(0); __asan_load4_noabort(0); __asan_load8(0); __asan_load8_noabort(0); __asan_load16(0); __asan_load16_noabort(0); __asan_store1(0); __asan_store1_noabort(0); __asan_store2(0); __asan_store2_noabort(0); __asan_store4(0); __asan_store4_noabort(0); __asan_store8(0); __asan_store8_noabort(0); __asan_store16(0); __asan_store16_noabort(0); __asan_store_n(0, 0); __asan_store_n_noabort(0, 0); __asan_load_n(0, 0); __asan_load_n_noabort(0, 0); __asan_region_is_poisoned(0, 0); (void)__asan_memmove((void*)0, (void*)0, 0); (void)__asan_memcpy((void*)0, (void*)0, 0); (void)__asan_memset((void*)0, 0, 0); __asan_handle_no_return(); __sanitizer_ptr_cmp(0, 0); __sanitizer_ptr_sub(0, 0); __asan_before_dynamic_init(0); __asan_after_dynamic_init(); __asan_register_globals(0, 0); __asan_unregister_globals(0, 0); __asan_register_image_globals(0); __asan_unregister_image_globals(0); __asan_register_elf_globals(0, 0, 0); __asan_unregister_elf_globals(0, 0, 0); __asan_init(); __asan_version_mismatch_check_v8(); } ROCm-Device-Libs-rocm-5.0.0/asanrtl/src/report.cl000066400000000000000000000111371415221260100214210ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "asan_util.h" #include "shadow_mapping.h" #define ASAN_REPORT_ERROR(type, size, is_write) \ OPT_NONE \ void __asan_report_ ## type ## size(uptr addr) { \ REPORT_IMPL(GET_CALLER_PC(), addr, is_write, size, false) \ } \ OPT_NONE \ void __asan_report_ ## type ## size ## _noabort(uptr addr) { \ REPORT_IMPL(GET_CALLER_PC(), addr, is_write, size, true) \ } \ ASAN_REPORT_ERROR(load, 1, 0) ASAN_REPORT_ERROR(load, 2, 0) ASAN_REPORT_ERROR(load, 4, 0) ASAN_REPORT_ERROR(load, 8, 0) ASAN_REPORT_ERROR(load, 16,0) ASAN_REPORT_ERROR(store, 1, 1) ASAN_REPORT_ERROR(store, 2, 1) ASAN_REPORT_ERROR(store, 4, 1) ASAN_REPORT_ERROR(store, 8, 1) ASAN_REPORT_ERROR(store, 16,1) #define ASAN_REPORT_ERROR_N(type, is_write) \ OPT_NONE \ void __asan_report_ ## type ## _n(uptr addr, uptr size) { \ REPORT_IMPL(GET_CALLER_PC(), addr, is_write, size, false) \ } \ OPT_NONE \ void __asan_report_ ## type ## _n_noabort(uptr addr, uptr size) { \ REPORT_IMPL(GET_CALLER_PC(), addr, is_write, size, true) \ } \ ASAN_REPORT_ERROR_N(store,1) ASAN_REPORT_ERROR_N(load,0) NO_SANITIZE_ADDR static bool is_invalid_access(uptr addr, uptr size) { uptr shadow_addr = MEM_TO_SHADOW(addr); if (size <= SHADOW_GRANULARITY) { s8 shadow_value = *(__global s8*) shadow_addr; return shadow_value != 0 && ((s8)((addr & (SHADOW_GRANULARITY-1)) + size - 1) >= shadow_value); } else { s16 shadow_value = *(__global s16*) shadow_addr; return shadow_value != 0; } } #define ASAN_ERROR(type, size, is_write) \ OPT_NONE NO_SANITIZE_ADDR \ void __asan_ ## type ## size(uptr addr) { \ uptr caller_pc = GET_CALLER_PC(); \ if (is_invalid_access(addr, size)) { \ REPORT_IMPL(caller_pc, addr, is_write, size, false) \ } \ } \ OPT_NONE NO_SANITIZE_ADDR \ void __asan_ ## type ## size ## _noabort(uptr addr) { \ uptr caller_pc = GET_CALLER_PC(); \ if (is_invalid_access(addr, size)) { \ REPORT_IMPL(caller_pc, addr, is_write, size, true) \ } \ } \ ASAN_ERROR(load, 1, 0) 
ASAN_ERROR(load, 2, 0) ASAN_ERROR(load, 4, 0) ASAN_ERROR(load, 8, 0) ASAN_ERROR(load, 16,0) ASAN_ERROR(store, 1, 1) ASAN_ERROR(store, 2, 1) ASAN_ERROR(store, 4, 1) ASAN_ERROR(store, 8, 1) ASAN_ERROR(store, 16,1) #define ASAN_ERROR_N(type, is_write) \ OPT_NONE NO_SANITIZE_ADDR \ void __asan_ ## type ## _n(uptr addr, uptr size) { \ uptr caller_pc = GET_CALLER_PC(); \ if (__asan_region_is_poisoned(addr, size)) { \ REPORT_IMPL(caller_pc, addr, is_write, size, false) \ } \ } \ OPT_NONE NO_SANITIZE_ADDR \ void __asan_ ## type ## _n_noabort(uptr addr, uptr size) { \ uptr caller_pc = GET_CALLER_PC(); \ if (__asan_region_is_poisoned(addr, size)) { \ REPORT_IMPL(caller_pc, addr, is_write, size, true) \ } \ } \ ASAN_ERROR_N(store, 1) ASAN_ERROR_N(load, 0) ROCm-Device-Libs-rocm-5.0.0/asanrtl/src/shadow_mapping.cl000066400000000000000000000030051415221260100231010ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "shadow_mapping.h" NO_SANITIZE_ADDR static uptr range_check(uptr beg, uptr end) { uptr aligned_beg = round_downto(beg, SHADOW_GRANULARITY); uptr aligned_end = round_downto(end, SHADOW_GRANULARITY); uptr shadow_beg = MEM_TO_SHADOW(aligned_beg); uptr shadow_end = MEM_TO_SHADOW(aligned_end); uptr nbytes = (shadow_end - shadow_beg)+1; uptr shadow_byte_count = 0; while (shadow_beg <= shadow_end) { s8 shadow_value = *(__global s8 *)shadow_beg; if (shadow_value) break; shadow_byte_count++; shadow_beg++; } if (shadow_byte_count == nbytes) return 0; uptr start_addr = round_downto(beg + (shadow_byte_count*SHADOW_GRANULARITY), SHADOW_GRANULARITY); return start_addr; } //check all application bytes in [beg,beg+size) range are accessible NO_SANITIZE_ADDR uptr __asan_region_is_poisoned(uptr beg, uptr size) { uptr end = beg + size - 1; uptr start_addr = range_check(beg, end); if (start_addr != 0) { // loop through the range to find accessible address. for (uptr addr = start_addr; addr <= end; ++addr) { if (is_address_poisoned(addr)) return addr; } } return 0; } ROCm-Device-Libs-rocm-5.0.0/asanrtl/src/stubs.cl000066400000000000000000000015601415221260100212450ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ typedef ulong uptr; void __asan_handle_no_return(void) {} void __sanitizer_ptr_cmp(uptr a, uptr b) {} void __sanitizer_ptr_sub(uptr a, uptr b) {} void __asan_before_dynamic_init(uptr addr) {} void __asan_after_dynamic_init(void) {} void __asan_register_image_globals(uptr flag) {} void __asan_unregister_image_globals(uptr flag) {} void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {} void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {} void __asan_init(void) {} void __asan_version_mismatch_check_v8(void) {} ROCm-Device-Libs-rocm-5.0.0/cmake/000077500000000000000000000000001415221260100164105ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/cmake/OCL.cmake000066400000000000000000000170111415221260100200270ustar00rootroot00000000000000##===-------------------------------------------------------------------------- ## ROCm Device Libraries ## ## This file is distributed under the University of Illinois Open Source ## License. See LICENSE.TXT for details. ##===-------------------------------------------------------------------------- # Required because we need to generate response files on windows for long # command-lines, but the only way to do this as part of the dependency graph is # configure_file and we are included from multiple places. To get around this # we `file(WRITE)` a file with an @variable reference and `configure_file` it. 
cmake_policy(SET CMP0053 OLD)

if (WIN32)
  set(EXE_SUFFIX ".exe")
else()
  set(EXE_SUFFIX)
endif()

# -Wno-error=atomic-alignment was added to workaround build problems due to
# potential mis-aligned atomic ops detected by clang
set(CLANG_OCL_FLAGS -fcolor-diagnostics -Werror -Wno-error=atomic-alignment -x cl -Xclang
  -cl-std=CL2.0 -target "${AMDGPU_TARGET_TRIPLE}" -fvisibility=protected -fomit-frame-pointer
  -Xclang -finclude-default-header -nogpulib -cl-no-stdinc "${CLANG_OPTIONS_APPEND}")

# For compatibility with the MSVC headers we use a 16-bit (short) wchar. Users
# linking against us must also use a short wchar.
if (WIN32)
  set(CLANG_OCL_FLAGS ${CLANG_OCL_FLAGS} -fshort-wchar)
endif()

set (BC_EXT .bc)
set (LIB_SUFFIX ".lib${BC_EXT}")
set (STRIP_SUFFIX ".strip${BC_EXT}")
set (FINAL_SUFFIX "${BC_EXT}")
set (INSTALL_ROOT_SUFFIX "amdgcn/bitcode")

# Set `inc_options` to contain Clang command-line for include directories for
# current source directory.
macro(set_inc_options)
  get_property(inc_dirs
    DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
    PROPERTY INCLUDE_DIRECTORIES)
  set(inc_options)
  foreach(inc_dir ${inc_dirs})
    list(APPEND inc_options "-I${inc_dir}")
  endforeach()
endmacro()

# Build one bitcode library as a custom target named NAME.
#
# called with NAME: library name
#             SOURCES: .cl and .ll source files
#             INTERNAL_LINK_LIBS: Extra .lls to be linked and internalized into final library
#
# Every .cl source is compiled to bitcode with clang; the results (plus any
# .ll sources) are linked with llvm-link, stripped with opt and post-processed
# by ${PREPARE_BUILTINS}. The final file is written to
# ${PROJECT_BINARY_DIR}/${INSTALL_ROOT_SUFFIX} and installed under
# ${INSTALL_ROOT_SUFFIX}. NAME is appended to AMDGCN_LIB_LIST and
# AMDGCN_DEP_LIST in the parent scope and to the AMD_DEVICE_LIBS global
# property.
macro(opencl_bc_lib)
  set(parse_options)
  set(one_value_args NAME)
  set(multi_value_args SOURCES INTERNAL_LINK_LIBS)

  cmake_parse_arguments(OPENCL_BC_LIB "${parse_options}" "${one_value_args}"
                                      "${multi_value_args}" ${ARGN})

  set(name ${OPENCL_BC_LIB_NAME})
  set(sources ${OPENCL_BC_LIB_SOURCES})
  set(internal_link_libs ${OPENCL_BC_LIB_INTERNAL_LINK_LIBS})

  get_target_property(irif_lib_output irif OUTPUT_NAME)

  # Mirror the install layout structure.
  set(OUTPUT_DIR ${PROJECT_BINARY_DIR}/${INSTALL_ROOT_SUFFIX})
  file(MAKE_DIRECTORY ${OUTPUT_DIR})

  set(OUT_NAME ${name})
  set(OUTPUT_BC_LIB ${OUTPUT_DIR}/${name}${FINAL_SUFFIX})

  set(clean_files)

  list(APPEND AMDGCN_LIB_LIST ${name})
  set(AMDGCN_LIB_LIST ${AMDGCN_LIB_LIST} PARENT_SCOPE)

  list(APPEND AMDGCN_DEP_LIST ${name})
  set(AMDGCN_DEP_LIST ${AMDGCN_DEP_LIST} PARENT_SCOPE)

  set_inc_options()
  set(deps)
  foreach(file ${OPENCL_BC_LIB_SOURCES})
    get_filename_component(fname_we "${file}" NAME_WE)
    get_filename_component(fext "${file}" EXT)
    if (fext STREQUAL ".cl")
      set(output "${CMAKE_CURRENT_BINARY_DIR}/${fname_we}${BC_EXT}")
      # NOTE(review): the tool after COMMAND was reduced to a bare `$` in this
      # copy of the file; $<TARGET_FILE:clang> is restored on the basis of the
      # tool targets listed in add_dependencies() below -- confirm against
      # the pristine source.
      add_custom_command(OUTPUT "${output}"
        COMMAND $<TARGET_FILE:clang> ${inc_options} ${CLANG_OCL_FLAGS}
          -emit-llvm -Xclang -mlink-builtin-bitcode -Xclang "${irif_lib_output}"
          -c "${file}" -o "${output}"
        DEPENDS "${file}" "${irif_lib_output}" "${CLANG}"
        # FIXME: Currently IMPLICIT_DEPENDS is only supported for GNU Makefile,
        # so as an overly-conservatively workaround to cover all generators
        # we just assume all .cl sources require irif.h. If all the generators
        # we care about begin to support IMPLICIT_DEPENDS we won't need this.
        "${CMAKE_CURRENT_SOURCE_DIR}/../irif/inc/irif.h"
        IMPLICIT_DEPENDS C "${file}")
      list(APPEND deps "${output}")
      list(APPEND clean_files "${output}")
    endif()
    if (fext STREQUAL ".ll")
      list(APPEND deps "${file}")
    endif()
  endforeach()

  # The llvm-link command-lines can get long enough to trigger strange behavior
  # on Windows. LLVM tools support "response files" which can work around this:
  # http://llvm.org/docs/CommandLine.html#response-files
  set(RESPONSE_COMMAND_LINE)
  foreach(dep ${deps})
    set(RESPONSE_COMMAND_LINE "${RESPONSE_COMMAND_LINE} ${dep}")
  endforeach()
  file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/response.in" "@RESPONSE_COMMAND_LINE@")
  configure_file("${CMAKE_CURRENT_BINARY_DIR}/response.in"
    "${CMAKE_CURRENT_BINARY_DIR}/${OUT_NAME}_response" @ONLY)

  add_custom_command(OUTPUT ${OUTPUT_BC_LIB}
    # Link regular library dependencies
    COMMAND $<TARGET_FILE:llvm-link>
      -o "${OUT_NAME}.link0${LIB_SUFFIX}" "@${OUT_NAME}_response"
    # Extra link step with internalize
    COMMAND $<TARGET_FILE:llvm-link> -internalize -only-needed "${name}.link0${LIB_SUFFIX}"
      -o "${OUT_NAME}${LIB_SUFFIX}" ${internal_link_libs}
    COMMAND $<TARGET_FILE:opt> -strip
      -o "${OUT_NAME}${STRIP_SUFFIX}" "${OUT_NAME}${LIB_SUFFIX}"
    COMMAND "${PREPARE_BUILTINS}"
      -o ${OUTPUT_BC_LIB} "${OUT_NAME}${STRIP_SUFFIX}"
    DEPENDS "${deps}" "${CMAKE_CURRENT_BINARY_DIR}/${OUT_NAME}_response" "${PREPARE_BUILTINS}" ${internal_link_libs})

  add_custom_target("${name}" ALL
    DEPENDS "${OUTPUT_DIR}/${OUT_NAME}${FINAL_SUFFIX}"
    SOURCES ${OPENCL_BC_LIB_SOURCES})
  set_target_properties(${name} PROPERTIES
    OUTPUT_NAME "${OUTPUT_DIR}/${OUT_NAME}${FINAL_SUFFIX}"
    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
    ARCHIVE_OUTPUT_NAME "${name}"
    PREFIX "" SUFFIX ${FINAL_SUFFIX})

  list(APPEND clean_files
    "${OUT_NAME}${LIB_SUFFIX}" "${OUT_NAME}${STRIP_SUFFIX}")

  set_property(GLOBAL APPEND PROPERTY AMD_DEVICE_LIBS ${name})

  if(NOT ROCM_DEVICELIB_STANDALONE_BUILD)
    add_dependencies("${name}" llvm-link clang opt llvm-objdump)
  endif()

  if (TARGET prepare-builtins)
    add_dependencies("${name}" prepare-builtins)
  endif()
  add_dependencies("${name}" irif)

  set_directory_properties(PROPERTIES
    ADDITIONAL_MAKE_CLEAN_FILES "${clean_files}")

  install(FILES ${OUTPUT_BC_LIB}
    DESTINATION ${INSTALL_ROOT_SUFFIX}
    COMPONENT device-libs)
endmacro()

# Compile ${dir}/${name}.cl into ${CMAKE_CURRENT_BINARY_DIR}/${name}.co for
# -mcpu=fiji as part of the target "${name}_code". Every extra argument is the
# name of a bitcode library target; its OUTPUT_NAME is passed to clang with
# -mlink-bitcode-file and the new target is made to depend on it.
function(clang_opencl_code name dir)
  set(TEST_TGT "${name}_code")
  set(OUT_NAME "${CMAKE_CURRENT_BINARY_DIR}/${name}")
  set(mlink_flags)
  foreach (lib ${ARGN})
    get_target_property(lib_path "${lib}" OUTPUT_NAME)
    list(APPEND mlink_flags
      -Xclang -mlink-bitcode-file
      -Xclang "${lib_path}")
  endforeach()
  set_inc_options()
  add_custom_command(OUTPUT "${OUT_NAME}.co"
    COMMAND "${CLANG}" ${inc_options} ${CLANG_OCL_FLAGS}
      -mcpu=fiji ${mlink_flags} -o "${OUT_NAME}.co" -c "${dir}/${name}.cl"
    DEPENDS "${dir}/${name}.cl")
  add_custom_target("${TEST_TGT}" ALL
    DEPENDS "${OUT_NAME}.co"
    SOURCES "${dir}/${name}.cl")
  set_target_properties(${TEST_TGT} PROPERTIES
    OUTPUT_NAME "${OUT_NAME}.co")
  foreach (lib ${ARGN})
    add_dependencies(${TEST_TGT} ${lib})
  endforeach()
endfunction()

set(OCLC_DEFAULT_LIBS
  oclc_correctly_rounded_sqrt_off
  oclc_daz_opt_off
  oclc_finite_only_off
  oclc_isa_version_803
  oclc_unsafe_math_off)

# Build ${dir}/${name}.cl against the default library set and register a
# CTest case "${name}:llvm-objdump" that disassembles the resulting code
# object.
macro(clang_opencl_test name dir)
  clang_opencl_code(${name} ${dir} hip opencl ocml ockl ${OCLC_DEFAULT_LIBS})
  add_test(
    NAME ${name}:llvm-objdump
    COMMAND $<TARGET_FILE:llvm-objdump> -disassemble -mcpu=fiji "${name}.co"
  )
endmacro()

# Same as clang_opencl_test(), but takes a source file name (optionally with a
# sub-directory) relative to `dir` instead of a bare test name.
macro(clang_opencl_test_file dir fname)
  get_filename_component(name ${fname} NAME_WE)
  get_filename_component(fdir ${fname} DIRECTORY)
  clang_opencl_test(${name} ${dir}/${fdir})
endmacro()
ROCm-Device-Libs-rocm-5.0.0/cmake/Packages.cmake000066400000000000000000000042061415221260100211320ustar00rootroot00000000000000set(PACKAGE_PREFIX lib/cmake/AMDDeviceLibs)

# Generate the build-tree package.
# We know the absolute path to the build tree, so we leave
# AMD_DEVICE_LIBS_PREFIX_CODE blank and include absolute paths in the target
# imports in AMD_DEVICE_LIBS_TARGET_CODE.
# Build-tree package: every imported target points at the absolute path stored
# in the OUTPUT_NAME property of the corresponding build target.
foreach(target IN LISTS AMDGCN_LIB_LIST)
  get_target_property(target_path ${target} OUTPUT_NAME)
  string(APPEND AMD_DEVICE_LIBS_TARGET_CODE "
add_library(${target} STATIC IMPORTED)
set_target_properties(${target} PROPERTIES
  IMPORTED_LOCATION \"${target_path}\")")
endforeach()
configure_file(AMDDeviceLibsConfig.cmake.in
  ${PACKAGE_PREFIX}/AMDDeviceLibsConfig.cmake
  @ONLY)

set(install_path_suffix "amdgcn/bitcode")

# Generate the install-tree package.
# We do not know the absolute path to the install tree until we are installed,
# so we calculate it dynamically in AMD_DEVICE_LIBS_PREFIX_CODE and use
# relative paths in the target imports in AMD_DEVICE_LIBS_TARGET_CODE.
set(AMD_DEVICE_LIBS_PREFIX_CODE "
# Derive absolute install prefix from config file path.
get_filename_component(AMD_DEVICE_LIBS_PREFIX \"\${CMAKE_CURRENT_LIST_FILE}\" PATH)")
# Emit one "go up a directory" step per component of PACKAGE_PREFIX.
string(REPLACE "/" ";" count "${PACKAGE_PREFIX}")
foreach(path_component IN LISTS count)
  string(APPEND AMD_DEVICE_LIBS_PREFIX_CODE "
get_filename_component(AMD_DEVICE_LIBS_PREFIX \"\${AMD_DEVICE_LIBS_PREFIX}\" PATH)")
endforeach()

# Start the target code over; the install-tree imports are expressed relative
# to the prefix computed by the code above.
set(AMD_DEVICE_LIBS_TARGET_CODE)
foreach(target IN LISTS AMDGCN_LIB_LIST)
  get_target_property(target_name ${target} ARCHIVE_OUTPUT_NAME)
  get_target_property(target_prefix ${target} PREFIX)
  get_target_property(target_suffix ${target} SUFFIX)
  string(APPEND AMD_DEVICE_LIBS_TARGET_CODE "
add_library(${target} STATIC IMPORTED)
set_target_properties(${target} PROPERTIES
  IMPORTED_LOCATION \"\${AMD_DEVICE_LIBS_PREFIX}/${install_path_suffix}/${target_prefix}${target_name}${target_suffix}\")")
endforeach()
configure_file(AMDDeviceLibsConfig.cmake.in
  ${CMAKE_CURRENT_BINARY_DIR}/AMDDeviceLibsConfig.cmake.install
  @ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/AMDDeviceLibsConfig.cmake.install
  DESTINATION ${PACKAGE_PREFIX}
  COMPONENT device-libs
  RENAME AMDDeviceLibsConfig.cmake)
ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/000077500000000000000000000000001415221260100170165ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/CMakeLists.txt000066400000000000000000000012271415221260100215600ustar00rootroot00000000000000##===--------------------------------------------------------------------------
##                   ROCm Device Libraries
##
## This file is distributed under the University of Illinois Open Source
## License. See LICENSE.TXT for details.
##===--------------------------------------------------------------------------

file(GLOB cl_sources
  ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cl
)

file(GLOB sources ${cl_sources})

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../ocml/inc)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../ockl/inc)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../irif/inc)

opencl_bc_lib(NAME cuda2gcn SOURCES ${sources})
ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/000077500000000000000000000000001415221260100176055ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/bitsbytes.cl000066400000000000000000000023471415221260100221430ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "ockl.h"
#include "irif.h"

#define ATTR __attribute__((const))

// Reverse the bit order of a 32-bit value.
//-------- T __nv_brev
ATTR int __nv_brev(int x) { return __builtin_bitreverse32(x); }

// Reverse the bit order of a 64-bit value.
// The builtin name was misspelled ("__builitn_..."), which does not name any
// compiler builtin.
//-------- T __nv_brevll
ATTR long __nv_brevll(long x) { return __builtin_bitreverse64(x); }

// Count leading zero bits of a 32-bit value.
//-------- T __nv_clz
ATTR int __nv_clz(int x)
{
    return (int)__ockl_clz_u32((uint)x);
}

// Count leading zero bits of a 64-bit value, computed from its two 32-bit
// halves: when the high half is zero the answer is 32 plus the count of the
// low half.
//-------- T __nv_clzll
ATTR int __nv_clzll(long x)
{
    uint xlo = (uint)x;
    uint xhi = (uint)(x >> 32);
    uint zlo = __ockl_clz_u32(xlo) + 32u;
    uint zhi = __ockl_clz_u32(xhi);
    return (int)(xhi == 0 ?
zlo : zhi); } //-------- T __nv_ffs ATTR int __nv_ffs(int x) { return (32 - __nv_clz(x&(-x))); } //-------- T __nv_ffsll ATTR int __nv_ffsll(long x) { return (int)(64 - __nv_clzll(x&(-x))); } //-------- T __nv_popc ATTR int __nv_popc(int x) { return __llvm_ctpop_i32(x); } //-------- T __nv_popcll ATTR int __nv_popcll(long x) { return (int)__llvm_ctpop_i64(x); } ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/convert.cl000066400000000000000000000077341415221260100216200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define ATTR __attribute__((const)) #define CONVERTM(A,B,m,n) ATTR B __nv_##A##2##B##_##m(A x) \ { return convert_##B##_##n(x); } #define CONVERT(A,B) \ CONVERTM(A, B, rd, rtn) \ CONVERTM(A, B, rn, rte) \ CONVERTM(A, B, ru, rtp) \ CONVERTM(A, B, rz, rtz) //-------- T __nv_double2float_rd //-------- T __nv_double2float_rn //-------- T __nv_double2float_ru //-------- T __nv_double2float_rz CONVERT(double, float) //-------- T __nv_double2int_rd //-------- T __nv_double2int_rn //-------- T __nv_double2int_ru //-------- T __nv_double2int_rz CONVERT(double, int) //-------- T __nv_float2int_rd //-------- T __nv_float2int_rn //-------- T __nv_float2int_ru //-------- T __nv_float2int_rz CONVERT(float, int) //-------- T __nv_int2float_rd //-------- T __nv_int2float_rn //-------- T __nv_int2float_ru //-------- T __nv_int2float_rz CONVERT(int, float) //-------- T __nv_double2uint_rd //-------- T __nv_double2uint_rn //-------- T __nv_double2uint_ru //-------- T __nv_double2uint_rz CONVERT(double, uint) //-------- T __nv_float2uint_rd //-------- T __nv_float2uint_rn //-------- T __nv_float2uint_ru //-------- T __nv_float2uint_rz CONVERT(float, uint) //-------- T __nv_uint2double_rd //-------- T 
__nv_uint2double_rn //-------- T __nv_uint2double_ru //-------- T __nv_uint2double_rz CONVERT(uint, double) //-------- T __nv_uint2float_rd //-------- T __nv_uint2float_rn //-------- T __nv_uint2float_ru //-------- T __nv_uint2float_rz CONVERT(uint, float) #define CONVERT2LLM(A,B,m,n) ATTR long __nv_##A##2ll_##m(A x) \ { return convert_long_##n(x); } #define CONVERT2LL(A) \ CONVERT2LLM(A, long, rd, rtn) \ CONVERT2LLM(A, long, rn, rte) \ CONVERT2LLM(A, long, ru, rtp) \ CONVERT2LLM(A, long, rz, rtz) //-------- T __nv_double2ll_rd //-------- T __nv_double2ll_rn //-------- T __nv_double2ll_ru //-------- T __nv_double2ll_rz CONVERT2LL(double) //-------- T __nv_float2ll_rd //-------- T __nv_float2ll_rn //-------- T __nv_float2ll_ru //-------- T __nv_float2ll_rz CONVERT2LL(float) #define CONVERT2ULLM(A,B,m,n) ATTR ulong __nv_##A##2ull_##m(A x) \ { return convert_ulong_##n(x); } #define CONVERT2ULL(A) \ CONVERT2ULLM(A, ulong, rd, rtn) \ CONVERT2ULLM(A, ulong, rn, rte) \ CONVERT2ULLM(A, ulong, ru, rtp) \ CONVERT2ULLM(A, ulong, rz, rtz) //-------- T __nv_double2ull_rd //-------- T __nv_double2ull_rn //-------- T __nv_double2ull_ru //-------- T __nv_double2ull_rz CONVERT2ULL(double) //-------- T __nv_float2ull_rd //-------- T __nv_float2ull_rn //-------- T __nv_float2ull_ru //-------- T __nv_float2ull_rz CONVERT2ULL(float) #define CONVERT4LLM(A,B,m,n) ATTR B __nv_ll2##B##_##m(long x) \ { return convert_##B##_##n(x); } #define CONVERT4LL(B) \ CONVERT4LLM(long, B, rd, rtn) \ CONVERT4LLM(long, B, rn, rte) \ CONVERT4LLM(long, B, ru, rtp) \ CONVERT4LLM(long, B, rz, rtz) //-------- T __nv_ll2double_rd //-------- T __nv_ll2double_rn //-------- T __nv_ll2double_ru //-------- T __nv_ll2double_rz CONVERT4LL(double) //-------- T __nv_ll2float_rd //-------- T __nv_ll2float_rn //-------- T __nv_ll2float_ru //-------- T __nv_ll2float_rz CONVERT4LL(float) #define CONVERT4ULLM(A,B,m,n) ATTR B __nv_ull2##B##_##m(ulong x) \ { return convert_##B##_##n(x); } #define CONVERT4ULL(B) \ 
CONVERT4ULLM(ulong, B, rd, rtn) \ CONVERT4ULLM(ulong, B, rn, rte) \ CONVERT4ULLM(ulong, B, ru, rtp) \ CONVERT4ULLM(ulong, B, rz, rtz) //-------- T __nv_ull2double_rd //-------- T __nv_ull2double_rn //-------- T __nv_ull2double_ru //-------- T __nv_ull2double_rz CONVERT4ULL(double) //-------- T __nv_ull2float_rd //-------- T __nv_ull2float_rn //-------- T __nv_ull2float_ru //-------- T __nv_ull2float_rz CONVERT4ULL(float) ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/float.cl000066400000000000000000000017661415221260100212440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define ATTR __attribute__((const)) //-------- T __nv_finitef ATTR int __nv_finitef(float x) { return isfinite(x); } //-------- T __nv_isfinited ATTR int __nv_isfinited(double x) { return isfinite(x); } //-------- T __nv_isinfd ATTR int __nv_isinfd(double x) { return isinf(x); } //-------- T __nv_isinff ATTR int __nv_isinff(float x) { return isinf(x); } //-------- T __nv_isnand ATTR int __nv_isnand(double x) { return isnan(x); } //-------- T __nv_isnanf ATTR int __nv_isnanf(float x) { return isnan(x); } //-------- T __nv_nan ATTR double __nv_nan(char *tagp) { return __builtin_nan(tagp); } //-------- T __nv_nanf ATTR float __nv_nanf(char *tagp) { return __builtin_nan(tagp); } ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/generic.cl000066400000000000000000000026311415221260100215430ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define ATTR __attribute__((const)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) #define MIN(x, y) (((x) < (y)) ? (x) : (y)) //-------- T __nv_abs ATTR int __nv_abs(int x) { return abs(x); } //-------- T __nv_llabs ATTR long __nv_llabs(long x) { return abs(x); } //-------- T __nv_max ATTR int __nv_max(int a, int b) { return MAX(a,b); } //-------- T __nv_llmax ATTR long __nv_llmax(long a, long b) { return MAX(a,b); } //-------- T __nv_ullmax ATTR ulong __nv_ullmax(ulong a, ulong b) { return MAX(a,b); } //-------- T __nv_umax ATTR uint __nv_umax(uint a, uint b) { return MAX(a,b); } //-------- T __nv_min ATTR int __nv_min(int a, int b) { return MIN(a,b); } //-------- T __nv_llmin ATTR long __nv_llmin(long a, long b) { return MIN(a,b); } //-------- T __nv_ullmin ATTR ulong __nv_ullmin(ulong a, ulong b) { return MIN(a,b); } //-------- T __nv_umin ATTR uint __nv_umin(uint a, uint b) { return MIN(a,b); } //-------- T __nv_sad ATTR uint __nv_sad(int x, int y, uint z) { return (z+abs(x-y)); } //-------- T __nv_usad ATTR uint __nv_usad(uint x, uint y, uint z) { return (z+abs(x-y)); } ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/half.cl000066400000000000000000000011041415221260100210330ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define ATTR __attribute__((const)) //-------- T __nv_float2half_rn half __nv_float2half_rn(float x) { return (half)x; } //-------- T __nv_half2float float __nv_half2float(half x) { return (float)x; } ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/integer.cl000066400000000000000000000017311415221260100215640ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((always_inline, const)) //-------- T __nv_mul24 ATTR int __nv_mul24(int x, int y) { return __ockl_mul24_i32(x, y); } //-------- T __nv_umul24 ATTR uint __nv_umul24(uint x, uint y) { return __ockl_mul24_u32(x, y); } //-------- T __nv_mul64hi ATTR long __nv_mul64hi(long x, long y) { return __ockl_mul_hi_i64(x,y); } //-------- T __nv_mulhi ATTR int __nv_mulhi(int x, int y) { return __ockl_mul_hi_i32(x,y); } //-------- T __nv_umul64hi ATTR ulong __nv_umul64hi(ulong x, ulong y) { return __ockl_mul_hi_u64(x,y); } //-------- T __nv_umulhi ATTR uint __nv_umulhi(uint x, uint y) { return __ockl_mul_hi_u32(x,y); } ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/math.cl000066400000000000000000000200421415221260100210540ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ocml.h" #define ATTR __attribute__((always_inline)) #define FUNC1D(root) \ ATTR double __nv_##root(double x) { return __ocml_##root##_f64(x); } #define FUNC1F(root) \ ATTR float __nv_##root##f(float x) { return __ocml_##root##_f32(x); } #define FUNC1(root) FUNC1D(root) FUNC1F(root) #define FUNC2D(root) \ ATTR double __nv_##root(double x, double y) { return __ocml_##root##_f64(x, y); } #define FUNC2F(root) \ ATTR float __nv_##root##f(float x, float y) { return __ocml_##root##_f32(x, y); } #define FUNC2(root) FUNC2D(root) FUNC2F(root) #define FUNC3D(root) \ ATTR double __nv_##root(double x, double y, double z) { return __ocml_##root##_f64(x, y, z); } #define FUNC3F(root) \ ATTR float __nv_##root##f(float x, float y, float z) { return __ocml_##root##_f32(x, y, z); } #define FUNC3(root) FUNC3D(root) FUNC3F(root) //-------- T __nv_acos //-------- T __nv_acosf FUNC1(acos) //-------- T __nv_acosh //-------- T __nv_acoshf FUNC1(acosh) //-------- T __nv_asin //-------- T __nv_asinf FUNC1(asin) //-------- T __nv_asinh //-------- T __nv_asinhf FUNC1(asinh) //-------- T __nv_atan //-------- T __nv_atanf FUNC1(atan) //-------- T __nv_atan2 //-------- T __nv_atan2f FUNC2(atan2) //-------- T __nv_atanh //-------- T __nv_atanhf FUNC1(atanh) //-------- T __nv_cbrt //-------- T __nv_cbrtf FUNC1(cbrt) //-------- T __nv_ceil //-------- T __nv_ceilf FUNC1(ceil) //-------- T __nv_copysign //-------- T __nv_copysignf FUNC2(copysign) //-------- T __nv_cos //-------- T __nv_cosf FUNC1(cos) //-------- T __nv_cosh //-------- T __nv_coshf FUNC1(cosh) //-------- T __nv_cospi //-------- T __nv_cospif FUNC1(cospi) //-------- T __nv_erf //-------- T __nv_erff FUNC1(erf) //-------- T __nv_erfc //-------- T __nv_erfcf FUNC1(erfc) //-------- T __nv_erfcinv //-------- T __nv_erfcinvf FUNC1(erfcinv) //-------- T __nv_erfcx //-------- T __nv_erfcxf FUNC1(erfcx) //-------- T __nv_erfinv //-------- T __nv_erfinvf 
FUNC1(erfinv)

//-------- T __nv_exp
//-------- T __nv_expf
FUNC1(exp)

//-------- T __nv_exp10
//-------- T __nv_exp10f
FUNC1(exp10)

//-------- T __nv_exp2
//-------- T __nv_exp2f
FUNC1(exp2)

//-------- T __nv_expm1
//-------- T __nv_expm1f
FUNC1(expm1)

//-------- T __nv_fabs
//-------- T __nv_fabsf
FUNC1(fabs)

//-------- T __nv_fdim
//-------- T __nv_fdimf
FUNC2(fdim)

//-------- T __nv_floor
//-------- T __nv_floorf
FUNC1(floor)

//-------- T __nv_fma
//-------- T __nv_fmaf
FUNC3(fma)

//-------- T __nv_fmax
//-------- T __nv_fmaxf
FUNC2(fmax)

//-------- T __nv_fmin
//-------- T __nv_fminf
FUNC2(fmin)

//-------- T __nv_fmod
//-------- T __nv_fmodf
FUNC2(fmod)

//-------- T __nv_hypot
//-------- T __nv_hypotf
FUNC2(hypot)

//-------- T __nv_j0
//-------- T __nv_j0f
FUNC1(j0)

//-------- T __nv_j1
//-------- T __nv_j1f
FUNC1(j1)

//-------- T __nv_lgamma
//-------- T __nv_lgammaf
FUNC1(lgamma)

//-------- T __nv_log
//-------- T __nv_logf
FUNC1(log)

//-------- T __nv_log10
//-------- T __nv_log10f
FUNC1(log10)

//-------- T __nv_log1p
//-------- T __nv_log1pf
FUNC1(log1p)

//-------- T __nv_log2
//-------- T __nv_log2f
FUNC1(log2)

//-------- T __nv_logb
//-------- T __nv_logbf
FUNC1(logb)

//-------- T __nv_pow
//-------- T __nv_powf
FUNC2(pow)

//-------- T __nv_rcbrt
//-------- T __nv_rcbrtf
FUNC1(rcbrt)

//-------- T __nv_remainder
//-------- T __nv_remainderf
FUNC2(remainder)

//-------- T __nv_rhypot
//-------- T __nv_rhypotf
FUNC2(rhypot)

//-------- T __nv_nearbyint
//-------- T __nv_nearbyintf
FUNC1(nearbyint)

//-------- T __nv_nextafter
//-------- T __nv_nextafterf
FUNC2(nextafter)

//-------- T __nv_rint
//-------- T __nv_rintf
FUNC1(rint)

//-------- T __nv_round
//-------- T __nv_roundf
FUNC1(round)

//-------- T __nv_rsqrt
//-------- T __nv_rsqrtf
FUNC1(rsqrt)

// scalbn takes an integer exponent, so it cannot be produced by FUNC2, which
// would give both parameters the floating-point type.
// NOTE(review): assumes __ocml_scalbn_f64/_f32 are declared in ocml.h with an
// int second parameter -- confirm against that header.
//-------- T __nv_scalbn
ATTR double __nv_scalbn(double x, int n) { return __ocml_scalbn_f64(x, n); }

//-------- T __nv_scalbnf
ATTR float __nv_scalbnf(float x, int n) { return __ocml_scalbn_f32(x, n); }

//-------- T __nv_sin
//-------- T __nv_sinf
FUNC1(sin)

//-------- T __nv_sinh
//-------- T __nv_sinhf
FUNC1(sinh)

//-------- T __nv_sinpi
//--------
T __nv_sinpif FUNC1(sinpi) //-------- T __nv_sqrt //-------- T __nv_sqrtf FUNC1(sqrt) //-------- T __nv_tan //-------- T __nv_tanf FUNC1(tan) //-------- T __nv_tanh //-------- T __nv_tanhf FUNC1(tanh) //-------- T __nv_tgamma //-------- T __nv_tgammaf FUNC1(tgamma) //-------- T __nv_trunc //-------- T __nv_truncf FUNC1(trunc) //-------- T __nv_y0 //-------- T __nv_y0f FUNC1(y0) //-------- T __nv_y1 //-------- T __nv_y1f FUNC1(y1) //-------- T __nv_cyl_bessel_i0 ATTR double __nv_cyl_bessel_i0(double x) { return __ocml_i0_f64(x); } //-------- T __nv_cyl_bessel_i0f ATTR float __nv_cyl_bessel_i0f(float x) { return __ocml_i0_f32(x); } //-------- T __nv_cyl_bessel_i1 ATTR double __nv_cyl_bessel_i1(double x) { return __ocml_i1_f64(x); } //-------- T __nv_cyl_bessel_i1f ATTR float __nv_cyl_bessel_i1f(float x) { return __ocml_i1_f32(x); } //-------- T __nv_frexp ATTR double __nv_frexp(double x, __private int *ptr) { return __ocml_frexp_f64(x, ptr); } //-------- T __nv_frexpf ATTR float __nv_frexpf(float x, __private int *ptr) { return __ocml_frexp_f32(x, ptr); } //-------- T __nv_ilogb ATTR int __nv_ilogb(double x) { return __ocml_ilogb_f64(x); } //-------- T __nv_ilogbf ATTR int __nv_ilogbf(float x) { return __ocml_ilogb_f32(x); } //-------- T __nv_ldexp ATTR double __nv_ldexp(double x, int i) { return __ocml_ldexp_f64(x, i); } //-------- T __nv_ldexpf ATTR float __nv_ldexpf(float x, int i) { return __ocml_ldexp_f32(x, i); } //-------- T __nv_modf ATTR double __nv_modf(double x, __private double *ptr) { return __ocml_modf_f64(x, ptr); } //-------- T __nv_modff ATTR float __nv_modff(float x, __private float *ptr) { return __ocml_modf_f32(x, ptr); } //-------- T __nv_norm3d ATTR double __nv_norm3d(double x, double y, double z) { return __ocml_len3_f64(x,y,z); } //-------- T __nv_norm3df ATTR float __nv_norm3df(float x, float y, float z) { return __ocml_len3_f32(x,y,z); } //-------- T __nv_norm4d ATTR double __nv_norm4d(double a, double b, double c, double d) { return 
__ocml_len4_f64(a,b,c,d); }

//-------- T __nv_norm4df
ATTR float __nv_norm4df(float a, float b, float c, float d) { return __ocml_len4_f32(a,b,c,d); }

//-------- T __nv_normcdf
ATTR double __nv_normcdf(double x) { return __ocml_ncdf_f64(x); }

//-------- T __nv_normcdff
ATTR float __nv_normcdff(float x) { return __ocml_ncdf_f32(x); }

//-------- T __nv_normcdfinv
ATTR double __nv_normcdfinv(double x) { return __ocml_ncdfinv_f64(x); }

//-------- T __nv_normcdfinvf
ATTR float __nv_normcdfinvf(float x) { return __ocml_ncdfinv_f32(x); }

//-------- T __nv_powi
ATTR double __nv_powi(double x, int n) { return __ocml_pown_f64(x, n); }

//-------- T __nv_powif
ATTR float __nv_powif(float x, int n) { return __ocml_pown_f32(x, n); }

//-------- T __nv_remquo
ATTR double __nv_remquo(double x, double y, __private int *ptr) { return __ocml_remquo_f64(x, y, ptr); }

//-------- T __nv_remquof
ATTR float __nv_remquof(float x, float y, __private int *ptr) { return __ocml_remquo_f32(x, y, ptr); }

// Clamp x to the interval [0, 1].
//-------- T __nv_saturatef
ATTR float __nv_saturatef(float x) { return __ocml_min_f32(__ocml_max_f32(x, 0.0f), 1.0f); }

//-------- T __nv_signbitd
ATTR int __nv_signbitd(double x) { return __ocml_signbit_f64(x); }

//-------- T __nv_signbitf
ATTR int __nv_signbitf(float x) { return __ocml_signbit_f32(x); }

// The sincos family: the __ocml_ routine returns one result, which is stored
// through sptr, and writes the other result through cptr.

//-------- T __nv_sincos
ATTR void __nv_sincos(double x, __private double * sptr, __private double *cptr) { (*sptr)=__ocml_sincos_f64(x, cptr); }

//-------- T __nv_sincosf
ATTR void __nv_sincosf(float x, __private float * sptr, __private float *cptr) { (*sptr)=__ocml_sincos_f32(x, cptr); }

//-------- T __nv_sincospi
ATTR void __nv_sincospi(double x, __private double * sptr, __private double *cptr) { (*sptr)=__ocml_sincospi_f64(x, cptr); }

//-------- T __nv_sincospif
ATTR void __nv_sincospif(float x, __private float * sptr, __private float *cptr) { (*sptr)=__ocml_sincospi_f32(x, cptr); }

// Compatibility entry point: the function above used to be defined under
// this misspelled name. Kept so existing references to the old symbol still
// resolve.
ATTR void __nv_sincosfpif(float x, __private float * sptr, __private float *cptr) { __nv_sincospif(x, sptr, cptr); }
ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/precision.cl000066400000000000000000000026271415221260100221270ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "ocml.h" #define ATTR #define FUNC1F(root) \ ATTR float __nv_fast_##root##f(float x) { return __ocml_##root##_f32(x); } #define FUNC1(root) FUNC1F(root) #define FUNC2F(root) \ ATTR float __nv_fast_##root##f(float x, float y) { return __ocml_##root##_f32(x, y); } #define FUNC2(root) FUNC2F(root) #define FUNC3F(root) \ ATTR float __nv_fast_##root##f(float x, float y, float z) { return __ocml_##root##_f32(x, y, z); } #define FUNC3(root) FUNC3F(root) //-------- T __nv_fast_cosf FUNC1(cos) //-------- T __nv_fast_exp10f FUNC1(exp10) //-------- T __nv_fast_expf FUNC1(exp) //-------- T __nv_fast_log10f FUNC1(log10) //-------- T __nv_fast_log2f FUNC1(log2) //-------- T __nv_fast_logf FUNC1(log) //-------- T __nv_fast_powf FUNC2(pow) //-------- T __nv_fast_sinf FUNC1(sin) //-------- T __nv_fast_tanf FUNC1(tan) //-------- T __nv_fast_fdividef ATTR float __nv_fast_fdividef(float x, float y) { return native_divide(x, y); } //-------- T __nv_fast_sincosf ATTR void __nv_fast_sincosf(float x, __private float * sptr, __private float *cptr) { (*sptr)=__ocml_sincos_f32(x, cptr); } ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/reinterpret.cl000066400000000000000000000024301415221260100224670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define ATTR __attribute__((always_inline, const)) //-------- T __nv_double_as_longlong ATTR long __nv_double_as_longlong(double x) { return as_long(x); } //-------- T __nv_float_as_int ATTR int __nv_float_as_int(float x) { return as_int(x); } //-------- T __nv_float_as_uint ATTR unsigned int __nv_float_as_uint(float x) { return as_uint(x); } //-------- T __nv_int_as_float ATTR float __nv_int_as_float(int x) { return as_float(x); } //-------- T __nv_longlong_as_double ATTR double __nv_longlong_as_double(long x) { return as_double(x); } //-------- T __nv_uint_as_float ATTR float __nv_uint_as_float(unsigned int x) { return as_float(x); } //-------- T __nv_double2hiint int __nv_double2hiint(double x) { return (int) as_long(x) >> 32; } //-------- T __nv_double2loint int __nv_double2loint(double x) { return (int) as_long(x); } //-------- T __nv_hiloint2double double __nv_hiloint2double(int x, int y) { return as_double((long)x << 32 | y); } ROCm-Device-Libs-rocm-5.0.0/cuda2gcn/src/rounding.cl000066400000000000000000000013751415221260100217600ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/

#include "ocml.h"

// Each conversion below is marked const.
#define ATTR __attribute__((const))

//-------- T __nv_llrint
// Apply __ocml_rint_f64 to x, then convert the result to long.
ATTR long
__nv_llrint(double x)
{
    return (long)__ocml_rint_f64(x);
}

//-------- T __nv_llrintf
// Apply __ocml_rint_f32 to x, then convert the result to long.
ATTR long
__nv_llrintf(float x)
{
    return (long)__ocml_rint_f32(x);
}

//-------- T __nv_llround
// Apply __ocml_round_f64 to x, then convert the result to long.
ATTR long
__nv_llround(double x)
{
    return (long)__ocml_round_f64(x);
}

//-------- T __nv_llroundf
// Apply __ocml_round_f32 to x, then convert the result to long.
ATTR long
__nv_llroundf(float x)
{
    return (long)__ocml_round_f32(x);
}

ROCm-Device-Libs-rocm-5.0.0/doc/000077500000000000000000000000001415221260100160755ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/doc/OCKL.md000066400000000000000000000552701415221260100171600ustar00rootroot00000000000000# OCKL User Guide * [Introduction](#introduction) * [What Is OCKL](#what-is-ockl) * [Using OCKL](#using-ocml) * [Standard Usage](#standard-usage) * [Controls](#controls) * [Versioning](#versioning) * [Naming convention](#naming-convention) * [Supported functions](#supported-functions) ## Introduction ### What Is OCKL OCKL is an LLVM-IR bitcode library designed to provide access to certain hardware and compiler capabilities needed by language runtimes. It should rarely be necessary to call any of these functions directly from application code. Consider this library a "detail" layer. ## Using OCKL ### Standard Usage OCKL is expected to be used in a standard LLVM compilation flow as follows: * Compile source modules to LLVM-IR bitcode (clang) * Link together program bitcode with library bitcode including OCKL and OCLC. * Run generic optimizations (opt) * Code generation (llc) ### Controls OCKL supports a number of controls that are provided by linking in specifically named inline functions. These functions are inlined at optimization time and result in specific paths taken with no control flow overhead.
These functions all have the form (in C) __attribute__((always_inline, const)) int __oclc_control(void) { return 1; } // or 0 to disable The currently supported controls are * `finite_only_opt` - floating point Inf and NaN are never expected to be consumed or produced * `unsafe_math_opt` - lower accuracy results may be produced with higher performance * `daz_opt` - subnormal values consumed and produced may be flushed to zero * `correctly_rounded_sqrt32` - float square root must be correctly rounded * `ISA_version` - an integer representation of the ISA version of the target device ### Versioning OCKL usually ships as a single LLVM-IR bitcode file named ockl-{LLVM rev}-{OCKL rev}.bc where `{LLVM rev}` is the version of LLVM used to create the file, of the form X.Y, e.g. 3.8, and `{OCKL rev}` is the OCKL library version of the form X.Y, currently 0.9. ### Naming convention OCKL functions follow a simple naming convention: __ockl_{function}_{type suffix} where {type suffix} generally indicates the type of the arguments and/or returned result using a type letter, e.g. "u" for unsigned integer, and a bit width, e.g. 32.
### Supported functions The following table lists the available functions along with a brief description of each: | **function** | **Brief Description** | | :--- | :--- | | `uchar __ockl_clz_u8(uchar);` | Count leading zeroes | | `ushort __ockl_clz_u16(ushort);` | | | `uint __ockl_clz_u32(uint);` | | | `ulong __ockl_clz_u64(ulong);` | | | - | | | `uchar __ockl_ctz_u8(uchar);` | Count trailing zeroes | | `ushort __ockl_ctz_u16(ushort);` | | | `uint __ockl_ctz_u32(uint);` | | | `ulong __ockl_ctz_u64(ulong);` | | | - | | | `uint __ockl_popcount_u32(uint);` | Count nonzero bits | | `ulong __ockl_popcount_u64(ulong);` | | | - | | | `int __ockl_add_sat_i32(int,int);` | Add with saturation | | `uint __ockl_add_sat_u32(uint,uint);` | | | `long __ockl_add_sat_i64(long,long);` | | | `ulong __ockl_add_sat_u64(ulong,ulong);` | | | - | | | `int __ockl_sub_sat_i32(int,int);` | Subtract with saturation | | `uint __ockl_sub_sat_u32(uint,uint);` | | | `long __ockl_sub_sat_i64(long,long);` | | | `ulong __ockl_sub_sat_u64(ulong,ulong);` | | | - | | | `int __ockl_mul_hi_i32(int,int);` | High part of multiplication | | `uint __ockl_mul_hi_u32(uint,uint);` | | | `long __ockl_mul_hi_i64(long,long);` | | | `ulong __ockl_mul_hi_u64(ulong,ulong);` | | | - | | | `int __ockl_mul24_i32(int,int);` | Multiply assuming operands fit in 24 bits | | `uint __ockl_mul24_u32(uint,uint);` | | | - | | | `ulong __ockl_memtime_u64(void);` | Current value of free running 64-bit clock counter | | `ulong __ockl_memrealtime_u64(void);` | Current value of constant speed 64-bit clock counter | | - | | | `uint __ockl_activelane_u32(void);` | Index of current lane, counting only active lanes in wavefront | | - | | | `half __ockl_wfred_add_f16(half x);` | ADD reduction across wavefront | | `float __ockl_wfred_add_f32(float x);` | | | `double __ockl_wfred_add_f64(double x);` | | | `int __ockl_wfred_add_i32(int x);` | | | `long __ockl_wfred_add_i64(long x);` | | | `uint __ockl_wfred_add_u32(uint x);` | | | `ulong
__ockl_wfred_add_u64(ulong x);` | | | `int __ockl_wfred_and_i32(int x);` | AND reduction across wavefront | | `long __ockl_wfred_and_i64(long x);` | | | `uint __ockl_wfred_and_u32(uint x);` | | | `ulong __ockl_wfred_and_u64(ulong x);` | | | `half __ockl_wfred_max_f16(half x);` | MAX reduction across wavefront | | `float __ockl_wfred_max_f32(float x);` | | | `double __ockl_wfred_max_f64(double x);` | | | `int __ockl_wfred_max_i32(int x);` | | | `long __ockl_wfred_max_i64(long x);` | | | `uint __ockl_wfred_max_u32(uint x);` | | | `ulong __ockl_wfred_max_u64(ulong x);` | | | `half __ockl_wfred_min_f16(half x);` | MIN reduction across wavefront | | `float __ockl_wfred_min_f32(float x);` | | | `double __ockl_wfred_min_f64(double x);` | | | `int __ockl_wfred_min_i32(int x);` | | | `long __ockl_wfred_min_i64(long x);` | | | `uint __ockl_wfred_min_u32(uint x);` | | | `ulong __ockl_wfred_min_u64(ulong x);` | | | `int __ockl_wfred_or_i32(int x);` | OR reduction across wavefront | | `long __ockl_wfred_or_i64(long x);` | | | `uint __ockl_wfred_or_u32(uint x);` | | | `ulong __ockl_wfred_or_u64(ulong x);` | | | `int __ockl_wfred_xor_i32(int x);` | XOR reduction across wavefront | | `long __ockl_wfred_xor_i64(long x);` | | | `uint __ockl_wfred_xor_u32(uint x);` | | | `ulong __ockl_wfred_xor_u64(ulong x);` | | | `half __ockl_wfscan_add_f16(half x, bool inclusive);` | ADD scan across wavefront | | `float __ockl_wfscan_add_f32(float x, bool inclusive);` | | | `double __ockl_wfscan_add_f64(double x, bool inclusive);` | | | `int __ockl_wfscan_add_i32(int x, bool inclusive);` | | | `long __ockl_wfscan_add_i64(long x, bool inclusive);` | | | `uint __ockl_wfscan_add_u32(uint x, bool inclusive);` | | | `ulong __ockl_wfscan_add_u64(ulong x, bool inclusive);` | | | `int __ockl_wfscan_and_i32(int x, bool inclusive);` | AND scan across wavefront | | `long __ockl_wfscan_and_i64(long x, bool inclusive);` | | | `uint __ockl_wfscan_and_u32(uint x, bool inclusive);` | | | `ulong
__ockl_wfscan_and_u64(ulong x, bool inclusive);` | | | `half __ockl_wfscan_max_f16(half x, bool inclusive);` | MAX scan across wavefront | | `float __ockl_wfscan_max_f32(float x, bool inclusive);` | | | `double __ockl_wfscan_max_f64(double x, bool inclusive);` | | | `int __ockl_wfscan_max_i32(int x, bool inclusive);` | | | `long __ockl_wfscan_max_i64(long x, bool inclusive);` | | | `uint __ockl_wfscan_max_u32(uint x, bool inclusive);` | | | `ulong __ockl_wfscan_max_u64(ulong x, bool inclusive);` | | | `half __ockl_wfscan_min_f16(half x, bool inclusive);` | MIN scan across wavefront | | `float __ockl_wfscan_min_f32(float x, bool inclusive);` | | | `double __ockl_wfscan_min_f64(double x, bool inclusive);` | | | `int __ockl_wfscan_min_i32(int x, bool inclusive);` | | | `long __ockl_wfscan_min_i64(long x, bool inclusive);` | | | `uint __ockl_wfscan_min_u32(uint x, bool inclusive);` | | | `ulong __ockl_wfscan_min_u64(ulong x, bool inclusive);` | | | `int __ockl_wfscan_or_i32(int x, bool inclusive);` | OR scan across wavefront | | `long __ockl_wfscan_or_i64(long x, bool inclusive);` | | | `uint __ockl_wfscan_or_u32(uint x, bool inclusive);` | | | `ulong __ockl_wfscan_or_u64(ulong x, bool inclusive);` | | | `int __ockl_wfscan_xor_i32(int x, bool inclusive);` | XOR scan across wavefront | | `long __ockl_wfscan_xor_i64(long x, bool inclusive);` | | | `uint __ockl_wfscan_xor_u32(uint x, bool inclusive);` | | | `ulong __ockl_wfscan_xor_u64(ulong x, bool inclusive);` | | | `uint __ockl_wfbcast_u32(uint x, uint i);` | Broadcast to wavefront | | `ulong __ockl_wfbcast_u64(ulong x, uint i);` | | | - | | | `bool __ockl_wfany_i32(int e);` | Detect any nonzero across wavefront | | `bool __ockl_wfall_i32(int e);` | Detect all nonzero across wavefront | | `bool __ockl_wfsame_i32(int e);` | Detect same across wavefront | | - | | | `uint __ockl_bfm_u32(uint,uint);` | Bit field mask | | `int __ockl_bfe_i32(int, uint, uint);` | Bit field extract | | `uint __ockl_bfe_u32(uint,uint,uint);` |
| | `uint __ockl_bitalign_u32(uint,uint,uint);` | Align on bit boundary | | `uint __ockl_bytealign_u32(uint,uint,uint);` | Align on byte boundary | | `uint __ockl_lerp_u32(uint,uint,uint);` | Add each byte with prescribed carry | | `float __ockl_max3_f32(float,float,float);` | Max of 3 | | `half __ockl_max3_f16(half,half,half);` | | | `int __ockl_max3_i32(int,int,int);` | | | `uint __ockl_max3_u32(uint,uint,uint);` | | | `float __ockl_median3_f32(float,float,float);` | Median of 3 | | `half __ockl_median3_f16(half,half,half);` | | | `int __ockl_median3_i32(int,int,int);` | | | `uint __ockl_median3_u32(uint,uint,uint);` | | | `float __ockl_min3_f32(float,float,float);` | Min of 3 | | `half __ockl_min3_f16(half,half,half);` | | | `int __ockl_min3_i32(int,int,int);` | | | `uint __ockl_min3_u32(uint,uint,uint);` | | | `ulong __ockl_mqsad_u64(ulong, uint, ulong);` | Masked rolling SAD | | `uint __ockl_pack_u32(float4);` | Pack vector to bytes | | `ulong __ockl_qsad_u64(ulong, uint, ulong);` | Rolling SAD | | `uint __ockl_msad_u32(uint,uint,uint);` | Masked SAD | | `uint __ockl_sad_u32(uint,uint,uint);` | SAD | | `uint __ockl_sadd_u32(uint,uint,uint);` | 32-bit SAD | | `uint __ockl_sadhi_u32(uint,uint,uint);` | SAD accumulating to high half | | `uint __ockl_sadw_u32(uint,uint,uint);` | 16-bit SAD | | `float __ockl_unpack0_f32(uint);` | Extract byte and convert to float | | `float __ockl_unpack1_f32(uint);` | | | `float __ockl_unpack2_f32(uint);` | | | `float __ockl_unpack3_f32(uint);` | | | - | | | `float4 __ockl_image_load_1D(TSHARP i, int c);` | Load from 1D image | | `float4 __ockl_image_load_1Da(TSHARP i, int2 c);` | Load from 1D image array | | `float4 __ockl_image_load_1Db(TSHARP i, int c);` | Load from 1D buffered image | | `float4 __ockl_image_load_2D(TSHARP i, int2 c);` | Load from 2D image | | `float4 __ockl_image_load_2Da(TSHARP i, int4 c);` | Load from 2D image array | | `float __ockl_image_load_2Dad(TSHARP i, int4 c);` | Load from 2D depth image array | |
`float __ockl_image_load_2Dd(TSHARP i, int2 c);` | Load from 2D depth image | | `float4 __ockl_image_load_3D(TSHARP i, int4 c);` | Load from 3D image | | `float4 __ockl_image_load_CM(TSHARP i, int2 c, int f);` | Load from cubemap | | `float4 __ockl_image_load_CMa(TSHARP i, int4 c, int f);` | Load from cubemap array | | - | | | `float4 __ockl_image_load_mip_1D(TSHARP i, int c, int l);` | Load from mipmapped image | | `float4 __ockl_image_load_mip_1Da(TSHARP i, int2 c, int l);` | | | `float4 __ockl_image_load_mip_2D(TSHARP i, int2 c, int l);` | | | `float4 __ockl_image_load_mip_2Da(TSHARP i, int4 c, int l);` | | | `float __ockl_image_load_mip_2Dad(TSHARP i, int4 c, int l);` | | | `float __ockl_image_load_mip_2Dd(TSHARP i, int2 c, int l);` | | | `float4 __ockl_image_load_mip_3D(TSHARP i, int4 c, int l);` | | | `float4 __ockl_image_load_mip_CM(TSHARP i, int2 c, int f, int l);` | | | `float4 __ockl_image_load_mip_CMa(TSHARP i, int4 c, int f, int l);` | | | - | | | `half4 __ockl_image_loadh_1D(TSHARP i, int c);` | Load from image returning half precision | | `half4 __ockl_image_loadh_1Da(TSHARP i, int2 c);` | | | `half4 __ockl_image_loadh_1Db(TSHARP i, int c);` | | | `half4 __ockl_image_loadh_2D(TSHARP i, int2 c);` | | | `half4 __ockl_image_loadh_2Da(TSHARP i, int4 c);` | | | `half4 __ockl_image_loadh_3D(TSHARP i, int4 c);` | | | `half4 __ockl_image_loadh_CM(TSHARP i, int2 c, int f);` | | | `half4 __ockl_image_loadh_CMa(TSHARP i, int4 c, int f);` | | | `half4 __ockl_image_loadh_mip_1D(TSHARP i, int c, int l);` | | | `half4 __ockl_image_loadh_mip_1Da(TSHARP i, int2 c, int l);` | | | `half4 __ockl_image_loadh_mip_2D(TSHARP i, int2 c, int l);` | | | `half4 __ockl_image_loadh_mip_2Da(TSHARP i, int4 c, int l);` | | | `half4 __ockl_image_loadh_mip_3D(TSHARP i, int4 c, int l);` | | | `half4 __ockl_image_loadh_mip_CM(TSHARP i, int2 c, int f, int l);` | | | `half4 __ockl_image_loadh_mip_CMa(TSHARP i, int4 c, int f, int l);` | | | - | | | `void __ockl_image_store_1D(TSHARP i, int 
c, float4 p);` | Store to image | | `void __ockl_image_store_1Da(TSHARP i, int2 c, float4 p);` | | | `void __ockl_image_store_1Db(TSHARP i, int c, float4 p);` | | | `void __ockl_image_store_2D(TSHARP i, int2 c, float4 p);` | | | `void __ockl_image_store_2Da(TSHARP i, int4 c, float4 p);` | | | `void __ockl_image_store_2Dad(TSHARP i, int4 c, float p);` | | | `void __ockl_image_store_2Dd(TSHARP i, int2 c, float p);` | | | `void __ockl_image_store_3D(TSHARP i, int4 c, float4 p);` | | | `void __ockl_image_store_CM(TSHARP i, int2 c, int f, float4 p);` | | | `void __ockl_image_store_CMa(TSHARP i, int4 c, int f, float4 p);` | | | `void __ockl_image_store_lod_1D(TSHARP i, int c, int l, float4 p);` | Store to level of mipmapped image | | - | | | `void __ockl_image_store_lod_1Da(TSHARP i, int2 c, int l, float4 p);` | | | `void __ockl_image_store_lod_2D(TSHARP i, int2 c, int l, float4 p);` | | | `void __ockl_image_store_lod_2Da(TSHARP i, int4 c, int l, float4 p);` | | | `void __ockl_image_store_lod_2Dad(TSHARP i, int4 c, int l, float p);` | | | `void __ockl_image_store_lod_2Dd(TSHARP i, int2 c, int l, float p);` | | | `void __ockl_image_store_lod_3D(TSHARP i, int4 c, int l, float4 p);` | | | `void __ockl_image_store_lod_CM(TSHARP i, int2 c, int f, int l, float4 p);` | | | `void __ockl_image_store_lod_CMa(TSHARP i, int4 c, int f, int l, float4 p);` | | | - | | | `void __ockl_image_storeh_1D(TSHARP i, int c, half4 p);` | Store half precision pixel to image| | `void __ockl_image_storeh_1Da(TSHARP i, int2 c, half4 p);` | | | `void __ockl_image_storeh_1Db(TSHARP i, int c, half4 p);` | | | `void __ockl_image_storeh_2D(TSHARP i, int2 c, half4 p);` | | | `void __ockl_image_storeh_2Da(TSHARP i, int4 c, half4 p);` | | | `void __ockl_image_storeh_3D(TSHARP i, int4 c, half4 p);` | | | `void __ockl_image_storeh_CM(TSHARP i, int2 c, int f, half4 p);` | | | `void __ockl_image_storeh_CMa(TSHARP i, int4 c, int f, half4 p);` | | | - | | | `void __ockl_image_storeh_lod_1D(TSHARP i, int c, int l, 
half4 p);` | Store half precision pixel to level of mipmapped image | | `void __ockl_image_storeh_lod_1Da(TSHARP i, int2 c, int l, half4 p);` | | | `void __ockl_image_storeh_lod_2D(TSHARP i, int2 c, int l, half4 p);` | | | `void __ockl_image_storeh_lod_2Da(TSHARP i, int4 c, int l, half4 p);` | | | `void __ockl_image_storeh_lod_3D(TSHARP i, int4 c, int l, half4 p);` | | | `void __ockl_image_storeh_lod_CM(TSHARP i, int2 c, int f, int l, half4 p);` | | | `void __ockl_image_storeh_lod_CMa(TSHARP i, int4 c, int f, int l, half4 p);` | | | - | | | `float4 __ockl_image_sample_1D(TSHARP i, SSHARP s, float c);` | Sample image | | `float4 __ockl_image_sample_1Da(TSHARP i, SSHARP s, float2 c);` | | | `float4 __ockl_image_sample_2D(TSHARP i, SSHARP s, float2 c);` | | | `float4 __ockl_image_sample_2Da(TSHARP i, SSHARP s, float4 c);` | | | `float __ockl_image_sample_2Dad(TSHARP i, SSHARP s, float4 c);` | | | `float __ockl_image_sample_2Dd(TSHARP i, SSHARP s, float2 c);` | | | `float4 __ockl_image_sample_3D(TSHARP i, SSHARP s, float4 c);` | | | `float4 __ockl_image_sample_CM(TSHARP i, SSHARP s, float4 c);` | | | `float4 __ockl_image_sample_CMa(TSHARP i, SSHARP s, float4 c);` | | | - | | | `float4 __ockl_image_sample_grad_1D(TSHARP i, SSHARP s, float c, float dx, float dy);` | Sample mipmapped image using gradient | | `float4 __ockl_image_sample_grad_1Da(TSHARP i, SSHARP s, float2 c, float dx, float dy);` | | | `float4 __ockl_image_sample_grad_2D(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | | | `float4 __ockl_image_sample_grad_2Da(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | | | `float __ockl_image_sample_grad_2Dad(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | | | `float __ockl_image_sample_grad_2Dd(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | | | `float4 __ockl_image_sample_grad_3D(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy);` | | | - | | | `float4 __ockl_image_sample_lod_1D(TSHARP i, SSHARP s, float c, float l);` | Sample 
mipmapped image using LOD | | `float4 __ockl_image_sample_lod_1Da(TSHARP i, SSHARP s, float2 c, float l);` | | | `float4 __ockl_image_sample_lod_2D(TSHARP i, SSHARP s, float2 c, float l);` | | | `float4 __ockl_image_sample_lod_2Da(TSHARP i, SSHARP s, float4 c, float l);` | | | `float __ockl_image_sample_lod_2Dad(TSHARP i, SSHARP s, float4 c, float l);` | | | `float __ockl_image_sample_lod_2Dd(TSHARP i, SSHARP s, float2 c, float l);` | | | `float4 __ockl_image_sample_lod_3D(TSHARP i, SSHARP s, float4 c, float l);` | | | `float4 __ockl_image_sample_lod_CM(TSHARP i, SSHARP s, float4 c, float l);` | | | `float4 __ockl_image_sample_lod_CMa(TSHARP i, SSHARP s, float4 c, float l);` | | | - | | | `half4 __ockl_image_sampleh_1D(TSHARP i, SSHARP s, float c);` | Sample image returning half precision | | `half4 __ockl_image_sampleh_1Da(TSHARP i, SSHARP s, float2 c);` | | | `half4 __ockl_image_sampleh_2D(TSHARP i, SSHARP s, float2 c);` | | | `half4 __ockl_image_sampleh_2Da(TSHARP i, SSHARP s, float4 c);` | | | `half4 __ockl_image_sampleh_3D(TSHARP i, SSHARP s, float4 c);` | | | `half4 __ockl_image_sampleh_CM(TSHARP i, SSHARP s, float4 c);` | | | `half4 __ockl_image_sampleh_CMa(TSHARP i, SSHARP s, float4 c);` | | | - | | | `half4 __ockl_image_sampleh_grad_1D(TSHARP i, SSHARP s, float c, float dx, float dy);` | Sample mipmapped image using gradient returning half precision | | `half4 __ockl_image_sampleh_grad_1Da(TSHARP i, SSHARP s, float2 c, float dx, float dy);` | | | `half4 __ockl_image_sampleh_grad_2D(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | | | `half4 __ockl_image_sampleh_grad_2Da(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | | | `half4 __ockl_image_sampleh_grad_3D(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy);` | | | - | | | `half4 __ockl_image_sampleh_lod_1D(TSHARP i, SSHARP s, float c, float l);` | Sample mipmapped image using LOD returning half precision | | `half4 __ockl_image_sampleh_lod_1Da(TSHARP i, SSHARP s, float2 c, float l);` | | | 
`half4 __ockl_image_sampleh_lod_2D(TSHARP i, SSHARP s, float2 c, float l);` | | | `half4 __ockl_image_sampleh_lod_2Da(TSHARP i, SSHARP s, float4 c, float l);` | | | `half4 __ockl_image_sampleh_lod_3D(TSHARP i, SSHARP s, float4 c, float l);` | | | `half4 __ockl_image_sampleh_lod_CM(TSHARP i, SSHARP s, float4 c, float l);` | | | `half4 __ockl_image_sampleh_lod_CMa(TSHARP i, SSHARP s, float4 c, float l);` | | | - | | | `float4 __ockl_image_gather4r_2D(TSHARP i, SSHARP s, float2 c);` | Gather 2x2 channel from image | | `float4 __ockl_image_gather4g_2D(TSHARP i, SSHARP s, float2 c);` | | | `float4 __ockl_image_gather4b_2D(TSHARP i, SSHARP s, float2 c);` | | | `float4 __ockl_image_gather4a_2D(TSHARP i, SSHARP s, float2 c);` | | | - | | | `int __ockl_image_array_size_1Da(TSHARP i);` | Get image array size | | `int __ockl_image_array_size_2Da(TSHARP i);` | | | `int __ockl_image_array_size_2Dad(TSHARP i);` | | | `int __ockl_image_array_size_CMa(TSHARP i);` | | | - | | | `int __ockl_image_channel_data_type_1D(TSHARP i);` | Get image channel data type | | `int __ockl_image_channel_data_type_1Da(TSHARP i);` | | | `int __ockl_image_channel_data_type_1Db(TSHARP i);` | | | `int __ockl_image_channel_data_type_2D(TSHARP i);` | | | `int __ockl_image_channel_data_type_2Da(TSHARP i);` | | | `int __ockl_image_channel_data_type_2Dad(TSHARP i);` | | | `int __ockl_image_channel_data_type_2Dd(TSHARP i);` | | | `int __ockl_image_channel_data_type_3D(TSHARP i);` | | | `int __ockl_image_channel_data_type_CM(TSHARP i);` | | | `int __ockl_image_channel_data_type_CMa(TSHARP i);` | | | - | | | `int __ockl_image_channel_order_1D(TSHARP i);` | Get image channel order | | `int __ockl_image_channel_order_1Da(TSHARP i);` | | | `int __ockl_image_channel_order_1Db(TSHARP i);` | | | `int __ockl_image_channel_order_2D(TSHARP i);` | | | `int __ockl_image_channel_order_2Da(TSHARP i);` | | | `int __ockl_image_channel_order_2Dad(TSHARP i);` | | | `int __ockl_image_channel_order_2Dd(TSHARP i);` | | | `int 
__ockl_image_channel_order_3D(TSHARP i);` | | | `int __ockl_image_channel_order_CM(TSHARP i);` | | | `int __ockl_image_channel_order_CMa(TSHARP i);` | | | - | | | `int __ockl_image_depth_3D(TSHARP i);` | Get 3D image depth | | - | | | `int __ockl_image_height_2D(TSHARP i);` | Get image height | | `int __ockl_image_height_2Da(TSHARP i);` | | | `int __ockl_image_height_2Dad(TSHARP i);` | | | `int __ockl_image_height_2Dd(TSHARP i);` | | | `int __ockl_image_height_3D(TSHARP i);` | | | `int __ockl_image_height_CM(TSHARP i);` | | | `int __ockl_image_height_CMa(TSHARP i);` | | | - | | | `int __ockl_image_num_mip_levels_1D(TSHARP i);` | Get number of levels in mipmapped image | | `int __ockl_image_num_mip_levels_1Da(TSHARP i);` | | | `int __ockl_image_num_mip_levels_2D(TSHARP i);` | | | `int __ockl_image_num_mip_levels_2Da(TSHARP i);` | | | `int __ockl_image_num_mip_levels_2Dad(TSHARP i);` | | | `int __ockl_image_num_mip_levels_2Dd(TSHARP i);` | | | `int __ockl_image_num_mip_levels_3D(TSHARP i);` | | | `int __ockl_image_num_mip_levels_CM(TSHARP i);` | | | `int __ockl_image_num_mip_levels_CMa(TSHARP i);` | | | - | | | `int __ockl_image_width_1D(TSHARP i);` | Get image width | | `int __ockl_image_width_1Da(TSHARP i);` | | | `int __ockl_image_width_1Db(TSHARP i);` | | | `int __ockl_image_width_2D(TSHARP i);` | | | `int __ockl_image_width_2Da(TSHARP i);` | | | `int __ockl_image_width_2Dad(TSHARP i);` | | | `int __ockl_image_width_2Dd(TSHARP i);` | | | `int __ockl_image_width_3D(TSHARP i);` | | | `int __ockl_image_width_CM(TSHARP i);` | | | `int __ockl_image_width_CMa(TSHARP i);` | | | - | | | `size_t __ockl_get_global_offset(uint);` | Get grid global offset (OpenCL) of dimension | | `size_t __ockl_get_global_id(uint);` | Get workitem global ID of dimension | | `size_t __ockl_get_local_id(uint);` | Get workitem local ID of dimension | | `size_t __ockl_get_group_id(uint);` | Get ID of group workitem resides in of dimension | | `size_t __ockl_get_global_size(uint);` | Get global 
size of dimension | | `size_t __ockl_get_local_size(uint);` | Get local size of dimension | | `size_t __ockl_get_num_groups(uint);` | Get number of groups in dimension | | `uint __ockl_get_work_dim(void);` | Get grid number of dimensions | | `size_t __ockl_get_enqueued_local_size(uint);` | Get enqueued local size of dimension | | `size_t __ockl_get_global_linear_id(void);` | Get global linear ID of workitem| | `size_t __ockl_get_local_linear_id(void);` | Get local linear ID of workitem | | - | | | `bool __ockl_is_local_addr(const void *);` | Test if generic address is local | | `bool __ockl_is_private_addr(const void *);` | Test if generic address is private | | `__global void * __ockl_to_global(void *);` | Convert generic address to global address | | `__local void * __ockl_to_local(void *);` | Convert generic address to local address | | `__private void * __ockl_to_private(void *);` | Convert generic address to private address | ROCm-Device-Libs-rocm-5.0.0/doc/OCML.md000066400000000000000000000254021415221260100171540ustar00rootroot00000000000000# OCML User Guide * [Introduction](#introduction) * [What Is OCML](#what-is-ocml) * [Using OCML](#using-ocml) * [Standard Usage](#standard-usage) * [Controls](#controls) * [Versioning](#versioning) * [Tables](#tables) * [Naming convention](#naming-convention) * [Supported functions](#supported-functions) ## Introduction ### What Is OCML OCML is an LLVM-IR bitcode library designed to relieve language compiler and runtime implementers of the burden of implementing efficient and accurate mathematical functions. It is essentially a “libm” in intermediate representation with a fixed, simple API that can be linked in to supply the implementations of most standard low-level mathematical functions provided by the language. 
## Using OCML ### Standard Usage OCML is expected to be used in a standard LLVM compilation flow as follows: * Compile source modules to LLVM-IR bitcode (clang) * Link program bitcode, “wrapper” bitcode, OCML bitcode, other device library bitcode, and OCML control functions (llvm-link) * Generic optimizations (opt) * Code generation (llc) Here, “wrapper” bitcode denotes a thin library responsible for mapping language specific mangled built-in function calls as produced by clang to the OCML API. An example for handling "sqrt" might look like extern "C" __attribute__((const)) float __ocml_sqrt_f32(float); float sqrt(float x) { return __ocml_sqrt_f32(x); } The next section describes OCML controls and how to use them. ### Controls OCML (and a few other device libraries) requires a number of control variable definitions to be provided. These definitions may be provided by linking in specific OCLC libraries which define one specifically named variable or via other runtime specific means. These variables are known at optimization time and optimizations will result in specific paths taken with no control flow overhead. These variables all have the form (in C) `__constant const int __oclc_<control> = N;` The currently supported control `<control>`s and values `N` are * `finite_only_opt` - floating point Inf and NaN are never expected to be consumed or produced. `N` may be 1 (on/true/enabled), or 0 (off/false/disabled). * `unsafe_math_opt` - lower accuracy results may be produced with higher performance. `N` may be 1 (on/true/enabled) or 0 (off/false/disabled). * `daz_opt` - subnormal values consumed and produced may be flushed to zero. `N` may be 1 (on/true/enabled) or 0 (off/false/disabled). * `correctly_rounded_sqrt32` - float square root must be correctly rounded. `N` may be 1 (on/true/enabled) or 0 (off/false/disabled). * `wavefrontsize64` - the wave front size is 64. `N` may be 1 (on/true/enabled) or 0 (off/false/disabled). Very few current devices support a value of 0.
* `ISA_version` - an integer representation of the ISA version of the target device The language runtime can link a specific set of OCLC control libraries to properly configure OCML and other device libraries which also use the controls. If linking OCLC libraries is used to define the control variables, then the runtime must link in: - Exactly one of `oclc_correctly_rounded_sqrt_on.amdgcn.bc` or `oclc_correctly_rounded_sqrt_off.amdgcn.bc` depending on the kernel's requirements - Exactly one of `oclc_daz_opt_on.amdgcn.bc` or `oclc_daz_opt_off.amdgcn.bc` depending on the kernel's requirements - Exactly one of `oclc_finite_only_on.amdgcn.bc` or `oclc_finite_only_off.amdgcn.bc` depending on the kernel's requirements - Exactly one of `oclc_unsafe_math_on.amdgcn.bc` or `oclc_unsafe_math_off.amdgcn.bc` depending on the kernel's requirements - Exactly one of `oclc_wavefrontsize64_on.amdgcn.bc` or `oclc_wavefrontsize64_off.amdgcn.bc` depending on the kernel's requirements - Exactly one of `oclc_isa_version_XYZ.amdgcn.bc` where XYZ is the suffix of the `gfxXYZ` target name the kernel is being compiled for. If these rules are not followed, link time or execution time errors may result. ### Versioning OCML ships within the larger release as a single LLVM-IR bitcode file named ocml.amdgcn.bc Bitcode linking errors are possible if the library is not in-sync with the compiler shipped with the same release. ### Tables Some OCML functions require access to tables of constants. These tables are currently named with the prefix `__ocmltbl_` and are placed in LLVM address space 2.
### Naming convention OCML functions follow a simple naming convention: __ocml_{function}_{type suffix} where `{function}` is generally the familiar libm name of the function, and `{type suffix}` indicates the type of the floating point arguments or results, and is one of * `f16` – 16 bit floating point (half precision) * `f32` – 32 bit floating point (single precision) * `f64` – 64 bit floating point (double precision) For example, `__ocml_sqrt_f32` is the name of the OCML single precision square root function. OCML does not currently support higher precision than double precision due to the lack of hardware support for such precisions. ### Supported functions The following table contains a list of {function} currently supported by OCML, a brief description of each, and the maximum relative error in ULPs for each floating point type. A “c” in the last 3 columns indicates that the function is required to be correctly rounded. | **{function}** | **Description** | **f32 max err** | **f64 max err** | **f16 max err** | | --- | --- | --- | --- | --- | | acos | arc cosine | 4 | 4 | 2 | | acosh | arc hyperbolic cosine | 4 | 4 | 2 | | acospi | arc cosine / π | 5 | 5 | 2 | | add_{rm} | add with specific rounding mode | c | c | c | | asin | arc sine | 4 | 4 | 2 | | asinh | arc hyperbolic sine | 4 | 4 | 2 | | asinpi | arc sine / pi | 5 | 5 | 2 | | atan2 | two argument arc tangent | 6 | 6 | 2 | | atan2pi | two argument arc tangent / pi | 6 | 6 | 2 | | atan | single argument arc tangent | 5 | 5 | 2 | | atanh | arc hyperbolic tangent | 5 | 5 | 2 | | atanpi | single argument arc tangent / pi | 5 | 5 | 2 | | cbrt | cube root | 2 | 2 | 2 | | ceil | round upwards to integer | c | c | c | | copysign | copy sign of second argument to absolute value of first | 0 | 0 | 0 | | cos | cosine | 4 | 4 | 2 | | cosh | hyperbolic cosine | 4 | 4 | 2 | | cospi | cosine of argument times pi | 4 | 4 | 2 | | div_{rm} | correctly rounded division with specific rounding mode | c | c | c | | erf | error
function | 16 | 16 | 4 | | erfc | complementary error function | 16 | 16 | 4 | | erfcinv | inverse complementary error function | 7 | 8 | 3 | | erfcx | scaled complementary error function | 6 | 6 | 2 | | erfinv | inverse error function | 3 | 8 | 2 | | exp10 | 10<sup>x</sup> | 3 | 3 | 2 | | exp2 | 2<sup>x</sup> | 3 | 3 | 2 | | exp | e<sup>x</sup> | 3 | 3 | 2 | | expm1 | e<sup>x</sup> - 1, accurate at 0 | 3 | 3 | 2 | | fabs | absolute value | 0 | 0 | 0 | | fdim | positive difference | c | c | c | | floor | round downwards to integer | c | c | c | | fma[_{rm}] | fused (i.e. singly rounded) multiply-add, with optional specific rounding | c | c | c | | fmax | maximum, avoids NaN | 0 | 0 | 0 | | fmin | minimum, avoids NaN | 0 | 0 | 0 | | fmod | floating point remainder | 0 | 0 | 0 | | fpclassify | classify floating point | - | - | - | | fract | fractional part | c | c | c | | frexp | extract significand and exponent | 0 | 0 | 0 | | hypot | length, with overflow control | 4 | 4 | 2 | | i0 | modified Bessel function of the first kind, order 0, I<sub>0</sub> | 6 | 6 | 2 | | i1 | modified Bessel function of the first kind, order 1, I<sub>1</sub> | 6 | 6 | 2 | | ilogb | extract exponent | 0 | 0 | 0 | | isfinite | tests finiteness | - | - | - | | isinf | test for Inf | - | - | - | | isnan | test for NaN | - | - | - | | isnormal | test for normal | - | - | - | | j0 | Bessel function of the first kind, order 0, J<sub>0</sub> | 6 (<12) | 6 (<12) | 2 (<12) | | j1 | Bessel function of the first kind, order 1, J<sub>1</sub> | 6 (<12) | 6 (<12) | 2 (<12) | | ldexp | multiply by 2 raised to an integral power | c | c | c | | len3 | three argument hypot | 2 | 2 | 2| | len4 | four argument hypot | 2 | 2 | 2| | lgamma | log Γ function | 6(>0) | 4(>0) | 3(>0) | | lgamma_r | log Γ function with sign | 6(>0) | 4(>0) | 3(>0) | | log10 | log base 10 | 3 | 3 | 2 | | log1p | log base e of (1 + x), accurate for x near 0 | 2 | 2 | 2 | | log2 | log base 2 | 3 | 3 | 2 | | log | log base e | 3 | 3 | 2 | | logb | extract exponent | 0 | 0 | 0 | | mad | multiply-add, implementation defined if fused | c | c | c | | max |
maximum without special NaN handling | 0 | 0 | 0 | | maxmag | maximum magnitude | 0 | 0 | 0 | | min | minimum without special NaN handling | 0 | 0 | 0 | | minmag | minimum magnitude | 0 | 0 | 0 | | modf | extract integer and fraction | 0 | 0 | 0 | | mul_{rm} | multiply with specific rounding mode | c | c | c | | nan | produce a NaN with a specific payload | 0 | 0 | 0 | | ncdf | standard normal cumulative distribution function | 16 | 16 | 4 | | ncdfinv | inverse standard normal cumulative distribution function | 16 | 16 | 4 | | nearbyint | round to nearest integer (see also rint) | 0 | 0 | 0 | | nextafter | next closest value above or below | 0 | 0 | 0 | | pow | general power | 16 | 16 | 4 | | pown | power with integral exponent | 16 | 16 | 4 | | powr | power with non-negative base | 16 | 16 | 4 | | rcbrt | reciprocal cube root | 2 | 2 | 2 | | remainder | floating point remainder | 0 | 0 | 0 | | remquo | floating point remainder and lowest integral quotient bits | 0 | 0 | 0 | | rhypot | reciprocal hypot | 2 | 2 | 2 | | rint | round to nearest integer | c | c | c | | rlen3 | reciprocal len3 | 2 | 2 | 2 | | rlen4 | reciprocal len4 | 2 | 2 | 2 | | rootn | nth root | 16 | 16 | 4 | | round | round to nearest integer, halfway cases away from 0 | c | c | c | | rsqrt | reciprocal square root | 2 | 2 | 1 | | scalb | multiply by 2 raised to a power | c | c | c | | scalbn | multiply by 2 raised to an integral power (see also ldexp) | c | c | c | | signbit | nonzero if argument has sign bit set | - | - | - | | sin | sine function | 4 | 4 | 2 | | sincos | simultaneous sine and cosine evaluation | 4 | 4 | 2 | | sincospi | sincos function of argument times pi | 4 | 4 | 2 | | sinh | hyperbolic sine | 4 | 4 | 2 | | sinpi | sine of argument times pi | 4 | 4 | 2 | | sqrt | square root | 3/c | 3/c | c | | sub_{rm} | subtract with specific rounding mode | c | c | c | | tan | tangent | 5 | 5 | 2 | | tanh | hyperbolic tangent | 5 | 5 | 2 | | tanpi | tangent of argument times pi | 6 | 6 | 2
| | tgamma | true Γ function | 16 | 16 | 4 | | trunc | round to integer, towards zero | c | c | c | | y0 | Bessel function of the second kind, order 0, Y0 | 2 (<12) | 6 (<12) | 6 (<12) | | y1 | Bessel function of the second kind, order 1, Y1 | 2 (<12) | 6 (<12) | 6 (<12) | For the functions supporting specific roundings, the rounding mode {rm} can be one of * `rte` – round towards nearest even * `rtp` – round towards positive infinity * `rtn` – round towards negative infinity * `rtz` – round towards zero Note that these functions are not currently available. ROCm-Device-Libs-rocm-5.0.0/hip/000077500000000000000000000000001415221260100161105ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/hip/CMakeLists.txt000066400000000000000000000013371415221260100206540ustar00rootroot00000000000000##===-------------------------------------------------------------------------- ## ROCm Device Libraries ## ## This file is distributed under the University of Illinois Open Source ## License. See LICENSE.TXT for details. 
##===-------------------------------------------------------------------------- file(GLOB cl_sources ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cl ) file(GLOB ll_sources ${CMAKE_CURRENT_SOURCE_DIR}/src/*.ll ) file(GLOB sources ${cl_sources} ${ll_sources}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../ocml/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../ockl/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../irif/inc) opencl_bc_lib(NAME hip SOURCES ${sources}) ROCm-Device-Libs-rocm-5.0.0/hip/src/000077500000000000000000000000001415221260100166775ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/hip/src/atomic_fence.cl000066400000000000000000000050011415221260100216270ustar00rootroot00000000000000#include "ockl.h" #include "irif.h" #define ATTR2 __attribute__((always_inline)) ATTR2 void __atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope) { // We're tying global-happens-before and local-happens-before together as does HSA if (order != memory_order_relaxed) { switch (scope) { case memory_scope_work_item: break; case memory_scope_sub_group: switch (order) { case memory_order_relaxed: break; case memory_order_acquire: __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront"); break; case memory_order_release: __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront"); break; case memory_order_acq_rel: __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "wavefront"); break; case memory_order_seq_cst: __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "wavefront"); break; } break; case memory_scope_work_group: switch (order) { case memory_order_relaxed: break; case memory_order_acquire: __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); break; case memory_order_release: __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); break; case memory_order_acq_rel: __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup"); break; case memory_order_seq_cst: __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); break; } break; case memory_scope_device: 
switch (order) { case memory_order_relaxed: break; case memory_order_acquire: __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); break; case memory_order_release: __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); break; case memory_order_acq_rel: __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); break; case memory_order_seq_cst: __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); break; } break; case memory_scope_all_svm_devices: switch (order) { case memory_order_relaxed: break; case memory_order_acquire: __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, ""); break; case memory_order_release: __builtin_amdgcn_fence(__ATOMIC_RELEASE, ""); break; case memory_order_acq_rel: __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, ""); break; case memory_order_seq_cst: __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); break; } break; } } } ROCm-Device-Libs-rocm-5.0.0/irif/000077500000000000000000000000001415221260100162615ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/irif/CMakeLists.txt000066400000000000000000000016041415221260100210220ustar00rootroot00000000000000##===-------------------------------------------------------------------------- ## ROCm Device Libraries ## ## This file is distributed under the University of Illinois Open Source ## License. See LICENSE.TXT for details. 
##===-------------------------------------------------------------------------- file(GLOB sources ${CMAKE_CURRENT_SOURCE_DIR}/src/*.ll ) set(irif_lib_file ${CMAKE_CURRENT_BINARY_DIR}/irif.bc) add_custom_command(OUTPUT ${irif_lib_file} COMMAND $ ${sources} -o ${irif_lib_file} DEPENDS ${sources}) add_custom_target(irif DEPENDS ${irif_lib_file} SOURCES ${sources}) set_target_properties(irif PROPERTIES OUTPUT_NAME ${irif_lib_file} ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" ARCHIVE_OUTPUT_NAME "irif") if(NOT ROCM_DEVICELIB_STANDALONE_BUILD) add_dependencies(irif llvm-link) endif() ROCm-Device-Libs-rocm-5.0.0/irif/inc/000077500000000000000000000000001415221260100170325ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/irif/inc/irif.h000066400000000000000000000520721415221260100201420ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #ifndef IRIF_H #define IRIF_H #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define REQUIRES_16BIT_INSTS __attribute__((target("16-bit-insts"))) #define REQUIRES_GFX9_INSTS __attribute__((target("gfx9-insts"))) // Generic intrinsics extern __attribute__((const)) float2 __llvm_fma_2f32(float2, float2, float2) __asm("llvm.fma.v2f32"); extern __attribute__((const)) half2 __llvm_fma_2f16(half2, half2, half2) __asm("llvm.fma.v2f16"); extern __attribute__((const)) half2 __llvm_fabs_2f16(half2) __asm("llvm.fabs.v2f16"); extern __attribute__((const)) half2 __llvm_minnum_2f16(half2, half2) __asm("llvm.minnum.v2f16"); extern __attribute__((const)) half2 __llvm_maxnum_2f16(half2, half2) __asm("llvm.maxnum.v2f16"); extern __attribute__((const)) half2 __llvm_copysign_2f16(half2, half2) __asm("llvm.copysign.v2f16"); extern __attribute__((const)) half2 __llvm_floor_2f16(half2) __asm("llvm.floor.v2f16"); extern __attribute__((const)) half2 __llvm_ceil_2f16(half2) __asm("llvm.ceil.v2f16"); extern __attribute__((const)) half2 __llvm_trunc_2f16(half2) __asm("llvm.trunc.v2f16"); extern __attribute__((const)) half2 __llvm_round_2f16(half2) __asm("llvm.round.v2f16"); extern __attribute__((const)) half2 __llvm_rint_2f16(half2) __asm("llvm.rint.v2f16"); extern __attribute__((const)) half2 __llvm_canonicalize_2f16(half2) __asm("llvm.canonicalize.v2f16"); // Intrinsics requiring wrapping extern __attribute__((const)) uchar __llvm_ctlz_i8(uchar); extern __attribute__((const)) ushort __llvm_ctlz_i16(ushort); extern __attribute__((const)) uint __llvm_ctlz_i32(uint); extern __attribute__((const)) ulong __llvm_ctlz_i64(ulong); extern __attribute__((const)) uchar __llvm_cttz_i8(uchar); extern __attribute__((const)) ushort __llvm_cttz_i16(ushort); extern __attribute__((const)) uint __llvm_cttz_i32(uint); extern __attribute__((const)) ulong __llvm_cttz_i64(ulong); // Atomics extern uint 
__llvm_ld_atomic_a1_x_dev_i32(__global uint *); extern ulong __llvm_ld_atomic_a1_x_dev_i64(__global ulong *); extern uint __llvm_ld_atomic_a3_x_wg_i32(__local uint *); extern ulong __llvm_ld_atomic_a3_x_wg_i64(__local ulong *); extern void __llvm_st_atomic_a1_x_dev_i32(__global uint *, uint); extern void __llvm_st_atomic_a1_x_dev_i64(__global ulong *, ulong); extern void __llvm_st_atomic_a3_x_wg_i32(__local uint *, uint); extern void __llvm_st_atomic_a3_x_wg_i64(__local ulong *, ulong); extern uint __llvm_atomic_add_a1_x_dev_i32(__global uint *, uint); extern ulong __llvm_atomic_add_a1_x_dev_i64(__global ulong *, ulong); extern uint __llvm_atomic_add_a3_x_wg_i32(__local uint *, uint); extern ulong __llvm_atomic_add_a3_x_wg_i64(__local ulong *, ulong); extern uint __llvm_atomic_and_a1_x_dev_i32(__global uint *, uint); extern ulong __llvm_atomic_and_a1_x_dev_i64(__global ulong *, ulong); extern uint __llvm_atomic_and_a3_x_wg_i32(__local uint *, uint); extern ulong __llvm_atomic_and_a3_x_wg_i64(__local ulong *, ulong); extern uint __llvm_atomic_or_a1_x_dev_i32(__global uint *, uint); extern ulong __llvm_atomic_or_a1_x_dev_i64(__global ulong *, ulong); extern uint __llvm_atomic_or_a3_x_wg_i32(__local uint *, uint); extern ulong __llvm_atomic_or_a3_x_wg_i64(__local ulong *, ulong); extern uint __llvm_atomic_max_a1_x_dev_i32(__global int *, int); extern uint __llvm_atomic_umax_a1_x_dev_i32(__global uint *, uint); extern ulong __llvm_atomic_max_a1_x_dev_i64(__global long *, long); extern ulong __llvm_atomic_umax_a1_x_dev_i64(__global ulong *, ulong); extern uint __llvm_atomic_max_a3_x_wg_i32(__local int *, int); extern uint __llvm_atomic_umax_a3_x_wg_i32(__local uint *, uint); extern ulong __llvm_atomic_max_a3_x_wg_i64(__local long *, long); extern ulong __llvm_atomic_umax_a3_x_wg_i64(__local ulong *, ulong); extern uint __llvm_atomic_min_a1_x_dev_i32(__global int *, int); extern uint __llvm_atomic_umin_a1_x_dev_i32(__global uint *, uint); extern ulong 
__llvm_atomic_min_a1_x_dev_i64(__global long *, long); extern ulong __llvm_atomic_umin_a1_x_dev_i64(__global ulong *, ulong); extern uint __llvm_atomic_min_a3_x_wg_i32(__local int *, int); extern uint __llvm_atomic_umin_a3_x_wg_i32(__local uint *, uint); extern ulong __llvm_atomic_min_a3_x_wg_i64(__local long *, long); extern ulong __llvm_atomic_umin_a3_x_wg_i64(__local ulong *, ulong); extern uint __llvm_cmpxchg_a1_x_x_dev_i32(__global uint *, uint, uint); extern ulong __llvm_cmpxchg_a1_x_x_dev_i64(__global ulong *, ulong, ulong); extern uint __llvm_cmpxchg_a3_x_x_wg_i32(__local uint *, uint, uint); extern ulong __llvm_cmpxchg_a3_x_x_wg_i64(__local ulong *, ulong, ulong); // AMDGPU intrinsics // llvm.amdgcn.mov.dpp.i32 // llvm.amdgcn.update.dpp.i32 extern uint __llvm_amdgcn_update_dpp_i32(uint, uint, uint, uint, uint, bool) __asm("llvm.amdgcn.update.dpp.i32"); // llvm.amdgcn.mov.dpp8.i32 extern uint __llvm_amdgcn_dpp8_i32(uint, uint) __asm("llvm.amdgcn.dpp8.i32"); // llvm.amdgcn.permlane16 extern uint __llvm_amdgcn_permlane16(uint, uint, uint, uint, bool, bool) __asm("llvm.amdgcn.permlane16"); // llvm.amdgcn.permlanex16 extern uint __llvm_amdgcn_permlanex16(uint, uint, uint, uint, bool, bool) __asm("llvm.amdgcn.permlanex16"); extern __attribute__((const, convergent)) ulong __llvm_amdgcn_icmp_i64_i32(uint, uint, uint) __asm("llvm.amdgcn.icmp.i64.i32"); extern __attribute__((const, convergent)) ulong __llvm_amdgcn_icmp_i64_i64(ulong, ulong, uint) __asm("llvm.amdgcn.icmp.i64.i64"); extern __attribute__((const, convergent)) ulong __llvm_amdgcn_fcmp_i64_f32(float, float, uint) __asm("llvm.amdgcn.fcmp.i64.f32"); extern __attribute__((const, convergent)) ulong __llvm_amdgcn_fcmp_i64_f64(double, double, uint) __asm("llvm.amdgcn.fcmp.i64.f64"); extern __attribute__((const, convergent)) uint __llvm_amdgcn_icmp_i32_i32(uint, uint, uint) __asm("llvm.amdgcn.icmp.i32.i32"); extern __attribute__((const, convergent)) uint __llvm_amdgcn_icmp_i32_i64(ulong, ulong, uint) 
__asm("llvm.amdgcn.icmp.i32.i64"); extern __attribute__((const, convergent)) uint __llvm_amdgcn_fcmp_i32_f32(float, float, uint) __asm("llvm.amdgcn.fcmp.i32.f32"); extern __attribute__((const, convergent)) uint __llvm_amdgcn_fcmp_i32_f64(double, double, uint) __asm("llvm.amdgcn.fcmp.i32.f64"); // Buffer Load/Store extern __attribute__((pure)) float4 __llvm_amdgcn_struct_buffer_load_format_v4f32(uint4 rsrc, uint vindex, uint voffset, uint soffset, uint cachepolicy) __asm("llvm.amdgcn.struct.buffer.load.format.v4f32"); extern __attribute__((pure)) half4 __llvm_amdgcn_struct_buffer_load_format_v4f16(uint4 rsrc, uint vindex, uint voffset, uint soffset, uint cachepolicy) __asm("llvm.amdgcn.struct.buffer.load.format.v4f16"); extern void __llvm_amdgcn_struct_buffer_store_format_v4f32(float4 vdata, uint4 rsrc, uint vindex, uint voffset, uint soffset, uint cachepolicy) __asm("llvm.amdgcn.struct.buffer.store.format.v4f32"); extern void __llvm_amdgcn_struct_buffer_store_format_v4f16( half4 vdata, uint4 rsrc, uint vindex, uint voffset, uint soffset, uint cachepolicy) __asm("llvm.amdgcn.struct.buffer.store.format.v4f16"); // Image load, store, sample, gather extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_1d_v4f32_i32(uint ix, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_2d_v4f32_i32(uint ix, uint iy, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_3d_v4f32_i32(uint ix, uint iy, uint iz, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_cube_v4f32_i32(uint ix, uint iy, uint iface, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_1darray_v4f32_i32(uint ix, uint islice, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_2darray_v4f32_i32(uint ix, uint iy, uint islice, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_mip_1d_v4f32_i32(uint ix, uint imip, uint8 t); extern __attribute__((pure)) float4 
__llvm_amdgcn_image_load_mip_2d_v4f32_i32(uint ix, uint iy, uint imip, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_mip_3d_v4f32_i32(uint ix, uint iy, uint iz, uint imip, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_mip_cube_v4f32_i32(uint ix, uint iy, uint iface, uint imip, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_mip_1darray_v4f32_i32(uint ix, uint islice, uint imip, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_load_mip_2darray_v4f32_i32(uint ix, uint iy, uint islice, uint imip, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_1d_v4f16_i32(uint ix, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_2d_v4f16_i32(uint ix, uint iy, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_3d_v4f16_i32(uint ix, uint iy, uint iz, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_cube_v4f16_i32(uint ix, uint iy, uint iface, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_1darray_v4f16_i32(uint ix, uint islice, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_2darray_v4f16_i32(uint ix, uint iy, uint islice, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_mip_1d_v4f16_i32(uint ix, uint imip, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_mip_2d_v4f16_i32(uint ix, uint iy, uint imip, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_mip_3d_v4f16_i32(uint ix, uint iy, uint iz, uint imip, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_mip_cube_v4f16_i32(uint ix, uint iy, uint iface, uint imip, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_mip_1darray_v4f16_i32(uint ix, uint islice, uint imip, uint8 t); extern __attribute__((pure)) half4 __llvm_amdgcn_image_load_mip_2darray_v4f16_i32(uint ix, uint iy, uint islice, uint imip, uint8 t); extern 
__attribute__((pure)) float __llvm_amdgcn_image_load_2d_f32_i32(uint ix, uint iy, uint8 t); extern __attribute__((pure)) float __llvm_amdgcn_image_load_2darray_f32_i32(uint ix, uint iy, uint islice, uint8 t); extern __attribute__((pure)) float __llvm_amdgcn_image_load_mip_2d_f32_i32(uint ix, uint iy, uint imip, uint8 t); extern __attribute__((pure)) float __llvm_amdgcn_image_load_mip_2darray_f32_i32(uint ix, uint iy, uint islice, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_1d_v4f32_i32(float4 pix, uint ix, uint8 t); extern void __llvm_amdgcn_image_store_2d_v4f32_i32(float4 pix, uint ix, uint iy, uint8 t); extern void __llvm_amdgcn_image_store_3d_v4f32_i32(float4 pix, uint ix, uint iy, uint iz, uint8 t); extern void __llvm_amdgcn_image_store_cube_v4f32_i32(float4 pix, uint ix, uint iy, uint iface, uint8 t); extern void __llvm_amdgcn_image_store_1darray_v4f32_i32(float4 pix, uint ix, uint islice, uint8 t); extern void __llvm_amdgcn_image_store_2darray_v4f32_i32(float4 pix, uint ix, uint iy, uint islice, uint8 t); extern void __llvm_amdgcn_image_store_mip_1d_v4f32_i32(float4 pix, uint ix, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_2d_v4f32_i32(float4 pix, uint ix, uint iy, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_3d_v4f32_i32(float4 pix, uint ix, uint iy, uint iz, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_cube_v4f32_i32(float4 pix, uint ix, uint iy, uint iface, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_1darray_v4f32_i32(float4 pix, uint ix, uint islice, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_2darray_v4f32_i32(float4 pix, uint ix, uint iy, uint islice, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_1d_v4f16_i32(half4 pix, uint ix, uint8 t); extern void __llvm_amdgcn_image_store_2d_v4f16_i32(half4 pix, uint ix, uint iy, uint8 t); extern void __llvm_amdgcn_image_store_3d_v4f16_i32(half4 pix, uint ix, uint iy, uint iz, uint8 t); extern 
void __llvm_amdgcn_image_store_cube_v4f16_i32(half4 pix, uint ix, uint iy, uint iface, uint8 t); extern void __llvm_amdgcn_image_store_1darray_v4f16_i32(half4 pix, uint ix, uint islice, uint8 t); extern void __llvm_amdgcn_image_store_2darray_v4f16_i32(half4 pix, uint ix, uint iy, uint islice, uint8 t); extern void __llvm_amdgcn_image_store_mip_1d_v4f16_i32(half4 pix, uint ix, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_2d_v4f16_i32(half4 pix, uint ix, uint iy, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_3d_v4f16_i32(half4 pix, uint ix, uint iy, uint iz, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_cube_v4f16_i32(half4 pix, uint ix, uint iy, uint iface, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_1darray_v4f16_i32(half4 pix, uint ix, uint islice, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_2darray_v4f16_i32(half4 pix, uint ix, uint iy, uint islice, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_2d_f32_i32(float pix, uint ix, uint iy, uint8 t); extern void __llvm_amdgcn_image_store_2darray_f32_i32(float pix, uint ix, uint iy, uint islice, uint8 t); extern void __llvm_amdgcn_image_store_mip_2d_f32_i32(float pix, uint ix, uint iy, uint imip, uint8 t); extern void __llvm_amdgcn_image_store_mip_2darray_f32_i32(float pix, uint ix, uint iy, uint islice, uint imip, uint8 t); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_1d_v4f32_f32(float x, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_1d_v4f32_f32(float x, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_l_1d_v4f32_f32(float x, float lod, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_1d_v4f32_f32_f32(float dxdh, float dxdv, float x, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_2d_v4f32_f32(float x, float y, uint8 t, uint4 s); extern __attribute__((pure)) 
float4 __llvm_amdgcn_image_sample_lz_2d_v4f32_f32(float x, float y, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_l_2d_v4f32_f32(float x, float y, float lod, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_2d_v4f32_f32_f32(float dxdh, float dydh, float dxdv, float dydv, float x, float y, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_3d_v4f32_f32(float x, float y, float z, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_3d_v4f32_f32(float x, float y, float z, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_l_3d_v4f32_f32(float x, float y, float z, float lod, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_3d_v4f32_f32_f32(float dxdh, float dydh, float dzdh, float dxdv, float dydv, float dzdv, float x, float y, float z, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_cube_v4f32_f32(float x, float y, float face, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_cube_v4f32_f32(float x, float y, float face, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_l_cube_v4f32_f32(float x, float y, float face, float lod, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_1darray_v4f32_f32(float x, float slice, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_1darray_v4f32_f32(float x, float slice, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_l_1darray_v4f32_f32(float x, float slice, float lod, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_1darray_v4f32_f32_f32(float dxdh, float dxdv, float x, float slice, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_2darray_v4f32_f32(float x, float y, float 
slice, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_2darray_v4f32_f32(float x, float y, float slice, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_l_2darray_v4f32_f32(float x, float y, float slice, float lod, uint8 t, uint4 s); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_2darray_v4f32_f32_f32(float dxdh, float dydh, float dxdv, float dydv, float x, float y, float slice, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_1d_v4f16_f32(float x, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_1d_v4f16_f32(float x, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_l_1d_v4f16_f32(float x, float lod, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_d_1d_v4f16_f32_f32(float dxdh, float dxdv, float x, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_2d_v4f16_f32(float x, float y, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_2d_v4f16_f32(float x, float y, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_l_2d_v4f16_f32(float x, float y, float lod, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_d_2d_v4f16_f32_f32(float dxdh, float dydh, float dxdv, float dydv, float x, float y, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_3d_v4f16_f32(float x, float y, float z, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_3d_v4f16_f32(float x, float y, float z, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_l_3d_v4f16_f32(float x, float y, float z, float lod, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_d_3d_v4f16_f32_f32(float dxdh, float dydh, float dzdh, float dxdv, float dydv, float dzdv, 
float x, float y, float z, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_cube_v4f16_f32(float x, float y, float face, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_cube_v4f16_f32(float x, float y, float face, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_l_cube_v4f16_f32(float x, float y, float face, float lod, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_1darray_v4f16_f32(float x, float slice, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_1darray_v4f16_f32(float x, float slice, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_l_1darray_v4f16_f32(float x, float slice, float lod, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_d_1darray_v4f16_f32_f32(float dxdh, float dxdv, float x, float slice, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_2darray_v4f16_f32(float x, float y, float slice, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_2darray_v4f16_f32(float x, float y, float slice, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_l_2darray_v4f16_f32(float x, float y, float slice, float lod, uint8 t, uint4 s); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_d_2darray_v4f16_f32_f32(float dxdh, float dydh, float dxdv, float dydv, float x, float y, float slice, uint8 t, uint4 s); extern __attribute__((pure)) float __llvm_amdgcn_image_sample_2d_f32_f32(float x, float y, uint8 t, uint4 s); extern __attribute__((pure)) float __llvm_amdgcn_image_sample_lz_2d_f32_f32(float x, float y, uint8 t, uint4 s); extern __attribute__((pure)) float __llvm_amdgcn_image_sample_l_2d_f32_f32(float x, float y, float lod, uint8 t, uint4 s); extern __attribute__((pure)) float __llvm_amdgcn_image_sample_d_2d_f32_f32_f32(float dxdh, 
float dydh, float dxdv, float dydv, float x, float y, uint8 t, uint4 s);

// Single-channel (float) sampling entry points for 2D-array images.
// The _lz/_l/_d variants differ only in the extra lod / derivative
// arguments visible in their parameter lists.
extern __attribute__((pure)) float
__llvm_amdgcn_image_sample_2darray_f32_f32(float x, float y, float slice,
                                           uint8 t, uint4 s);
extern __attribute__((pure)) float
__llvm_amdgcn_image_sample_lz_2darray_f32_f32(float x, float y, float slice,
                                              uint8 t, uint4 s);
extern __attribute__((pure)) float
__llvm_amdgcn_image_sample_l_2darray_f32_f32(float x, float y, float slice,
                                             float lod, uint8 t, uint4 s);
extern __attribute__((pure)) float
__llvm_amdgcn_image_sample_d_2darray_f32_f32_f32(float dxdh, float dydh,
                                                 float dxdv, float dydv,
                                                 float x, float y, float slice,
                                                 uint8 t, uint4 s);

// gather4 entry points for 2D images, one per suffix (_r, _g, _b, _a).
// NOTE(review): the suffix presumably selects the gathered channel -- confirm
// against the matching definitions in imintrin.ll.
extern __attribute__((pure)) float4
__llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float x, float y,
                                              uint8 t, uint4 s);
extern __attribute__((pure)) float4
__llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float x, float y,
                                              uint8 t, uint4 s);
extern __attribute__((pure)) float4
__llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float x, float y,
                                              uint8 t, uint4 s);
extern __attribute__((pure)) float4
__llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float x, float y,
                                              uint8 t, uint4 s);

#pragma OPENCL EXTENSION cl_khr_fp16 : disable

#endif // IRIF_H
ROCm-Device-Libs-rocm-5.0.0/irif/src/000077500000000000000000000000001415221260100170505ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/irif/src/atomic.ll000066400000000000000000000171661415221260100206700ustar00rootroot00000000000000target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
target triple = "amdgcn-amd-amdhsa"

;;;
;;; TODO add synchscope(N)
;;;

;;;;; Load

; Monotonic, volatile atomic load of an i32 through an addrspace(1) pointer.
; Returns the loaded value.
define protected i32 @__llvm_ld_atomic_a1_x_dev_i32(i32 addrspace(1)* nocapture readonly %ptr) #0 {
  %val = load atomic volatile i32, i32 addrspace(1)* %ptr monotonic, align 4
  ret i32 %val
}

define protected i64 @__llvm_ld_atomic_a1_x_dev_i64(i64 addrspace(1)* nocapture readonly) #0 {
  %2 = load
atomic volatile i64, i64 addrspace(1)* %0 monotonic, align 8
  ret i64 %2
}

; Monotonic, volatile atomic load of an i32 through an addrspace(3) pointer.
define protected i32 @__llvm_ld_atomic_a3_x_wg_i32(i32 addrspace(3)* nocapture readonly %ptr) #0 {
  %val = load atomic volatile i32, i32 addrspace(3)* %ptr monotonic, align 4
  ret i32 %val
}

; Monotonic, volatile atomic load of an i64 through an addrspace(3) pointer.
define protected i64 @__llvm_ld_atomic_a3_x_wg_i64(i64 addrspace(3)* nocapture readonly %ptr) #0 {
  %val = load atomic volatile i64, i64 addrspace(3)* %ptr monotonic, align 8
  ret i64 %val
}

;;;;; Store
; Each wrapper performs one monotonic, volatile atomic store of %val to %ptr.

define protected void @__llvm_st_atomic_a1_x_dev_i32(i32 addrspace(1)* nocapture %ptr, i32 %val) #1 {
  store atomic volatile i32 %val, i32 addrspace(1)* %ptr monotonic, align 4
  ret void
}

define protected void @__llvm_st_atomic_a1_x_dev_i64(i64 addrspace(1)* nocapture %ptr, i64 %val) #1 {
  store atomic volatile i64 %val, i64 addrspace(1)* %ptr monotonic, align 8
  ret void
}

define protected void @__llvm_st_atomic_a3_x_wg_i32(i32 addrspace(3)* nocapture %ptr, i32 %val) #1 {
  store atomic volatile i32 %val, i32 addrspace(3)* %ptr monotonic, align 4
  ret void
}

define protected void @__llvm_st_atomic_a3_x_wg_i64(i64 addrspace(3)* nocapture %ptr, i64 %val) #1 {
  store atomic volatile i64 %val, i64 addrspace(3)* %ptr monotonic, align 8
  ret void
}

;;;;; Add
; Each wrapper performs one monotonic, volatile atomicrmw add and returns the
; value the instruction yields (the contents of *%ptr before the update).

define protected i32 @__llvm_atomic_add_a1_x_dev_i32(i32 addrspace(1)* nocapture %ptr, i32 %val) #1 {
  %old = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %val monotonic
  ret i32 %old
}

define protected i64 @__llvm_atomic_add_a1_x_dev_i64(i64 addrspace(1)* nocapture %ptr, i64 %val) #1 {
  %old = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %val monotonic
  ret i64 %old
}

define protected i32 @__llvm_atomic_add_a3_x_wg_i32(i32 addrspace(3)* nocapture %ptr, i32 %val) #1 {
  %old = atomicrmw volatile add i32 addrspace(3)* %ptr, i32 %val monotonic
  ret i32 %old
}

define protected i64 @__llvm_atomic_add_a3_x_wg_i64(i64 addrspace(3)* nocapture %ptr, i64 %val) #1 {
  %old = atomicrmw volatile add i64 addrspace(3)* %ptr, i64 %val monotonic
  ret i64 %old
}

;;;;; And

define protected i32 @__llvm_atomic_and_a1_x_dev_i32(i32 addrspace(1)* nocapture, i32) #1 {
  %3 = atomicrmw volatile and i32 addrspace(1)* %0, i32 %1 monotonic
ret i32 %3
}

; Remaining bitwise-and wrappers: one monotonic, volatile atomicrmw each,
; returning the value the instruction yields (the pre-update contents).
define protected i64 @__llvm_atomic_and_a1_x_dev_i64(i64 addrspace(1)* nocapture, i64) #1 {
  %3 = atomicrmw volatile and i64 addrspace(1)* %0, i64 %1 monotonic
  ret i64 %3
}

define protected i32 @__llvm_atomic_and_a3_x_wg_i32(i32 addrspace(3)* nocapture, i32) #1 {
  %3 = atomicrmw volatile and i32 addrspace(3)* %0, i32 %1 monotonic
  ret i32 %3
}

define protected i64 @__llvm_atomic_and_a3_x_wg_i64(i64 addrspace(3)* nocapture, i64) #1 {
  %3 = atomicrmw volatile and i64 addrspace(3)* %0, i64 %1 monotonic
  ret i64 %3
}

;;;;; Or
; Same shape as the and-wrappers, using atomicrmw or.

define protected i32 @__llvm_atomic_or_a1_x_dev_i32(i32 addrspace(1)* nocapture, i32) #1 {
  %3 = atomicrmw volatile or i32 addrspace(1)* %0, i32 %1 monotonic
  ret i32 %3
}

define protected i64 @__llvm_atomic_or_a1_x_dev_i64(i64 addrspace(1)* nocapture, i64) #1 {
  %3 = atomicrmw volatile or i64 addrspace(1)* %0, i64 %1 monotonic
  ret i64 %3
}

define protected i32 @__llvm_atomic_or_a3_x_wg_i32(i32 addrspace(3)* nocapture, i32) #1 {
  %3 = atomicrmw volatile or i32 addrspace(3)* %0, i32 %1 monotonic
  ret i32 %3
}

define protected i64 @__llvm_atomic_or_a3_x_wg_i64(i64 addrspace(3)* nocapture, i64) #1 {
  %3 = atomicrmw volatile or i64 addrspace(3)* %0, i64 %1 monotonic
  ret i64 %3
}

;;;;; Max
; max uses the signed comparison, umax the unsigned one.

; This wrapper must use attribute group #1, like every other read-modify-write
; wrapper in this file. Group #0 carries `readonly`, which is wrong for a
; function whose body is an atomicrmw: the instruction stores to *%0, and a
; readonly marking lets the optimizer drop or reorder calls to it.
define protected i32 @__llvm_atomic_max_a1_x_dev_i32(i32 addrspace(1)* nocapture, i32) #1 {
  %3 = atomicrmw volatile max i32 addrspace(1)* %0, i32 %1 monotonic
  ret i32 %3
}

define protected i32 @__llvm_atomic_umax_a1_x_dev_i32(i32 addrspace(1)* nocapture, i32) #1 {
  %3 = atomicrmw volatile umax i32 addrspace(1)* %0, i32 %1 monotonic
  ret i32 %3
}

define protected i64 @__llvm_atomic_max_a1_x_dev_i64(i64 addrspace(1)* nocapture, i64) #1 {
  %3 = atomicrmw volatile max i64 addrspace(1)* %0, i64 %1 monotonic
  ret i64 %3
}

define protected i64 @__llvm_atomic_umax_a1_x_dev_i64(i64 addrspace(1)* nocapture, i64) #1 {
  %3 = atomicrmw volatile umax i64 addrspace(1)* %0, i64 %1 monotonic
  ret i64 %3
}

define protected i32 @__llvm_atomic_max_a3_x_wg_i32(i32 addrspace(3)* nocapture,
i32) #1 {
  %3 = atomicrmw volatile max i32 addrspace(3)* %0, i32 %1 monotonic
  ret i32 %3
}

; Remaining max wrappers for addrspace(3) pointers. Each issues one monotonic,
; volatile atomicrmw and returns the value the instruction yields.
define protected i32 @__llvm_atomic_umax_a3_x_wg_i32(i32 addrspace(3)* nocapture %ptr, i32 %val) #1 {
  %old = atomicrmw volatile umax i32 addrspace(3)* %ptr, i32 %val monotonic
  ret i32 %old
}

define protected i64 @__llvm_atomic_max_a3_x_wg_i64(i64 addrspace(3)* nocapture %ptr, i64 %val) #1 {
  %old = atomicrmw volatile max i64 addrspace(3)* %ptr, i64 %val monotonic
  ret i64 %old
}

define protected i64 @__llvm_atomic_umax_a3_x_wg_i64(i64 addrspace(3)* nocapture %ptr, i64 %val) #1 {
  %old = atomicrmw volatile umax i64 addrspace(3)* %ptr, i64 %val monotonic
  ret i64 %old
}

;;;;; Min
; min uses the signed comparison, umin the unsigned one.

define protected i32 @__llvm_atomic_min_a1_x_dev_i32(i32 addrspace(1)* nocapture %ptr, i32 %val) #1 {
  %old = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %val monotonic
  ret i32 %old
}

define protected i32 @__llvm_atomic_umin_a1_x_dev_i32(i32 addrspace(1)* nocapture %ptr, i32 %val) #1 {
  %old = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %val monotonic
  ret i32 %old
}

define protected i64 @__llvm_atomic_min_a1_x_dev_i64(i64 addrspace(1)* nocapture %ptr, i64 %val) #1 {
  %old = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %val monotonic
  ret i64 %old
}

define protected i64 @__llvm_atomic_umin_a1_x_dev_i64(i64 addrspace(1)* nocapture %ptr, i64 %val) #1 {
  %old = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %val monotonic
  ret i64 %old
}

define protected i32 @__llvm_atomic_min_a3_x_wg_i32(i32 addrspace(3)* nocapture %ptr, i32 %val) #1 {
  %old = atomicrmw volatile min i32 addrspace(3)* %ptr, i32 %val monotonic
  ret i32 %old
}

define protected i32 @__llvm_atomic_umin_a3_x_wg_i32(i32 addrspace(3)* nocapture %ptr, i32 %val) #1 {
  %old = atomicrmw volatile umin i32 addrspace(3)* %ptr, i32 %val monotonic
  ret i32 %old
}

define protected i64 @__llvm_atomic_min_a3_x_wg_i64(i64 addrspace(3)* nocapture %ptr, i64 %val) #1 {
  %old = atomicrmw volatile min i64 addrspace(3)* %ptr, i64 %val monotonic
  ret i64 %old
}

define protected i64 @__llvm_atomic_umin_a3_x_wg_i64(i64 addrspace(3)* nocapture %ptr, i64 %val) #1 {
  %old = atomicrmw volatile umin i64 addrspace(3)* %ptr, i64 %val monotonic
  ret i64 %old
}
;;;;; cmpxchg
; Each wrapper issues one volatile cmpxchg with monotonic success and failure
; orderings. Arguments are (pointer, expected, replacement). The result is
; element 0 of the cmpxchg result pair, i.e. the value that was loaded from
; memory; the i1 success flag is discarded.

; This wrapper must use attribute group #1. Group #0 carries `readonly`, which
; is wrong here because cmpxchg may store to *%0.
define protected i32 @__llvm_cmpxchg_a1_x_x_dev_i32(i32 addrspace(1)* nocapture, i32, i32) #1 {
  %4 = cmpxchg volatile i32 addrspace(1)* %0, i32 %1, i32 %2 monotonic monotonic
  %5 = extractvalue { i32, i1 } %4, 0
  ret i32 %5
}

define protected i64 @__llvm_cmpxchg_a1_x_x_dev_i64(i64 addrspace(1)* nocapture, i64, i64) #1 {
  %4 = cmpxchg volatile i64 addrspace(1)* %0, i64 %1, i64 %2 monotonic monotonic
  %5 = extractvalue { i64, i1 } %4, 0
  ret i64 %5
}

define protected i32 @__llvm_cmpxchg_a3_x_x_wg_i32(i32 addrspace(3)* nocapture, i32, i32) #1 {
  %4 = cmpxchg volatile i32 addrspace(3)* %0, i32 %1, i32 %2 monotonic monotonic
  %5 = extractvalue { i32, i1 } %4, 0
  ret i32 %5
}

; 64-bit addrspace(3) variant, named with the same _i64 suffix that every
; other wrapper in this file uses.
define protected i64 @__llvm_cmpxchg_a3_x_x_wg_i64(i64 addrspace(3)* nocapture, i64, i64) #1 {
  %4 = cmpxchg volatile i64 addrspace(3)* %0, i64 %1, i64 %2 monotonic monotonic
  %5 = extractvalue { i64, i1 } %4, 0
  ret i64 %5
}

; Legacy spelling without the type suffix, kept so existing references still
; resolve. It simply forwards to the suffixed definition above.
; NOTE(review): drop this once no caller uses the unsuffixed name.
define protected i64 @__llvm_cmpxchg_a3_x_x_wg(i64 addrspace(3)* nocapture, i64, i64) #1 {
  %4 = call i64 @__llvm_cmpxchg_a3_x_x_wg_i64(i64 addrspace(3)* %0, i64 %1, i64 %2)
  ret i64 %4
}

; #0: wrappers that only load. #1: wrappers that may also store.
attributes #0 = { alwaysinline argmemonly norecurse nounwind readonly }
attributes #1 = { alwaysinline argmemonly norecurse nounwind }
ROCm-Device-Libs-rocm-5.0.0/irif/src/cz.ll000066400000000000000000000035021415221260100200150ustar00rootroot00000000000000; ===--------------------------------------------------------------------------
; ROCm Device Libraries
;
; This file is distributed under the University of Illinois Open Source
; License. See LICENSE.TXT for details.
; ===--------------------------------------------------------------------------

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
target triple = "amdgcn-amd-amdhsa"

; Count-leading-zeros intrinsics, one per integer width.
declare i8 @llvm.ctlz.i8(i8, i1) #0
declare i16 @llvm.ctlz.i16(i16, i1) #0
declare i32 @llvm.ctlz.i32(i32, i1) #0
declare i64 @llvm.ctlz.i64(i64, i1) #0

; Count-trailing-zeros intrinsics, one per integer width.
declare i8 @llvm.cttz.i8(i8, i1) #0
declare i16 @llvm.cttz.i16(i16, i1) #0
declare i32 @llvm.cttz.i32(i32, i1) #0
declare i64 @llvm.cttz.i64(i64, i1) #0

; Thin wrappers over llvm.ctlz. The second intrinsic operand is always the
; constant `i1 false`, which per the LLVM LangRef requests a defined result
; for a zero input.
define protected i8 @__llvm_ctlz_i8(i8 %x) #1 {
  %n = call i8 @llvm.ctlz.i8(i8 %x, i1 false)
  ret i8 %n
}

define protected i16 @__llvm_ctlz_i16(i16 %x) #1 {
  %n = call i16 @llvm.ctlz.i16(i16 %x, i1 false)
  ret i16 %n
}

define protected i32 @__llvm_ctlz_i32(i32 %x) #1 {
  %n = call i32 @llvm.ctlz.i32(i32 %x, i1 false)
  ret i32 %n
}

define protected i64 @__llvm_ctlz_i64(i64 %x) #1 {
  %n = call i64 @llvm.ctlz.i64(i64 %x, i1 false)
  ret i64 %n
}

; Thin wrappers over llvm.cttz, again passing `i1 false` as the second operand.
define protected i8 @__llvm_cttz_i8(i8 %x) #1 {
  %n = call i8 @llvm.cttz.i8(i8 %x, i1 false)
  ret i8 %n
}

define protected i16 @__llvm_cttz_i16(i16 %x) #1 {
  %n = call i16 @llvm.cttz.i16(i16 %x, i1 false)
  ret i16 %n
}

define protected i32 @__llvm_cttz_i32(i32 %x) #1 {
  %n = call i32 @llvm.cttz.i32(i32 %x, i1 false)
  ret i32 %n
}

define protected i64 @__llvm_cttz_i64(i64 %x) #1 {
  %n = call i64 @llvm.cttz.i64(i64 %x, i1 false)
  ret i64 %n
}

; #0 is applied to the intrinsic declarations, #1 to the wrappers.
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { alwaysinline norecurse nounwind readnone }
ROCm-Device-Libs-rocm-5.0.0/irif/src/imintrin.ll000066400000000000000000002127371415221260100212460ustar00rootroot00000000000000target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
target triple = "amdgcn-amd-amdhsa"

; Function Attrs: alwaysinline nounwind readonly
define protected
<4 x float> @__llvm_amdgcn_image_load_1d_v4f32_i32(i32 %arg1, <8 x i32> %arg2) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %arg1, <8 x i32> %arg2, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_2d_v4f32_i32(i32 %arg1, i32 %arg2, <8 x i32> %arg3) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %arg1, i32 %arg2, <8 x i32> %arg3, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_3d_v4f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_cube_v4f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_1darray_v4f32_i32(i32 %arg1, i32 %arg2, <8 x i32> %arg3) 
local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %arg1, i32 %arg2, <8 x i32> %arg3, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_2darray_v4f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_mip_1d_v4f32_i32(i32 %arg1, i32 %arg2, <8 x i32> %arg3) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %arg1, i32 %arg2, <8 x i32> %arg3, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_mip_2d_v4f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly 
declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_mip_3d_v4f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_mip_cube_v4f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_mip_1darray_v4f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_load_mip_2darray_v4f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 15, i32 
%arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_1d_v4f16_i32(i32 %arg1, <8 x i32> %arg2) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i32(i32 15, i32 %arg1, <8 x i32> %arg2, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i32(i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_2d_v4f16_i32(i32 %arg1, i32 %arg2, <8 x i32> %arg3) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, <8 x i32> %arg3, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_3d_v4f16_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_cube_v4f16_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.cube.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret <4 x half> %tmp } ; Function 
Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.cube.v4f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_1darray_v4f16_i32(i32 %arg1, i32 %arg2, <8 x i32> %arg3) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.1darray.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, <8 x i32> %arg3, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.1darray.v4f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_2darray_v4f16_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.2darray.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.2darray.v4f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.2dmsaa.v4f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.2darraymsaa.v4f16.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_mip_1d_v4f16_i32(i32 %arg1, i32 %arg2, <8 x i32> %arg3) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.mip.1d.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, <8 x i32> %arg3, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.mip.1d.v4f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_mip_2d_v4f16_i32(i32 
%arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_mip_3d_v4f16_i32(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.mip.3d.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.mip.3d.v4f16.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_mip_cube_v4f16_i32(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.mip.cube.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.mip.cube.v4f16.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_load_mip_1darray_v4f16_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.mip.1darray.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.mip.1darray.v4f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind 
readonly define protected <4 x half> @__llvm_amdgcn_image_load_mip_2darray_v4f16_i32(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.load.mip.2darray.v4f16.i32(i32 15, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.load.mip.2darray.v4f16.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.1d.f32.i32(i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_load_2d_f32_i32(i32 %arg1, i32 %arg2, <8 x i32> %arg3) local_unnamed_addr #0 { bb: %tmp = tail call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %arg1, i32 %arg2, <8 x i32> %arg3, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.2d.f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.3d.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.cube.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.1darray.f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_load_2darray_f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call float @llvm.amdgcn.image.load.2darray.f32.i32(i32 1, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.2darray.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: nounwind readonly declare float 
@llvm.amdgcn.image.load.2dmsaa.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.2darraymsaa.f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.mip.1d.f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_load_mip_2d_f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call float @llvm.amdgcn.image.load.mip.2d.f32.i32(i32 1, i32 %arg1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.mip.2d.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.mip.3d.f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.mip.cube.f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.mip.1darray.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_load_mip_2darray_f32_i32(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call float @llvm.amdgcn.image.load.mip.2darray.f32.i32(i32 1, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.load.mip.2darray.f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_1d_v4f32_i32(<4 x float> %arg, i32 %arg2, <8 x i32> %arg3) local_unnamed_addr #2 { bb: tail call void 
@llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, <8 x i32> %arg3, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_2d_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_3d_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_cube_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_1darray_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #2 { bb: tail call void 
@llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_2darray_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #7 ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #7 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_1d_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_2d_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind 
writeonly declare void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_3d_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_cube_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_1darray_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_2darray_v4f32_i32(<4 x float> %arg, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6) local_unnamed_addr #2 { bb: tail call void 
@llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_1d_v4f16_i32(<4 x half> %arg, i32 %arg2, <8 x i32> %arg3) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.1d.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, <8 x i32> %arg3, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.1d.v4f16.i32(<4 x half>, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_2d_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half>, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_3d_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.3d.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.3d.v4f16.i32(<4 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_cube_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void 
@llvm.amdgcn.image.store.cube.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.cube.v4f16.i32(<4 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_1darray_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.1darray.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.1darray.v4f16.i32(<4 x half>, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_2darray_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.2darray.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2darray.v4f16.i32(<4 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2dmsaa.v4f16.i32(<4 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #7 ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2darraymsaa.v4f16.i32(<4 x half>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #7 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_1d_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.1d.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void 
@llvm.amdgcn.image.store.mip.1d.v4f16.i32(<4 x half>, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_2d_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.2d.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.2d.v4f16.i32(<4 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_3d_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.3d.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.3d.v4f16.i32(<4 x half>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_cube_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.cube.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.cube.v4f16.i32(<4 x half>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_1darray_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.1darray.v4f16.i32(<4 x half> %arg, i32 
15, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0)
  ret void
}

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.mip.1darray.v4f16.i32(<4 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #3

; Function Attrs: alwaysinline nounwind writeonly
define protected void @__llvm_amdgcn_image_store_mip_2darray_v4f16_i32(<4 x half> %arg, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6) local_unnamed_addr #2 {
bb:
  tail call void @llvm.amdgcn.image.store.mip.2darray.v4f16.i32(<4 x half> %arg, i32 15, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6, i32 0, i32 0)
  ret void
}

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.mip.2darray.v4f16.i32(<4 x half>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #3

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.1d.f32.i32(float, i32, i32, <8 x i32>, i32, i32) #3

; Wrapper around the 2D image-store intrinsic for a single float value.
;   %arg         - the float value to store
;   %arg2, %arg3 - the two i32 operands forwarded after the dmask
;                  (presumably the x/y texel coordinates -- confirm against
;                  the intrinsic definition)
;   %arg4        - the <8 x i32> operand forwarded unchanged (presumably the
;                  image resource descriptor -- confirm)
; The i32 immediately after the data operand is the dmask. The data operand
; is a scalar float, so exactly one channel is enabled (dmask = 1); this
; matches the other scalar f32 store wrappers in this file. A dmask of 15
; is only used by the wrappers whose data operand is a 4-element vector.
; The two trailing i32 operands are passed as constant 0.
; Function Attrs: alwaysinline nounwind writeonly
define protected void @__llvm_amdgcn_image_store_2d_f32_i32(float %arg, i32 %arg2, i32 %arg3, <8 x i32> %arg4) local_unnamed_addr #2 {
bb:
  tail call void @llvm.amdgcn.image.store.2d.f32.i32(float %arg, i32 1, i32 %arg2, i32 %arg3, <8 x i32> %arg4, i32 0, i32 0)
  ret void
}

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.2d.f32.i32(float, i32, i32, i32, <8 x i32>, i32, i32) #3

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.3d.f32.i32(float, i32, i32, i32, i32, <8 x i32>, i32, i32) #3

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.cube.f32.i32(float, i32, i32, i32, i32, <8 x i32>, i32, i32) #3

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.1darray.f32.i32(float, i32, i32, i32, <8 x i32>, i32, i32) #3

; Function Attrs: alwaysinline nounwind writeonly
define protected void @__llvm_amdgcn_image_store_2darray_f32_i32(float %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 {
bb:
  tail
call void @llvm.amdgcn.image.store.2darray.f32.i32(float %arg, i32 1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2darray.f32.i32(float, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2dmsaa.f32.i32(float, i32, i32, i32, i32, <8 x i32>, i32, i32) #7 ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.2darraymsaa.f32.i32(float, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #7 declare void @llvm.amdgcn.image.store.mip.1d.f32.i32(float, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_2d_f32_i32(float %arg, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.2d.f32.i32(float %arg, i32 1, i32 %arg2, i32 %arg3, i32 %arg4, <8 x i32> %arg5, i32 0, i32 0) ret void } ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.2d.f32.i32(float, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.3d.f32.i32(float, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.cube.f32.i32(float, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.1darray.f32.i32(float, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind writeonly define protected void @__llvm_amdgcn_image_store_mip_2darray_f32_i32(float %arg, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6) local_unnamed_addr #2 { bb: tail call void @llvm.amdgcn.image.store.mip.2darray.f32.i32(float %arg, i32 1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, <8 x i32> %arg6, i32 0, i32 0) ret void } ; 
Function Attrs: nounwind writeonly declare void @llvm.amdgcn.image.store.mip.2darray.f32.i32(float, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #3 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_1d_v4f32_f32(float %arg1, <8 x i32> %arg2, <4 x i32> %arg3) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %arg1, <8 x i32> %arg2, <4 x i32> %arg3, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_lz_1d_v4f32_f32(float %arg1, <8 x i32> %arg2, <4 x i32> %arg3) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %arg1, <8 x i32> %arg2, <4 x i32> %arg3, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_l_1d_v4f32_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_d_1d_v4f32_f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> 
@llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_2d_v4f32_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_lz_2d_v4f32_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_l_2d_v4f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, 
i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_d_2d_v4f32_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, <8 x i32> %arg7, <4 x i32> %arg8) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, <8 x i32> %arg7, <4 x i32> %arg8, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_3d_v4f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_lz_3d_v4f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.lz.3d.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.lz.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_l_3d_v4f32_f32(float 
%arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.l.3d.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.l.3d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_d_3d_v4f32_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, float %arg8, float %arg9, <8 x i32> %arg10, <4 x i32> %arg11) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, float %arg8, float %arg9, <8 x i32> %arg10, <4 x i32> %arg11, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_cube_v4f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_lz_cube_v4f32_f32(float 
%arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.lz.cube.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.lz.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_l_cube_v4f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.l.cube.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.l.cube.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_1darray_v4f32_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_lz_1darray_v4f32_f32(float %arg1, float %arg2, <8 x i32> 
%arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.lz.1darray.v4f32.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.lz.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_l_1darray_v4f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.l.1darray.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.l.1darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_d_1darray_v4f32_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.d.1darray.v4f32.f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.d.1darray.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_2darray_v4f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> 
%arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_lz_2darray_v4f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.lz.2darray.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.lz.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_l_2darray_v4f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.l.2darray.v4f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.l.2darray.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_sample_d_2darray_v4f32_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <8 x i32> %arg8, <4 x i32> %arg9) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.sample.d.2darray.v4f32.f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <8 x i32> %arg8, <4 x i32> %arg9, i1 false, i32 0, i32 0) ret <4 x 
float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.d.2darray.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_1d_v4f16_f32(float %arg1, <8 x i32> %arg2, <4 x i32> %arg3) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.1d.v4f16.f32(i32 15, float %arg1, <8 x i32> %arg2, <4 x i32> %arg3, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.1d.v4f16.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_lz_1d_v4f16_f32(float %arg1, <8 x i32> %arg2, <4 x i32> %arg3) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.lz.1d.v4f16.f32(i32 15, float %arg1, <8 x i32> %arg2, <4 x i32> %arg3, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.lz.1d.v4f16.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_l_1d_v4f16_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.l.1d.v4f16.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.l.1d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_d_1d_v4f16_f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { 
bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.d.1d.v4f16.f32.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.d.1d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_2d_v4f16_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_lz_2d_v4f16_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_l_2d_v4f16_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.l.2d.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.l.2d.v4f16.f32(i32, float, float, float, <8 x 
i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_d_2d_v4f16_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, <8 x i32> %arg7, <4 x i32> %arg8) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.d.2d.v4f16.f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, <8 x i32> %arg7, <4 x i32> %arg8, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.d.2d.v4f16.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_3d_v4f16_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.3d.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.3d.v4f16.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_lz_3d_v4f16_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.lz.3d.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.lz.3d.v4f16.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> 
@__llvm_amdgcn_image_sample_l_3d_v4f16_f32(float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.l.3d.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.l.3d.v4f16.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_d_3d_v4f16_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, float %arg8, float %arg9, <8 x i32> %arg10, <4 x i32> %arg11, i32 %arg13, i32 %arg14) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.d.3d.v4f16.f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, float %arg8, float %arg9, <8 x i32> %arg10, <4 x i32> %arg11, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.d.3d.v4f16.f32.f32(i32, float, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_cube_v4f16_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.cube.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.cube.v4f16.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x 
half> @__llvm_amdgcn_image_sample_lz_cube_v4f16_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.lz.cube.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.lz.cube.v4f16.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_l_cube_v4f16_f32(float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.l.cube.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.l.cube.v4f16.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.d.cube.v4f16.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_1darray_v4f16_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.1darray.v4f16.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.1darray.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> 
@__llvm_amdgcn_image_sample_lz_1darray_v4f16_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.lz.1darray.v4f16.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.lz.1darray.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_l_1darray_v4f16_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.l.1darray.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.l.1darray.v4f16.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_d_1darray_v4f16_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.d.1darray.v4f16.f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.d.1darray.v4f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_2darray_v4f16_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call <4 x half> 
@llvm.amdgcn.image.sample.2darray.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.2darray.v4f16.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_lz_2darray_v4f16_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.lz.2darray.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.lz.2darray.v4f16.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_l_2darray_v4f16_f32(float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.l.2darray.v4f16.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.l.2darray.v4f16.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x half> @__llvm_amdgcn_image_sample_d_2darray_v4f16_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <8 x i32> %arg8, <4 x i32> %arg9) local_unnamed_addr #4 { bb: %tmp = tail call <4 x half> @llvm.amdgcn.image.sample.d.2darray.v4f16.f32.f32(i32 15, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float 
%arg6, float %arg7, <8 x i32> %arg8, <4 x i32> %arg9, i1 false, i32 0, i32 0) ret <4 x half> %tmp } ; Function Attrs: nounwind readonly declare <4 x half> @llvm.amdgcn.image.sample.d.2darray.v4f16.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.lz.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.l.1d.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.d.1d.f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_sample_2d_f32_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #0 { bb: %tmp = tail call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.2d.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_sample_lz_2d_f32_f32(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32 1, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.lz.2d.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_sample_l_2d_f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = 
tail call float @llvm.amdgcn.image.sample.l.2d.f32.f32(i32 1, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.l.2d.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_sample_d_2d_f32_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, <8 x i32> %arg7, <4 x i32> %arg8) local_unnamed_addr #4 { bb: %tmp = tail call float @llvm.amdgcn.image.sample.d.2d.f32.f32.f32(i32 1, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, <8 x i32> %arg7, <4 x i32> %arg8, i1 false, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.d.2d.f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.l.3d.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.d.3d.f32.f32.f32(i32, float, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.lz.cube.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.l.cube.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.d.cube.f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, 
i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.lz.1darray.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.l.1darray.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.d.1darray.f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_sample_2darray_f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #0 { bb: %tmp = tail call float @llvm.amdgcn.image.sample.2darray.f32.f32(i32 1, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.2darray.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_sample_lz_2darray_f32_f32(float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5) local_unnamed_addr #4 { bb: %tmp = tail call float @llvm.amdgcn.image.sample.lz.2darray.f32.f32(i32 1, float %arg1, float %arg2, float %arg3, <8 x i32> %arg4, <4 x i32> %arg5, i1 false, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.lz.2darray.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_sample_l_2darray_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> %arg5, <4 x i32> %arg6) local_unnamed_addr #4 { bb: %tmp = tail call float @llvm.amdgcn.image.sample.l.2darray.f32.f32(i32 1, float %arg1, float %arg2, float %arg3, float %arg4, <8 x i32> 
%arg5, <4 x i32> %arg6, i1 false, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.l.2darray.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected float @__llvm_amdgcn_image_sample_d_2darray_f32_f32_f32(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <8 x i32> %arg8, <4 x i32> %arg9, i32 %arg11, i32 %arg12) local_unnamed_addr #4 { bb: %tmp = tail call float @llvm.amdgcn.image.sample.d.2darray.f32.f32.f32(i32 1, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <8 x i32> %arg8, <4 x i32> %arg9, i1 false, i32 0, i32 0) ret float %tmp } ; Function Attrs: nounwind readonly declare float @llvm.amdgcn.image.sample.d.2darray.f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: alwaysinline nounwind readonly define protected <4 x float> @__llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_r(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x float> %tmp } define protected <4 x float> @__llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_g(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 2, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x float> %tmp } define protected <4 x float> @__llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_b(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 4, float %arg1, float %arg2, <8 x i32> %arg3, 
<4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x float> %tmp } define protected <4 x float> @__llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) local_unnamed_addr #4 { bb: %tmp = tail call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 8, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0) ret <4 x float> %tmp } ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.gather4.lz.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.gather4.l.cube.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.gather4.lz.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.gather4.l.2darray.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.gather.4h.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.gather.4h.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.gather.4h.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #5 attributes #0 = { alwaysinline nounwind readonly } attributes #1 = { nounwind readonly } 
attributes #2 = { alwaysinline nounwind writeonly } attributes #3 = { nounwind writeonly } attributes #4 = { alwaysinline nounwind readonly "target-features"="+extended-image-insts" } attributes #5 = { nounwind readonly "target-features"="+extended-image-insts" } attributes #6 = { alwaysinline nounwind writeonly "target-features"="+extended-image-insts" } attributes #7 = { nounwind writeonly "target-features"="+extended-image-insts" } ROCm-Device-Libs-rocm-5.0.0/ockl/000077500000000000000000000000001415221260100162605ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/ockl/CMakeLists.txt000066400000000000000000000012001415221260100210110ustar00rootroot00000000000000##===-------------------------------------------------------------------------- ## ROCm Device Libraries ## ## This file is distributed under the University of Illinois Open Source ## License. See LICENSE.TXT for details. ##===-------------------------------------------------------------------------- file(GLOB sources ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/*.ll ) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../irif/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../oclc/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/inc) opencl_bc_lib(NAME ockl SOURCES ${sources}) ROCm-Device-Libs-rocm-5.0.0/ockl/inc/000077500000000000000000000000001415221260100170315ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/ockl/inc/amd_hsa_common.h000066400000000000000000000076031415221260100221540ustar00rootroot00000000000000//////////////////////////////////////////////////////////////////////////////// // // The University of Illinois/NCSA // Open Source License (NCSA) // // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. // // Developed by: // // AMD Research and AMD HSA Software Development // // Advanced Micro Devices, Inc. 
// // www.amd.com // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimers in // the documentation and/or other materials provided with the distribution. // - Neither the names of Advanced Micro Devices, Inc, // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS WITH THE SOFTWARE. // //////////////////////////////////////////////////////////////////////////////// // The following set of header files provides definitions for AMD GPU // Architecture: // - amd_hsa_common.h // - amd_hsa_elf.h // - amd_hsa_kernel_code.h // - amd_hsa_queue.h // - amd_hsa_signal.h // // Refer to "HSA Application Binary Interface: AMD GPU Architecture" for more // information. 
#ifndef AMD_HSA_COMMON_H #define AMD_HSA_COMMON_H #ifndef DEVICE_COMPILER #include #include #endif // Descriptive version of the HSA Application Binary Interface. #define AMD_HSA_ABI_VERSION "AMD GPU Architecture v0.35 (June 25, 2015)" // Alignment attribute that specifies a minimum alignment (in bytes) for // variables of the specified type. #if defined(__GNUC__) || defined(DEVICE_COMPILER) # define __ALIGNED__(x) __attribute__((aligned(x))) #elif defined(_MSC_VER) # define __ALIGNED__(x) __declspec(align(x)) #elif defined(RC_INVOKED) # define __ALIGNED__(x) #else # error #endif // Creates enumeration entries for packed types. Enumeration entries include // bit shift amount, bit width, and bit mask. #define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width) \ name ## _SHIFT = (shift), \ name ## _WIDTH = (width), \ name = (((1 << (width)) - 1) << (shift)) \ // Gets bits for specified mask from specified src packed instance. #define AMD_HSA_BITS_GET(src, mask) \ ((src & mask) >> mask ## _SHIFT) \ // Sets val bits for specified mask in specified dst packed instance. #define AMD_HSA_BITS_SET(dst, mask, val) \ dst &= (~(1 << mask ## _SHIFT) & ~mask); \ dst |= (((val) << mask ## _SHIFT) & mask) \ #endif // AMD_HSA_COMMON_H ROCm-Device-Libs-rocm-5.0.0/ockl/inc/amd_hsa_elf.h000066400000000000000000000253721415221260100214350ustar00rootroot00000000000000//////////////////////////////////////////////////////////////////////////////// // // The University of Illinois/NCSA // Open Source License (NCSA) // // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. // // Developed by: // // AMD Research and AMD HSA Software Development // // Advanced Micro Devices, Inc. 
// // www.amd.com // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimers in // the documentation and/or other materials provided with the distribution. // - Neither the names of Advanced Micro Devices, Inc, // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS WITH THE SOFTWARE. // //////////////////////////////////////////////////////////////////////////////// #ifndef AMD_HSA_ELF_H #define AMD_HSA_ELF_H #include "amd_hsa_common.h" // ELF Header Enumeration Values. #define EM_AMDGPU 224 #define ELFOSABI_AMDGPU_HSA 64 #define ELFABIVERSION_AMDGPU_HSA 0 #define EF_AMDGPU_XNACK 0x00000001 #define EF_AMDGPU_TRAP_HANDLER 0x00000002 // ELF Section Header Flag Enumeration Values. 
#define SHF_AMDGPU_HSA_GLOBAL (0x00100000 & SHF_MASKOS) #define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS) #define SHF_AMDGPU_HSA_CODE (0x00400000 & SHF_MASKOS) #define SHF_AMDGPU_HSA_AGENT (0x00800000 & SHF_MASKOS) // typedef enum { AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM = 0, AMDGPU_HSA_SEGMENT_GLOBAL_AGENT = 1, AMDGPU_HSA_SEGMENT_READONLY_AGENT = 2, AMDGPU_HSA_SEGMENT_CODE_AGENT = 3, AMDGPU_HSA_SEGMENT_LAST, } amdgpu_hsa_elf_segment_t; // ELF Program Header Type Enumeration Values. #define PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) #define PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) #define PT_AMDGPU_HSA_LOAD_READONLY_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_READONLY_AGENT) #define PT_AMDGPU_HSA_LOAD_CODE_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_CODE_AGENT) // ELF Symbol Type Enumeration Values. #define STT_AMDGPU_HSA_KERNEL (STT_LOOS + 0) #define STT_AMDGPU_HSA_INDIRECT_FUNCTION (STT_LOOS + 1) #define STT_AMDGPU_HSA_METADATA (STT_LOOS + 2) // ELF Symbol Binding Enumeration Values. #define STB_AMDGPU_HSA_EXTERNAL (STB_LOOS + 0) // ELF Symbol Other Information Creation/Retrieval. #define ELF64_ST_AMDGPU_ALLOCATION(o) (((o) >> 2) & 0x3) #define ELF64_ST_AMDGPU_FLAGS(o) ((o) >> 4) #define ELF64_ST_AMDGPU_OTHER(f, a, v) (((f) << 4) + (((a) & 0x3) << 2) + ((v) & 0x3)) typedef enum { AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT = 0, AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM = 1, AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT = 2, AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT = 3, AMDGPU_HSA_SYMBOL_ALLOCATION_LAST, } amdgpu_hsa_symbol_allocation_t; // ELF Symbol Allocation Enumeration Values. 
#define STA_AMDGPU_HSA_DEFAULT AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT #define STA_AMDGPU_HSA_GLOBAL_PROGRAM AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM #define STA_AMDGPU_HSA_GLOBAL_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT #define STA_AMDGPU_HSA_READONLY_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT typedef enum { AMDGPU_HSA_SYMBOL_FLAG_DEFAULT = 0, AMDGPU_HSA_SYMBOL_FLAG_CONST = 1, AMDGPU_HSA_SYMBOL_FLAG_LAST, } amdgpu_hsa_symbol_flag_t; // ELF Symbol Flag Enumeration Values. #define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST // AMD GPU Relocation Type Enumeration Values. #define R_AMDGPU_NONE 0 #define R_AMDGPU_32_LOW 1 #define R_AMDGPU_32_HIGH 2 #define R_AMDGPU_64 3 #define R_AMDGPU_INIT_SAMPLER 4 #define R_AMDGPU_INIT_IMAGE 5 // AMD GPU Note Type Enumeration Values. #define NT_AMDGPU_HSA_CODE_OBJECT_VERSION 1 #define NT_AMDGPU_HSA_HSAIL 2 #define NT_AMDGPU_HSA_ISA 3 #define NT_AMDGPU_HSA_PRODUCER 4 #define NT_AMDGPU_HSA_PRODUCER_OPTIONS 5 #define NT_AMDGPU_HSA_EXTENSION 6 #define NT_AMDGPU_HSA_HLDEBUG_DEBUG 101 #define NT_AMDGPU_HSA_HLDEBUG_TARGET 102 // AMD GPU Metadata Kind Enumeration Values. typedef uint16_t amdgpu_hsa_metadata_kind16_t; typedef enum { AMDGPU_HSA_METADATA_KIND_NONE = 0, AMDGPU_HSA_METADATA_KIND_INIT_SAMP = 1, AMDGPU_HSA_METADATA_KIND_INIT_ROIMG = 2, AMDGPU_HSA_METADATA_KIND_INIT_WOIMG = 3, AMDGPU_HSA_METADATA_KIND_INIT_RWIMG = 4 } amdgpu_hsa_metadata_kind_t; // AMD GPU Sampler Coordinate Normalization Enumeration Values. typedef uint8_t amdgpu_hsa_sampler_coord8_t; typedef enum { AMDGPU_HSA_SAMPLER_COORD_UNNORMALIZED = 0, AMDGPU_HSA_SAMPLER_COORD_NORMALIZED = 1 } amdgpu_hsa_sampler_coord_t; // AMD GPU Sampler Filter Enumeration Values. typedef uint8_t amdgpu_hsa_sampler_filter8_t; typedef enum { AMDGPU_HSA_SAMPLER_FILTER_NEAREST = 0, AMDGPU_HSA_SAMPLER_FILTER_LINEAR = 1 } amdgpu_hsa_sampler_filter_t; // AMD GPU Sampler Addressing Enumeration Values. 
typedef uint8_t amdgpu_hsa_sampler_addressing8_t; typedef enum { AMDGPU_HSA_SAMPLER_ADDRESSING_UNDEFINED = 0, AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_EDGE = 1, AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_BORDER = 2, AMDGPU_HSA_SAMPLER_ADDRESSING_REPEAT = 3, AMDGPU_HSA_SAMPLER_ADDRESSING_MIRRORED_REPEAT = 4 } amdgpu_hsa_sampler_addressing_t; // AMD GPU Sampler Descriptor. typedef struct amdgpu_hsa_sampler_descriptor_s { uint16_t size; amdgpu_hsa_metadata_kind16_t kind; amdgpu_hsa_sampler_coord8_t coord; amdgpu_hsa_sampler_filter8_t filter; amdgpu_hsa_sampler_addressing8_t addressing; uint8_t reserved1; } amdgpu_hsa_sampler_descriptor_t; // AMD GPU Image Geometry Enumeration Values. typedef uint8_t amdgpu_hsa_image_geometry8_t; typedef enum { AMDGPU_HSA_IMAGE_GEOMETRY_1D = 0, AMDGPU_HSA_IMAGE_GEOMETRY_2D = 1, AMDGPU_HSA_IMAGE_GEOMETRY_3D = 2, AMDGPU_HSA_IMAGE_GEOMETRY_1DA = 3, AMDGPU_HSA_IMAGE_GEOMETRY_2DA = 4, AMDGPU_HSA_IMAGE_GEOMETRY_1DB = 5, AMDGPU_HSA_IMAGE_GEOMETRY_2DDEPTH = 6, AMDGPU_HSA_IMAGE_GEOMETRY_2DADEPTH = 7 } amdgpu_hsa_image_geometry_t; // AMD GPU Image Channel Order Enumeration Values. 
typedef uint8_t amdgpu_hsa_image_channel_order8_t; typedef enum { AMDGPU_HSA_IMAGE_CHANNEL_ORDER_A = 0, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_R = 1, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RX = 2, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RG = 3, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGX = 4, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RA = 5, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGB = 6, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBX = 7, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBA = 8, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_BGRA = 9, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ARGB = 10, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ABGR = 11, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGB = 12, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBX = 13, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBA = 14, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SBGRA = 15, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_INTENSITY = 16, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_LUMINANCE = 17, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH = 18, AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19 } amdgpu_hsa_image_channel_order_t; // AMD GPU Image Channel Type Enumeration Values. typedef uint8_t amdgpu_hsa_image_channel_type8_t; typedef enum { AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_555 = 5, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_565 = 6, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_INT_101010 = 7, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14, AMDGPU_HSA_IMAGE_CHANNEL_TYPE_FLOAT = 15 } amdgpu_hsa_image_channel_type_t; // AMD GPU Image Descriptor. 
typedef struct amdgpu_hsa_image_descriptor_s { uint16_t size; amdgpu_hsa_metadata_kind16_t kind; amdgpu_hsa_image_geometry8_t geometry; amdgpu_hsa_image_channel_order8_t channel_order; amdgpu_hsa_image_channel_type8_t channel_type; uint8_t reserved1; uint64_t width; uint64_t height; uint64_t depth; uint64_t array; } amdgpu_hsa_image_descriptor_t; typedef struct amdgpu_hsa_note_code_object_version_s { uint32_t major_version; uint32_t minor_version; } amdgpu_hsa_note_code_object_version_t; typedef struct amdgpu_hsa_note_hsail_s { uint32_t hsail_major_version; uint32_t hsail_minor_version; uint8_t profile; uint8_t machine_model; uint8_t default_float_round; } amdgpu_hsa_note_hsail_t; typedef struct amdgpu_hsa_note_isa_s { uint16_t vendor_name_size; uint16_t architecture_name_size; uint32_t major; uint32_t minor; uint32_t stepping; char vendor_and_architecture_name[1]; } amdgpu_hsa_note_isa_t; typedef struct amdgpu_hsa_note_producer_s { uint16_t producer_name_size; uint16_t reserved; uint32_t producer_major_version; uint32_t producer_minor_version; char producer_name[1]; } amdgpu_hsa_note_producer_t; typedef struct amdgpu_hsa_note_producer_options_s { uint16_t producer_options_size; char producer_options[1]; } amdgpu_hsa_note_producer_options_t; typedef enum { AMDGPU_HSA_RODATA_GLOBAL_PROGRAM = 0, AMDGPU_HSA_RODATA_GLOBAL_AGENT, AMDGPU_HSA_RODATA_READONLY_AGENT, AMDGPU_HSA_DATA_GLOBAL_PROGRAM, AMDGPU_HSA_DATA_GLOBAL_AGENT, AMDGPU_HSA_DATA_READONLY_AGENT, AMDGPU_HSA_BSS_GLOBAL_PROGRAM, AMDGPU_HSA_BSS_GLOBAL_AGENT, AMDGPU_HSA_BSS_READONLY_AGENT, AMDGPU_HSA_SECTION_LAST, } amdgpu_hsa_elf_section_t; #endif // AMD_HSA_ELF_H ROCm-Device-Libs-rocm-5.0.0/ockl/inc/amd_hsa_kernel_code.h000066400000000000000000000305631415221260100231370ustar00rootroot00000000000000//////////////////////////////////////////////////////////////////////////////// // // The University of Illinois/NCSA // Open Source License (NCSA) // // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. 
All rights reserved. // // Developed by: // // AMD Research and AMD HSA Software Development // // Advanced Micro Devices, Inc. // // www.amd.com // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimers in // the documentation and/or other materials provided with the distribution. // - Neither the names of Advanced Micro Devices, Inc, // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS WITH THE SOFTWARE. // //////////////////////////////////////////////////////////////////////////////// #ifndef AMD_HSA_KERNEL_CODE_H #define AMD_HSA_KERNEL_CODE_H #include "amd_hsa_common.h" #include "hsa.h" // AMD Kernel Code Version Enumeration Values. 
typedef uint32_t amd_kernel_code_version32_t; enum amd_kernel_code_version_t { AMD_KERNEL_CODE_VERSION_MAJOR = 1, AMD_KERNEL_CODE_VERSION_MINOR = 1 }; // AMD Machine Kind Enumeration Values. typedef uint16_t amd_machine_kind16_t; enum amd_machine_kind_t { AMD_MACHINE_KIND_UNDEFINED = 0, AMD_MACHINE_KIND_AMDGPU = 1 }; // AMD Machine Version. typedef uint16_t amd_machine_version16_t; // AMD Float Round Mode Enumeration Values. enum amd_float_round_mode_t { AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0, AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1, AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2, AMD_FLOAT_ROUND_MODE_ZERO = 3 }; // AMD Float Denorm Mode Enumeration Values. enum amd_float_denorm_mode_t { AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0, AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1, AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2, AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3 }; // AMD Compute Program Resource Register One. typedef uint32_t amd_compute_pgm_rsrc_one32_t; enum amd_compute_pgm_rsrc_one_t { AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, 0, 6), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY, 10, 2), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32, 12, 2), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64, 14, 2), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32, 16, 2), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 18, 2), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 20, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP, 21, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE, 22, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 23, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_BULKY, 24, 1), 
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER, 25, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1, 26, 6) }; // AMD System VGPR Workitem ID Enumeration Values. enum amd_system_vgpr_workitem_id_t { AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0, AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1, AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2, AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3 }; // AMD Compute Program Resource Register Two. typedef uint32_t amd_compute_pgm_rsrc_two32_t; enum amd_compute_pgm_rsrc_two_t { AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, 0, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 1, 5), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER, 6, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 7, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO, 10, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID, 11, 2), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION, 14, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE, 15, 9), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1), 
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO, 30, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1, 31, 1) }; // AMD Element Byte Size Enumeration Values. enum amd_element_byte_size_t { AMD_ELEMENT_BYTE_SIZE_2 = 0, AMD_ELEMENT_BYTE_SIZE_4 = 1, AMD_ELEMENT_BYTE_SIZE_8 = 2, AMD_ELEMENT_BYTE_SIZE_16 = 3 }; // AMD Kernel Code Properties. typedef uint32_t amd_kernel_code_properties32_t; enum amd_kernel_code_properties_t { AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR, 1, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR, 2, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID, 4, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, 7, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, 8, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z, 9, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED1, 10, 6), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS, 16, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE, 17, 2), 
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_PTR64, 19, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK, 20, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED, 21, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED, 22, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED2, 23, 9) }; // AMD Power Of Two Enumeration Values. typedef uint8_t amd_powertwo8_t; enum amd_powertwo_t { AMD_POWERTWO_1 = 0, AMD_POWERTWO_2 = 1, AMD_POWERTWO_4 = 2, AMD_POWERTWO_8 = 3, AMD_POWERTWO_16 = 4, AMD_POWERTWO_32 = 5, AMD_POWERTWO_64 = 6, AMD_POWERTWO_128 = 7, AMD_POWERTWO_256 = 8 }; // AMD Enabled Control Directive Enumeration Values. typedef uint64_t amd_enabled_control_directive64_t; enum amd_enabled_control_directive_t { AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1, AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2, AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4, AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8, AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16, AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32, AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64, AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128, AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256 }; // AMD Exception Kind Enumeration Values. typedef uint16_t amd_exception_kind16_t; enum amd_exception_kind_t { AMD_EXCEPTION_KIND_INVALID_OPERATION = 1, AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2, AMD_EXCEPTION_KIND_OVERFLOW = 4, AMD_EXCEPTION_KIND_UNDERFLOW = 8, AMD_EXCEPTION_KIND_INEXACT = 16 }; // AMD Control Directives. 
#define AMD_CONTROL_DIRECTIVES_ALIGN_BYTES 64 #define AMD_CONTROL_DIRECTIVES_ALIGN __ALIGNED__(AMD_CONTROL_DIRECTIVES_ALIGN_BYTES) typedef AMD_CONTROL_DIRECTIVES_ALIGN struct amd_control_directives_s { amd_enabled_control_directive64_t enabled_control_directives; uint16_t enable_break_exceptions; uint16_t enable_detect_exceptions; uint32_t max_dynamic_group_size; uint64_t max_flat_grid_size; uint32_t max_flat_workgroup_size; uint8_t required_dim; uint8_t reserved1[3]; uint64_t required_grid_size[3]; uint32_t required_workgroup_size[3]; uint8_t reserved2[60]; } amd_control_directives_t; // AMD Kernel Code. #define AMD_ISA_ALIGN_BYTES 256 #define AMD_KERNEL_CODE_ALIGN_BYTES 64 #define AMD_KERNEL_CODE_ALIGN __ALIGNED__(AMD_KERNEL_CODE_ALIGN_BYTES) typedef AMD_KERNEL_CODE_ALIGN struct amd_kernel_code_s { amd_kernel_code_version32_t amd_kernel_code_version_major; amd_kernel_code_version32_t amd_kernel_code_version_minor; amd_machine_kind16_t amd_machine_kind; amd_machine_version16_t amd_machine_version_major; amd_machine_version16_t amd_machine_version_minor; amd_machine_version16_t amd_machine_version_stepping; int64_t kernel_code_entry_byte_offset; int64_t kernel_code_prefetch_byte_offset; uint64_t kernel_code_prefetch_byte_size; uint64_t max_scratch_backing_memory_byte_size; amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1; amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2; amd_kernel_code_properties32_t kernel_code_properties; uint32_t workitem_private_segment_byte_size; uint32_t workgroup_group_segment_byte_size; uint32_t gds_segment_byte_size; uint64_t kernarg_segment_byte_size; uint32_t workgroup_fbarrier_count; uint16_t wavefront_sgpr_count; uint16_t workitem_vgpr_count; uint16_t reserved_vgpr_first; uint16_t reserved_vgpr_count; uint16_t reserved_sgpr_first; uint16_t reserved_sgpr_count; uint16_t debug_wavefront_private_segment_offset_sgpr; uint16_t debug_private_segment_buffer_sgpr; amd_powertwo8_t kernarg_segment_alignment; amd_powertwo8_t group_segment_alignment; 
amd_powertwo8_t private_segment_alignment; amd_powertwo8_t wavefront_size; int32_t call_convention; uint8_t reserved1[12]; uint64_t runtime_loader_kernel_symbol; amd_control_directives_t control_directives; } amd_kernel_code_t; // TODO: this struct should be completely gone once debugger designs/implements // Debugger APIs. typedef struct amd_runtime_loader_debug_info_s { const void* elf_raw; size_t elf_size; const char *kernel_name; const void *owning_segment; } amd_runtime_loader_debug_info_t; #endif // AMD_HSA_KERNEL_CODE_H ROCm-Device-Libs-rocm-5.0.0/ockl/inc/amd_hsa_queue.h000066400000000000000000000070241415221260100220050ustar00rootroot00000000000000//////////////////////////////////////////////////////////////////////////////// // // The University of Illinois/NCSA // Open Source License (NCSA) // // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. // // Developed by: // // AMD Research and AMD HSA Software Development // // Advanced Micro Devices, Inc. // // www.amd.com // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimers in // the documentation and/or other materials provided with the distribution. 
// - Neither the names of Advanced Micro Devices, Inc, // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS WITH THE SOFTWARE. // //////////////////////////////////////////////////////////////////////////////// #ifndef AMD_HSA_QUEUE_H #define AMD_HSA_QUEUE_H #include "amd_hsa_common.h" #include "hsa.h" // AMD Queue Properties. typedef uint32_t amd_queue_properties32_t; enum amd_queue_properties_t { AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 4, 28) }; // AMD Queue. 
#define AMD_QUEUE_ALIGN_BYTES 64 #define AMD_QUEUE_ALIGN __ALIGNED__(AMD_QUEUE_ALIGN_BYTES) typedef struct AMD_QUEUE_ALIGN amd_queue_s { hsa_queue_t hsa_queue; uint32_t reserved1[4]; volatile uint64_t write_dispatch_id; uint32_t group_segment_aperture_base_hi; uint32_t private_segment_aperture_base_hi; uint32_t max_cu_id; uint32_t max_wave_id; volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1; volatile uint32_t legacy_doorbell_lock; uint32_t reserved2[9]; volatile uint64_t read_dispatch_id; uint32_t read_dispatch_id_field_base_byte_offset; uint32_t compute_tmpring_size; uint32_t scratch_resource_descriptor[4]; uint64_t scratch_backing_memory_location; uint64_t scratch_backing_memory_byte_size; uint32_t scratch_workitem_byte_size; amd_queue_properties32_t queue_properties; uint32_t reserved3[2]; hsa_signal_t queue_inactive_signal; uint32_t reserved4[14]; } amd_queue_t; #endif // AMD_HSA_QUEUE_H ROCm-Device-Libs-rocm-5.0.0/ockl/inc/amd_hsa_signal.h000066400000000000000000000060041415221260100221330ustar00rootroot00000000000000//////////////////////////////////////////////////////////////////////////////// // // The University of Illinois/NCSA // Open Source License (NCSA) // // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. // // Developed by: // // AMD Research and AMD HSA Software Development // // Advanced Micro Devices, Inc. // // www.amd.com // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. 
// - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimers in // the documentation and/or other materials provided with the distribution. // - Neither the names of Advanced Micro Devices, Inc, // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS WITH THE SOFTWARE. // //////////////////////////////////////////////////////////////////////////////// #ifndef AMD_HSA_SIGNAL_H #define AMD_HSA_SIGNAL_H #include "amd_hsa_common.h" #include "amd_hsa_queue.h" // AMD Signal Kind Enumeration Values. typedef int64_t amd_signal_kind64_t; enum amd_signal_kind_t { AMD_SIGNAL_KIND_INVALID = 0, AMD_SIGNAL_KIND_USER = 1, AMD_SIGNAL_KIND_DOORBELL = -1, AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2 }; // AMD Signal. 
#define AMD_SIGNAL_ALIGN_BYTES 64 #define AMD_SIGNAL_ALIGN __ALIGNED__(AMD_SIGNAL_ALIGN_BYTES) typedef struct AMD_SIGNAL_ALIGN amd_signal_s { amd_signal_kind64_t kind; union { volatile int64_t value; #ifdef DEVICE_COMPILER __global #endif volatile uint32_t* legacy_hardware_doorbell_ptr; #ifdef DEVICE_COMPILER __global #endif volatile uint64_t* hardware_doorbell_ptr; }; uint64_t event_mailbox_ptr; uint32_t event_id; uint32_t reserved1; uint64_t start_ts; uint64_t end_ts; union { #ifdef DEVICE_COMPILER __global #endif amd_queue_t* queue_ptr; uint64_t reserved2; }; uint32_t reserved3[2]; } amd_signal_t; #endif // AMD_HSA_SIGNAL_H ROCm-Device-Libs-rocm-5.0.0/ockl/inc/device_amd_hsa.h000066400000000000000000000016411415221260100221170ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #ifndef DEVICE_AMD_HSA_H #define DEVICE_AMD_HSA_H typedef char int8_t; typedef unsigned char uint8_t; typedef short int16_t; typedef unsigned short uint16_t; typedef int int32_t; typedef unsigned int uint32_t; typedef long int64_t; typedef unsigned long uint64_t; #ifdef __LP64__ #undef __LP64__ #endif #define __LP64__ #define DEVICE_COMPILER #define LITTLEENDIAN_CPU #include "hsa.h" #include "amd_hsa_common.h" #include "amd_hsa_elf.h" #include "amd_hsa_kernel_code.h" #include "amd_hsa_queue.h" #include "amd_hsa_signal.h" #include "device_amd_hsa.h" #undef DEVICE_COMPILER #endif // DEVICE_AMD_HSA_H ROCm-Device-Libs-rocm-5.0.0/ockl/inc/hsa.h000066400000000000000000003673671415221260100200030ustar00rootroot00000000000000//////////////////////////////////////////////////////////////////////////////// // // The University of Illinois/NCSA // Open Source License (NCSA) // // Copyright (c) 2014-2015, Advanced Micro Devices, 
Inc. All rights reserved. // // Developed by: // // AMD Research and AMD HSA Software Development // // Advanced Micro Devices, Inc. // // www.amd.com // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimers in // the documentation and/or other materials provided with the distribution. // - Neither the names of Advanced Micro Devices, Inc, // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS WITH THE SOFTWARE. 
// //////////////////////////////////////////////////////////////////////////////// #ifndef HSA_RUNTIME_INC_HSA_H_ #define HSA_RUNTIME_INC_HSA_H_ #ifndef DEVICE_COMPILER #include /* size_t */ #include /* uintXX_t */ #ifndef __cplusplus #include #endif /* __cplusplus */ #endif // Placeholder for calling convention and import/export macros #ifndef HSA_CALL #define HSA_CALL #endif #ifndef HSA_EXPORT_DECORATOR #ifdef __GNUC__ #define HSA_EXPORT_DECORATOR __attribute__ ((visibility ("default"))) #else #define HSA_EXPORT_DECORATOR #endif #endif #define HSA_API_EXPORT HSA_EXPORT_DECORATOR HSA_CALL #define HSA_API_IMPORT HSA_CALL #if !defined(HSA_API) && defined(HSA_EXPORT) #define HSA_API HSA_API_EXPORT #else #define HSA_API HSA_API_IMPORT #endif // Detect and set large model builds. #undef HSA_LARGE_MODEL #if defined(__LP64__) || defined(_M_X64) #define HSA_LARGE_MODEL #endif // Try to detect CPU endianness #if !defined(LITTLEENDIAN_CPU) && !defined(BIGENDIAN_CPU) #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ defined(_M_X64) #define LITTLEENDIAN_CPU #endif #endif #undef HSA_LITTLE_ENDIAN #if defined(LITTLEENDIAN_CPU) #define HSA_LITTLE_ENDIAN #elif defined(BIGENDIAN_CPU) #else #error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined" #endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /** \defgroup status Runtime Notifications * @{ */ /** * @brief Status codes. */ typedef enum { /** * The function has been executed successfully. */ HSA_STATUS_SUCCESS = 0x0, /** * A traversal over a list of elements has been interrupted by the * application before completing. */ HSA_STATUS_INFO_BREAK = 0x1, /** * A generic error has occurred. */ HSA_STATUS_ERROR = 0x1000, /** * One of the actual arguments does not meet a precondition stated in the * documentation of the corresponding formal argument. */ HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001, /** * The requested queue creation is not valid. 
*/ HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002, /** * The requested allocation is not valid. */ HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003, /** * The agent is invalid. */ HSA_STATUS_ERROR_INVALID_AGENT = 0x1004, /** * The memory region is invalid. */ HSA_STATUS_ERROR_INVALID_REGION = 0x1005, /** * The signal is invalid. */ HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006, /** * The queue is invalid. */ HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007, /** * The HSA runtime failed to allocate the necessary resources. This error * may also occur when the HSA runtime needs to spawn threads or create * internal OS-specific events. */ HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008, /** * The AQL packet is malformed. */ HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009, /** * An error has been detected while releasing a resource. */ HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A, /** * An API other than ::hsa_init has been invoked while the reference count * of the HSA runtime is 0. */ HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, /** * The maximum reference count for the object has been reached. */ HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C, /** * The arguments passed to a functions are not compatible. */ HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D, /** * The index is invalid. */ HSA_STATUS_ERROR_INVALID_INDEX = 0x100E, /** * The instruction set architecture is invalid. */ HSA_STATUS_ERROR_INVALID_ISA = 0x100F, /** * The instruction set architecture name is invalid. */ HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017, /** * The code object is invalid. */ HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, /** * The executable is invalid. */ HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011, /** * The executable is frozen. */ HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012, /** * There is no symbol with the given name. */ HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013, /** * The variable is already defined. */ HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014, /** * The variable is undefined. 
*/ HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015, /** * An HSAIL operation resulted on a hardware exception. */ HSA_STATUS_ERROR_EXCEPTION = 0x1016 } hsa_status_t; /** * @brief Query additional information about a status code. * * @param[in] status Status code. * * @param[out] status_string A NUL-terminated string that describes the error * status. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p status is an invalid * status code, or @p status_string is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_status_string(hsa_status_t status, const char **status_string); #endif /** @} */ /** \defgroup common Common Definitions * @{ */ /** * @brief Three-dimensional coordinate. */ typedef struct hsa_dim3_s { /** * X dimension. */ uint32_t x; /** * Y dimension. */ uint32_t y; /** * Z dimension. */ uint32_t z; } hsa_dim3_t; /** * @brief Access permissions. */ typedef enum { /** * Read-only access. */ HSA_ACCESS_PERMISSION_RO = 1, /** * Write-only access. */ HSA_ACCESS_PERMISSION_WO = 2, /** * Read and write access. */ HSA_ACCESS_PERMISSION_RW = 3 } hsa_access_permission_t; /** @} **/ /** \defgroup initshutdown Initialization and Shut Down * @{ */ /** * @brief Initialize the HSA runtime. * * @details Initializes the HSA runtime if it is not already initialized, and * increases the reference counter associated with the HSA runtime for the * current process. Invocation of any HSA function other than ::hsa_init results * in undefined behavior if the current HSA runtime reference counter is less * than one. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate * the resources required by the implementation. 
* * @retval ::HSA_STATUS_ERROR_REFCOUNT_OVERFLOW The HSA runtime reference * count reaches INT32_MAX. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_init(); #endif /** * @brief Shut down the HSA runtime. * * @details Decreases the reference count of the HSA runtime instance. When the * reference count reaches 0, the HSA runtime is no longer considered valid * but the application might call ::hsa_init to initialize the HSA runtime * again. * * Once the reference count of the HSA runtime reaches 0, all the resources * associated with it (queues, signals, agent information, etc.) are * considered invalid and any attempt to reference them in subsequent API calls * results in undefined behavior. When the reference count reaches 0, the HSA * runtime may release resources associated with it. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_shut_down(); #endif /** @} **/ /** \defgroup agentinfo System and Agent Information * @{ */ /** * @brief Endianness. A convention used to interpret the bytes making up a data * word. */ typedef enum { /** * The least significant byte is stored in the smallest address. */ HSA_ENDIANNESS_LITTLE = 0, /** * The most significant byte is stored in the smallest address. */ HSA_ENDIANNESS_BIG = 1 } hsa_endianness_t; /** * @brief Machine model. A machine model determines the size of certain data * types in HSA runtime and an agent. */ typedef enum { /** * Small machine model. Addresses use 32 bits. */ HSA_MACHINE_MODEL_SMALL = 0, /** * Large machine model. Addresses use 64 bits. */ HSA_MACHINE_MODEL_LARGE = 1 } hsa_machine_model_t; /** * @brief Profile. A profile indicates a particular level of feature * support. 
For example, in the base profile the application must use the HSA * runtime allocator to reserve Shared Virtual Memory, while in the full profile * any host pointer can be shared across all the agents. */ typedef enum { /** * Base profile. */ HSA_PROFILE_BASE = 0, /** * Full profile. */ HSA_PROFILE_FULL = 1 } hsa_profile_t; /** * @brief System attributes. */ typedef enum { /** * Major version of the HSA runtime specification supported by the * implementation. The type of this attribute is uint16_t. */ HSA_SYSTEM_INFO_VERSION_MAJOR = 0, /** * Minor version of the HSA runtime specification supported by the * implementation. The type of this attribute is uint16_t. */ HSA_SYSTEM_INFO_VERSION_MINOR = 1, /** * Current timestamp. The value of this attribute monotonically increases at a * constant rate. The type of this attribute is uint64_t. */ HSA_SYSTEM_INFO_TIMESTAMP = 2, /** * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is * in the range 1-400MHz. The type of this attribute is uint64_t. */ HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3, /** * Maximum duration of a signal wait operation. Expressed as a count based on * the timestamp frequency. The type of this attribute is uint64_t. */ HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4, /** * Endianness of the system. The type of this attribute us ::hsa_endianness_t. */ HSA_SYSTEM_INFO_ENDIANNESS = 5, /** * Machine model supported by the HSA runtime. The type of this attribute is * ::hsa_machine_model_t. */ HSA_SYSTEM_INFO_MACHINE_MODEL = 6, /** * Bit-mask indicating which extensions are supported by the * implementation. An extension with an ID of @p i is supported if the bit at * position @p i is set. The type of this attribute is uint8_t[128]. */ HSA_SYSTEM_INFO_EXTENSIONS = 7 } hsa_system_info_t; /** * @brief Get the current value of a system attribute. * * @param[in] attribute Attribute to query. * * @param[out] value Pointer to an application-allocated buffer where to store * the value of the attribute. 
If the buffer passed by the application is not * large enough to hold the value of @p attribute, the behavior is undefined. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid * system attribute, or @p value is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_system_get_info(hsa_system_info_t attribute, void *value); #endif /** * @brief HSA extensions. */ typedef enum { /** * Finalizer extension. */ HSA_EXTENSION_FINALIZER = 0, /** * Images extension. */ HSA_EXTENSION_IMAGES = 1, /** * Profiler extension. */ HSA_EXTENSION_AMD_PROFILER = 2, /** * Loaded code object extension. */ HSA_EXTENSION_AMD_LOADED_CODE_OBJECT = 3 } hsa_extension_t; /** * @brief Query if a given version of an extension is supported by the HSA * implementation. * * @param[in] extension Extension identifier. * * @param[in] version_major Major version number. * * @param[in] version_minor Minor version number. * * @param[out] result Pointer to a memory location where the HSA runtime stores * the result of the check. The result is true if the specified version of the * extension is supported, and false otherwise. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid * extension, or @p result is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_system_extension_supported(uint16_t extension, uint16_t version_major, uint16_t version_minor, bool *result); #endif /** * @brief Retrieve the function pointers corresponding to a given version of an * extension. 
Portable applications are expected to invoke the extension API * using the returned function pointers * * @details The application is responsible for verifying that the given version * of the extension is supported by the HSA implementation (see * ::hsa_system_extension_supported). If the given combination of extension, * major version, and minor version is not supported by the implementation, the * behavior is undefined. * * @param[in] extension Extension identifier. * * @param[in] version_major Major version number for which to retrieve the * function pointer table. * * @param[in] version_minor Minor version number for which to retrieve the * function pointer table. * * @param[out] table Pointer to an application-allocated function pointer table * that is populated by the HSA runtime. Must not be NULL. The memory associated * with table can be reused or freed after the function returns. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid * extension, or @p table is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_system_get_extension_table(uint16_t extension, uint16_t version_major, uint16_t version_minor, void *table); #endif /** * @brief Opaque handle representing an agent, a device that participates in * the HSA memory model. An agent can submit AQL packets for execution, and * may also accept AQL packets for execution (agent dispatch packets or kernel * dispatch packets launching HSAIL-derived binaries). */ typedef struct hsa_agent_s { /** * Opaque handle. */ uint64_t handle; } hsa_agent_t; /** * @brief Agent features. */ typedef enum { /** * The agent supports AQL packets of kernel dispatch type. If this * feature is enabled, the agent is also a kernel agent. */ HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1, /** * The agent supports AQL packets of agent dispatch type. 
*/ HSA_AGENT_FEATURE_AGENT_DISPATCH = 2 } hsa_agent_feature_t; /** * @brief Hardware device type. */ typedef enum { /** * CPU device. */ HSA_DEVICE_TYPE_CPU = 0, /** * GPU device. */ HSA_DEVICE_TYPE_GPU = 1, /** * DSP device. */ HSA_DEVICE_TYPE_DSP = 2 } hsa_device_type_t; /** * @brief Default floating-point rounding mode. */ typedef enum { /** * Use a default floating-point rounding mode specified elsewhere. */ HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0, /** * Operations that specify the default floating-point mode are rounded to zero * by default. */ HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1, /** * Operations that specify the default floating-point mode are rounded to the * nearest representable number and that ties should be broken by selecting * the value with an even least significant bit. */ HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2 } hsa_default_float_rounding_mode_t; /** * @brief Agent attributes. */ typedef enum { /** * Agent name. The type of this attribute is a NUL-terminated char[64]. If * the name of the agent uses less than 63 characters, the rest of the * array must be filled with NULs. */ HSA_AGENT_INFO_NAME = 0, /** * Name of vendor. The type of this attribute is a NUL-terminated char[64]. If * the name of the vendor uses less than 63 characters, the rest of the array * must be filled with NULs. */ HSA_AGENT_INFO_VENDOR_NAME = 1, /** * Agent capability. The type of this attribute is ::hsa_agent_feature_t. */ HSA_AGENT_INFO_FEATURE = 2, /** * Machine model supported by the agent. The type of this attribute is * ::hsa_machine_model_t. */ HSA_AGENT_INFO_MACHINE_MODEL = 3, /** * Profile supported by the agent. The type of this attribute is * ::hsa_profile_t. */ HSA_AGENT_INFO_PROFILE = 4, /** * Default floating-point rounding mode. The type of this attribute is * ::hsa_default_float_rounding_mode_t, but the value * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed. 
*/ HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5, /** * Default floating-point rounding modes supported by the agent in the Base * profile. The type of this attribute is a mask of * ::hsa_default_float_rounding_mode_t. The default floating-point rounding * mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not be set. */ HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23, /** * Flag indicating that the f16 HSAIL operation is at least as fast as the * f32 operation in the current agent. The value of this attribute is * undefined if the agent is not a kernel agent. The type of this * attribute is bool. */ HSA_AGENT_INFO_FAST_F16_OPERATION = 24, /** * Number of work-items in a wavefront. Must be a power of 2 in the range * [1,256]. The value of this attribute is undefined if the agent is not * a kernel agent. The type of this attribute is uint32_t. */ HSA_AGENT_INFO_WAVEFRONT_SIZE = 6, /** * Maximum number of work-items of each dimension of a work-group. Each * maximum must be greater than 0. No maximum can exceed the value of * ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is * undefined if the agent is not a kernel agent. The type of this * attribute is uint16_t[3]. */ HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7, /** * Maximum total number of work-items in a work-group. The value of this * attribute is undefined if the agent is not a kernel agent. The type * of this attribute is uint32_t. */ HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8, /** * Maximum number of work-items of each dimension of a grid. Each maximum must * be greater than 0, and must not be smaller than the corresponding value in * ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of * ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined if * the agent is not a kernel agent. The type of this attribute is * ::hsa_dim3_t. */ HSA_AGENT_INFO_GRID_MAX_DIM = 9, /** * Maximum total number of work-items in a grid. 
The value of this attribute * is undefined if the agent is not a kernel agent. The type of this * attribute is uint32_t. */ HSA_AGENT_INFO_GRID_MAX_SIZE = 10, /** * Maximum number of fbarriers per work-group. Must be at least 32. The value * of this attribute is undefined if the agent is not a kernel agent. The * type of this attribute is uint32_t. */ HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11, /** * Maximum number of queues that can be active (created but not destroyed) at * one time in the agent. The type of this attribute is uint32_t. */ HSA_AGENT_INFO_QUEUES_MAX = 12, /** * Minimum number of packets that a queue created in the agent * can hold. Must be a power of 2 greater than 0. Must not exceed * the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this * attribute is uint32_t. */ HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13, /** * Maximum number of packets that a queue created in the agent can * hold. Must be a power of 2 greater than 0. The type of this attribute * is uint32_t. */ HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14, /** * Type of a queue created in the agent. The type of this attribute is * ::hsa_queue_type_t. */ HSA_AGENT_INFO_QUEUE_TYPE = 15, /** * Identifier of the NUMA node associated with the agent. The type of this * attribute is uint32_t. */ HSA_AGENT_INFO_NODE = 16, /** * Type of hardware device associated with the agent. The type of this * attribute is ::hsa_device_type_t. */ HSA_AGENT_INFO_DEVICE = 17, /** * Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size * of 0 for a particular level indicates that there is no cache information * for that level. The type of this attribute is uint32_t[4]. */ HSA_AGENT_INFO_CACHE_SIZE = 18, /** * Instruction set architecture of the agent. The type of this attribute * is ::hsa_isa_t. */ HSA_AGENT_INFO_ISA = 19, /** * Bit-mask indicating which extensions are supported by the agent. An * extension with an ID of @p i is supported if the bit at position @p i is * set. 
The type of this attribute is uint8_t[128]. */ HSA_AGENT_INFO_EXTENSIONS = 20, /** * Major version of the HSA runtime specification supported by the * agent. The type of this attribute is uint16_t. */ HSA_AGENT_INFO_VERSION_MAJOR = 21, /** * Minor version of the HSA runtime specification supported by the * agent. The type of this attribute is uint16_t. */ HSA_AGENT_INFO_VERSION_MINOR = 22 } hsa_agent_info_t; /** * @brief Get the current value of an attribute for a given agent. * * @param[in] agent A valid agent. * * @param[in] attribute Attribute to query. * * @param[out] value Pointer to an application-allocated buffer where to store * the value of the attribute. If the buffer passed by the application is not * large enough to hold the value of @p attribute, the behavior is undefined. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid * agent attribute, or @p value is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_agent_get_info(hsa_agent_t agent, hsa_agent_info_t attribute, void *value); #endif /** * @brief Iterate over the available agents, and invoke an * application-defined callback on every iteration. * * @param[in] callback Callback to be invoked once per agent. The HSA * runtime passes two arguments to the callback, the agent and the * application data. If @p callback returns a status other than * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and * ::hsa_iterate_agents returns that status value. * * @param[in] data Application data that is passed to @p callback on every * iteration. May be NULL. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. 
* * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data); #endif /* // If we do not know the size of an attribute, we need to query it first // Note: this API will not be in the spec unless needed hsa_status_t HSA_API hsa_agent_get_info_size( hsa_agent_t agent, hsa_agent_info_t attribute, size_t* size); // Set the value of an agent attribute // Note: this API will not be in the spec unless needed hsa_status_t HSA_API hsa_agent_set_info( hsa_agent_t agent, hsa_agent_info_t attribute, void* value); */ /** * @brief Exception policies applied in the presence of hardware exceptions. */ typedef enum { /** * If a hardware exception is detected, a work-item signals an exception. */ HSA_EXCEPTION_POLICY_BREAK = 1, /** * If a hardware exception is detected, a hardware status bit is set. */ HSA_EXCEPTION_POLICY_DETECT = 2 } hsa_exception_policy_t; /** * @brief Retrieve the exception policy support for a given combination of * agent and profile. * * @param[in] agent Agent. * * @param[in] profile Profile. * * @param[out] mask Pointer to a memory location where the HSA runtime stores a * mask of ::hsa_exception_policy_t values. Must not be NULL. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid * profile, or @p mask is NULL. * */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_agent_get_exception_policies(hsa_agent_t agent, hsa_profile_t profile, uint16_t *mask); #endif /** * @brief Query if a given version of an extension is supported by an agent * * @param[in] extension Extension identifier. * * @param[in] agent Agent. * * @param[in] version_major Major version number.
* * @param[in] version_minor Minor version number. * * @param[out] result Pointer to a memory location where the HSA runtime stores * the result of the check. The result is true if the specified version of the * extension is supported, and false otherwise. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid * extension, or @p result is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_agent_extension_supported(uint16_t extension, hsa_agent_t agent, uint16_t version_major, uint16_t version_minor, bool *result); #endif /** @} */ /** \defgroup signals Signals * @{ */ /** * @brief Signal handle. */ typedef struct hsa_signal_s { /** * Opaque handle. The value 0 is reserved. */ uint64_t handle; } hsa_signal_t; /** * @brief Signal value. The value occupies 32 bits in small machine mode, and 64 * bits in large machine mode. */ #ifdef HSA_LARGE_MODEL typedef int64_t hsa_signal_value_t; #else typedef int32_t hsa_signal_value_t; #endif /** * @brief Create a signal. * * @param[in] initial_value Initial value of the signal. * * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that * any agent might wait on the signal. * * @param[in] consumers List of agents that might consume (wait on) the * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the * HSA runtime might use the list to optimize the handling of the signal * object. If an agent not listed in @p consumers waits on the returned * signal, the behavior is undefined. The memory associated with @p consumers * can be reused or freed after the function returns. * * @param[out] signal Pointer to a memory location where the HSA runtime will * store the newly created signal handle. 
* * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate the * resources required by the implementation. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers * contains duplicates. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers, const hsa_agent_t *consumers, hsa_signal_t *signal); #endif /** * @brief Destroy a signal previously created by ::hsa_signal_create. * * @param[in] signal Signal. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The handle in @p signal is 0. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_signal_destroy(hsa_signal_t signal); #endif /** * @brief Atomically read the current value of a signal. * * @param[in] signal Signal. * * @return Value of the signal. */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_load_acquire(hsa_signal_t signal); #endif /** * @copydoc hsa_signal_load_acquire */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_load_relaxed(hsa_signal_t signal); #endif /** * @brief Atomically set the value of a signal. * * @details If the value of the signal is changed, all the agents waiting * on @p signal for which @p value satisfies their wait condition are awakened. * * @param[in] signal Signal. * * @param[in] value New signal value.
*/ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_store_relaxed(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_store_relaxed */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_store_release(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @brief Atomically set the value of a signal and return its previous value. * * @details If the value of the signal is changed, all the agents waiting * on @p signal for which @p value satisfies their wait condition are awakened. * * @param[in] signal Signal. If @p signal is a queue doorbell signal, the * behavior is undefined. * * @param[in] value New value. * * @return Value of the signal prior to the exchange. * */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_exchange_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_exchange_acq_rel */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_exchange_acquire(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_exchange_acq_rel */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_exchange_relaxed(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_exchange_acq_rel */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_exchange_release(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @brief Atomically set the value of a signal if the observed value is equal to * the expected value. The observed value is returned regardless of whether the * replacement was done. * * @details If the value of the signal is changed, all the agents waiting * on @p signal for which @p value satisfies their wait condition are awakened. * * @param[in] signal Signal. If @p signal is a queue * doorbell signal, the behavior is undefined. * * @param[in] expected Value to compare with. * * @param[in] value New value. * * @return Observed value of the signal. 
* */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_cas_acq_rel(hsa_signal_t signal, hsa_signal_value_t expected, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_cas_acq_rel */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_cas_acquire(hsa_signal_t signal, hsa_signal_value_t expected, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_cas_acq_rel */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_cas_relaxed(hsa_signal_t signal, hsa_signal_value_t expected, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_cas_acq_rel */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_cas_release(hsa_signal_t signal, hsa_signal_value_t expected, hsa_signal_value_t value); #endif /** * @brief Atomically increment the value of a signal by a given amount. * * @details If the value of the signal is changed, all the agents waiting on * @p signal for which @p value satisfies their wait condition are awakened. * * @param[in] signal Signal. If @p signal is a queue doorbell signal, the * behavior is undefined. * * @param[in] value Value to add to the value of the signal. * */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_add_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_add_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_add_acquire(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_add_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_add_relaxed(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_add_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_add_release(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @brief Atomically decrement the value of a signal by a given amount. * * @details If the value of the signal is changed, all the agents waiting on * @p signal for which @p value satisfies their wait condition are awakened. * * @param[in] signal Signal. 
If @p signal is a queue doorbell signal, the * behavior is undefined. * * @param[in] value Value to subtract from the value of the signal. * */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_subtract_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_subtract_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_subtract_acquire(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_subtract_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_subtract_relaxed(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_subtract_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_subtract_release(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @brief Atomically perform a bitwise AND operation between the value of a * signal and a given value. * * @details If the value of the signal is changed, all the agents waiting on * @p signal for which @p value satisfies their wait condition are awakened. * * @param[in] signal Signal. If @p signal is a queue doorbell signal, the * behavior is undefined. * * @param[in] value Value to AND with the value of the signal. * */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_and_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_and_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_and_acquire(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_and_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_and_relaxed(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_and_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_and_release(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @brief Atomically perform a bitwise OR operation between the value of a * signal and a given value. 
* * @details If the value of the signal is changed, all the agents waiting on * @p signal for which @p value satisfies their wait condition are awakened. * * @param[in] signal Signal. If @p signal is a queue doorbell signal, the * behavior is undefined. * * @param[in] value Value to OR with the value of the signal. */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_or_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_or_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_or_acquire(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_or_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_or_relaxed(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_or_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_or_release(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @brief Atomically perform a bitwise XOR operation between the value of a * signal and a given value. * * @details If the value of the signal is changed, all the agents waiting on * @p signal for which @p value satisfies their wait condition are awakened. * * @param[in] signal Signal. If @p signal is a queue doorbell signal, the * behavior is undefined. * * @param[in] value Value to XOR with the value of the signal. * */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_xor_acq_rel(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_xor_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_xor_acquire(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_xor_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_xor_relaxed(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @copydoc hsa_signal_xor_acq_rel */ #ifndef DEVICE_COMPILER void HSA_API hsa_signal_xor_release(hsa_signal_t signal, hsa_signal_value_t value); #endif /** * @brief Wait condition operator. */ typedef enum { /** * The two operands are equal. 
*/ HSA_SIGNAL_CONDITION_EQ = 0, /** * The two operands are not equal. */ HSA_SIGNAL_CONDITION_NE = 1, /** * The first operand is less than the second operand. */ HSA_SIGNAL_CONDITION_LT = 2, /** * The first operand is greater than or equal to the second operand. */ HSA_SIGNAL_CONDITION_GTE = 3 } hsa_signal_condition_t; /** * @brief State of the application thread during a signal wait. */ typedef enum { /** * The application thread may be rescheduled while waiting on the signal. */ HSA_WAIT_STATE_BLOCKED = 0, /** * The application thread stays active while waiting on a signal. */ HSA_WAIT_STATE_ACTIVE = 1 } hsa_wait_state_t; /** * @brief Wait until a signal value satisfies a specified condition, or a * certain amount of time has elapsed. * * @details A wait operation can spuriously resume at any time sooner than the * timeout (for example, due to system or other external factors) even when the * condition has not been met. * * The function is guaranteed to return if the signal value satisfies the * condition at some point in time during the wait, but the value returned to * the application might not satisfy the condition. The application must ensure * that signals are used in such way that wait wakeup conditions are not * invalidated before dependent threads have woken up. * * When the wait operation internally loads the value of the passed signal, it * uses the memory order indicated in the function name. * * @param[in] signal Signal. * * @param[in] condition Condition used to compare the signal value with @p * compare_value. * * @param[in] compare_value Value to compare with. * * @param[in] timeout_hint Maximum duration of the wait. Specified in the same * unit as the system timestamp. The operation might block for a shorter or * longer time even if the condition is not met. A value of UINT64_MAX indicates * no maximum. * * @param[in] wait_state_hint Hint used by the application to indicate the * preferred waiting state. 
The actual waiting state is ultimately decided by * HSA runtime and may not match the provided hint. A value of * ::HSA_WAIT_STATE_ACTIVE may improve the latency of response to a signal * update by avoiding rescheduling overhead. * * @return Observed value of the signal, which might not satisfy the specified * condition. * */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_wait_acquire(hsa_signal_t signal, hsa_signal_condition_t condition, hsa_signal_value_t compare_value, uint64_t timeout_hint, hsa_wait_state_t wait_state_hint); #endif /** * @copydoc hsa_signal_wait_acquire */ #ifndef DEVICE_COMPILER hsa_signal_value_t HSA_API hsa_signal_wait_relaxed(hsa_signal_t signal, hsa_signal_condition_t condition, hsa_signal_value_t compare_value, uint64_t timeout_hint, hsa_wait_state_t wait_state_hint); #endif /** @} */ /** \defgroup memory Memory * @{ */ /** * @brief A memory region represents a block of virtual memory with certain * properties. For example, the HSA runtime represents fine-grained memory in * the global segment using a region. A region might be associated with more * than one agent. */ typedef struct hsa_region_s { /** * Opaque handle. */ uint64_t handle; } hsa_region_t; /** @} */ /** \defgroup queue Queues * @{ */ /** * @brief Queue type. Intended to be used for dynamic queue protocol * determination. */ typedef enum { /** * Queue supports multiple producers. */ HSA_QUEUE_TYPE_MULTI = 0, /** * Queue only supports a single producer. */ HSA_QUEUE_TYPE_SINGLE = 1 } hsa_queue_type_t; /** * @brief Queue features. */ typedef enum { /** * Queue supports kernel dispatch packets. */ HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1, /** * Queue supports agent dispatch packets. */ HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2 } hsa_queue_feature_t; /** * @brief User mode queue. 
* * @details The queue structure is read-only and allocated by the HSA runtime, * but agents can directly modify the contents of the buffer pointed by @a * base_address, or use HSA runtime APIs to access the doorbell signal. * */ typedef struct hsa_queue_s { /** * Queue type. */ hsa_queue_type_t type; /** * Queue features mask. This is a bit-field of ::hsa_queue_feature_t * values. Applications should ignore any unknown set bits. */ uint32_t features; #ifdef HSA_LARGE_MODEL #ifdef DEVICE_COMPILER __global #endif void *base_address; #elif defined HSA_LITTLE_ENDIAN /** * Starting address of the HSA runtime-allocated buffer used to store the AQL * packets. Must be aligned to the size of an AQL packet. */ #ifdef DEVICE_COMPILER __global #endif void *base_address; /** * Reserved. Must be 0. */ uint32_t reserved0; #else uint32_t reserved0; #ifdef DEVICE_COMPILER __global #endif void *base_address; #endif /** * Signal object used by the application to indicate the ID of a packet that * is ready to be processed. The HSA runtime manages the doorbell signal. If * the application tries to replace or destroy this signal, the behavior is * undefined. * * If @a type is ::HSA_QUEUE_TYPE_SINGLE the doorbell signal value must be * updated in a monotonically increasing fashion. If @a type is * ::HSA_QUEUE_TYPE_MULTI, the doorbell signal value can be updated with any * value. */ hsa_signal_t doorbell_signal; /** * Maximum number of packets the queue can hold. Must be a power of 2. */ uint32_t size; /** * Reserved. Must be 0. */ uint32_t reserved1; /** * Queue identifier, which is unique over the lifetime of the application. */ uint64_t id; } hsa_queue_t; /** * @brief Create a user mode queue. * * @details The HSA runtime creates the queue structure, the underlying packet * buffer, the completion signal, and the write and read indexes. The initial * value of the write and read indexes is 0. The type of every packet in the * buffer is initialized to ::HSA_PACKET_TYPE_INVALID. 
* * The application should only rely on the error code returned to determine if * the queue is valid. * * @param[in] agent Agent where to create the queue. * * @param[in] size Number of packets the queue is expected to * hold. Must be a power of 2 between 1 and the value of * ::HSA_AGENT_INFO_QUEUE_MAX_SIZE in @p agent. The size of the newly * created queue is the maximum of @p size and the value of * ::HSA_AGENT_INFO_QUEUE_MIN_SIZE in @p agent. * * @param[in] type Type of the queue. If the value of * ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE, then @p * type must also be ::HSA_QUEUE_TYPE_SINGLE. * * @param[in] callback Callback invoked by the HSA runtime for every * asynchronous event related to the newly created queue. May be NULL. The HSA * runtime passes three arguments to the callback: a code identifying the event * that triggered the invocation, a pointer to the queue where the event * originated, and the application data. * * @param[in] data Application data that is passed to @p callback on every * iteration. May be NULL. * * @param[in] private_segment_size Hint indicating the maximum * expected private segment usage per work-item, in bytes. There may * be performance degradation if the application places a kernel * dispatch packet in the queue and the corresponding private segment * usage exceeds @p private_segment_size. If the application does not * want to specify any particular value for this argument, @p * private_segment_size must be UINT32_MAX. If the queue does not * support kernel dispatch packets, this argument is ignored. * * @param[in] group_segment_size Hint indicating the maximum expected * group segment usage per work-group, in bytes. There may be * performance degradation if the application places a kernel dispatch * packet in the queue and the corresponding group segment usage * exceeds @p group_segment_size. 
If the application does not want to * specify any particular value for this argument, @p * group_segment_size must be UINT32_MAX. If the queue does not * support kernel dispatch packets, this argument is ignored. * * @param[out] queue Memory location where the HSA runtime stores a pointer to * the newly created queue. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate * the resources required by the implementation. * * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p agent does not * support queues of the given type. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, * @p size is 0, @p type is an invalid queue type, or @p queue is NULL. * */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_queue_create(hsa_agent_t agent, uint32_t size, hsa_queue_type_t type, void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), void *data, uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t **queue); #endif /** * @brief Create a queue for which the application or a kernel is responsible * for processing the AQL packets. * * @details The application can use this function to create queues where AQL * packets are not parsed by the packet processor associated with an agent, * but rather by a unit of execution running on that agent (for example, a * thread in the host application). * * The application is responsible for ensuring that all the producers and * consumers of the resulting queue can access the provided doorbell signal * and memory region. The application is also responsible for ensuring that the * unit of execution processing the queue packets supports the indicated * features (AQL packet types). 
* * When the queue is created, the HSA runtime allocates the packet buffer using * @p region, and the write and read indexes. The initial value of the write and * read indexes is 0, and the type of every packet in the buffer is initialized * to ::HSA_PACKET_TYPE_INVALID. The value of the @e size, @e type, @e features, * and @e doorbell_signal fields in the returned queue match the values passed * by the application. * * @param[in] region Memory region that the HSA runtime should use to allocate * the AQL packet buffer and any other queue metadata. * * @param[in] size Number of packets the queue is expected to hold. Must be a * power of 2 greater than 0. * * @param[in] type Queue type. * * @param[in] features Supported queue features. This is a bit-field of * ::hsa_queue_feature_t values. * * @param[in] doorbell_signal Doorbell signal that the HSA runtime must * associate with the returned queue. The signal handle must not be 0. * * @param[out] queue Memory location where the HSA runtime stores a pointer to * the newly created queue. The application should not rely on the value * returned for this argument but only in the status code to determine if the * queue is valid. Must not be NULL. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is failure to allocate * the resources required by the implementation. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, @p * size is 0, @p type is an invalid queue type, the doorbell signal handle is * 0, or @p queue is NULL. * */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_soft_queue_create(hsa_region_t region, uint32_t size, hsa_queue_type_t type, uint32_t features, hsa_signal_t doorbell_signal, hsa_queue_t **queue); #endif /** * @brief Destroy a user mode queue. 
* * @details When a queue is destroyed, the state of the AQL packets that have * not been yet fully processed (their completion phase has not finished) * becomes undefined. It is the responsibility of the application to ensure that * all pending queue operations are finished if their results are required. * * The resources allocated by the HSA runtime during queue creation (queue * structure, ring buffer, doorbell signal) are released. The queue should not * be accessed after being destroyed. * * @param[in] queue Pointer to a queue created using ::hsa_queue_create. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_queue_destroy(hsa_queue_t *queue); #endif /** * @brief Inactivate a queue. * * @details Inactivating the queue aborts any pending executions and prevents any * new packets from being processed. Any more packets written to the queue once * it is inactivated will be ignored by the packet processor. * * @param[in] queue Pointer to a queue. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_queue_inactivate(hsa_queue_t *queue); #endif /** * @brief Atomically load the read index of a queue. * * @param[in] queue Pointer to a queue. * * @return Read index of the queue pointed by @p queue.
*/ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_load_read_index_acquire(const hsa_queue_t *queue); #endif /** * @copydoc hsa_queue_load_read_index_acquire */ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_load_read_index_relaxed(const hsa_queue_t *queue); #endif /** * @brief Atomically load the write index of a queue. * * @param[in] queue Pointer to a queue. * * @return Write index of the queue pointed by @p queue. */ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_load_write_index_acquire(const hsa_queue_t *queue); #endif /** * @copydoc hsa_queue_load_write_index_acquire */ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_load_write_index_relaxed(const hsa_queue_t *queue); #endif /** * @brief Atomically set the write index of a queue. * * @param[in] queue Pointer to a queue. * * @param[in] value Value to assign to the write index. * */ #ifndef DEVICE_COMPILER void HSA_API hsa_queue_store_write_index_relaxed(const hsa_queue_t *queue, uint64_t value); #endif /** * @copydoc hsa_queue_store_write_index_relaxed */ #ifndef DEVICE_COMPILER void HSA_API hsa_queue_store_write_index_release(const hsa_queue_t *queue, uint64_t value); #endif /** * @brief Atomically set the write index of a queue if the observed value is * equal to the expected value. The application can inspect the returned value * to determine if the replacement was done. * * @param[in] queue Pointer to a queue. * * @param[in] expected Expected value. * * @param[in] value Value to assign to the write index if @p expected matches * the observed write index. Must be greater than @p expected. * * @return Previous value of the write index. 
*/ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_cas_write_index_acq_rel(const hsa_queue_t *queue, uint64_t expected, uint64_t value); #endif /** * @copydoc hsa_queue_cas_write_index_acq_rel */ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_cas_write_index_acquire(const hsa_queue_t *queue, uint64_t expected, uint64_t value); #endif /** * @copydoc hsa_queue_cas_write_index_acq_rel */ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_cas_write_index_relaxed(const hsa_queue_t *queue, uint64_t expected, uint64_t value); #endif /** * @copydoc hsa_queue_cas_write_index_acq_rel */ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_cas_write_index_release(const hsa_queue_t *queue, uint64_t expected, uint64_t value); #endif /** * @brief Atomically increment the write index of a queue by an offset. * * @param[in] queue Pointer to a queue. * * @param[in] value Value to add to the write index. * * @return Previous value of the write index. */ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_add_write_index_acq_rel(const hsa_queue_t *queue, uint64_t value); #endif /** * @copydoc hsa_queue_add_write_index_acq_rel */ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_add_write_index_acquire(const hsa_queue_t *queue, uint64_t value); #endif /** * @copydoc hsa_queue_add_write_index_acq_rel */ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_add_write_index_relaxed(const hsa_queue_t *queue, uint64_t value); #endif /** * @copydoc hsa_queue_add_write_index_acq_rel */ #ifndef DEVICE_COMPILER uint64_t HSA_API hsa_queue_add_write_index_release(const hsa_queue_t *queue, uint64_t value); #endif /** * @brief Atomically set the read index of a queue. * * @details Modifications of the read index are not allowed and result in * undefined behavior if the queue is associated with an agent for which * only the corresponding packet processor is permitted to update the read * index. * * @param[in] queue Pointer to a queue. * * @param[in] value Value to assign to the read index. 
*
 */
#ifndef DEVICE_COMPILER
void HSA_API hsa_queue_store_read_index_relaxed(const hsa_queue_t *queue,
                                                uint64_t value);
#endif

/**
 * @copydoc hsa_queue_store_read_index_relaxed
 */
#ifndef DEVICE_COMPILER
void HSA_API hsa_queue_store_read_index_release(const hsa_queue_t *queue,
                                                uint64_t value);
#endif

/** @} */

/** \defgroup aql Architected Queuing Language
 *  @{
 */

/**
 * @brief Packet type.
 */
typedef enum {
  /**
   * Vendor-specific packet.
   */
  HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0,
  /**
   * The packet has been processed in the past, but has not been reassigned to
   * the packet processor. A packet processor must not process a packet of this
   * type. All queues support this packet type.
   */
  HSA_PACKET_TYPE_INVALID = 1,
  /**
   * Packet used by agents for dispatching jobs to kernel agents. Not all
   * queues support packets of this type (see ::hsa_queue_feature_t).
   */
  HSA_PACKET_TYPE_KERNEL_DISPATCH = 2,
  /**
   * Packet used by agents to delay processing of subsequent packets, and to
   * express complex dependencies between multiple packets. All queues support
   * this packet type.
   */
  HSA_PACKET_TYPE_BARRIER_AND = 3,
  /**
   * Packet used by agents for dispatching jobs to agents. Not all
   * queues support packets of this type (see ::hsa_queue_feature_t).
   */
  HSA_PACKET_TYPE_AGENT_DISPATCH = 4,
  /**
   * Packet used by agents to delay processing of subsequent packets, and to
   * express complex dependencies between multiple packets. All queues support
   * this packet type.
   */
  HSA_PACKET_TYPE_BARRIER_OR = 5
} hsa_packet_type_t;

/**
 * @brief Scope of the memory fence operation associated with a packet.
 */
typedef enum {
  /**
   * No scope (no fence is applied). The packet relies on external fences to
   * ensure visibility of memory updates.
   */
  HSA_FENCE_SCOPE_NONE = 0,
  /**
   * The fence is applied with agent scope for the global segment.
   */
  HSA_FENCE_SCOPE_AGENT = 1,
  /**
   * The fence is applied across both agent and system scope for the global
   * segment.
   */
  HSA_FENCE_SCOPE_SYSTEM = 2
} hsa_fence_scope_t;

/**
 * @brief Sub-fields of the @a header field that is present in any AQL
 * packet. The offset (with respect to the address of @a header) of a sub-field
 * is identical to its enumeration constant. The width of each sub-field is
 * determined by the corresponding value in ::hsa_packet_header_width_t. The
 * offset and the width are expressed in bits.
 */
typedef enum {
  /**
   * Packet type. The value of this sub-field must be one of
   * ::hsa_packet_type_t. If the type is ::HSA_PACKET_TYPE_VENDOR_SPECIFIC, the
   * packet layout is vendor-specific.
   */
  HSA_PACKET_HEADER_TYPE = 0,
  /**
   * Barrier bit. If the barrier bit is set, the processing of the current
   * packet only launches when all preceding packets (within the same queue) are
   * complete.
   */
  HSA_PACKET_HEADER_BARRIER = 8,
  /**
   * Acquire fence scope. The value of this sub-field determines the scope and
   * type of the memory fence operation applied before the packet enters the
   * active phase. An acquire fence ensures that any subsequent global segment
   * or image loads by any unit of execution that belongs to a dispatch that has
   * not yet entered the active phase on any queue of the same kernel agent,
   * sees any data previously released at the scopes specified by the acquire
   * fence. The value of this sub-field must be one of ::hsa_fence_scope_t.
   */
  HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9,
  /**
   * Release fence scope. The value of this sub-field determines the scope and
   * type of the memory fence operation applied after kernel completion but
   * before the packet is completed. A release fence makes any global segment or
   * image data that was stored by any unit of execution that belonged to a
   * dispatch that has completed the active phase on any queue of the same
   * kernel agent visible in all the scopes specified by the release fence. The
   * value of this sub-field must be one of ::hsa_fence_scope_t.
   */
  HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
} hsa_packet_header_t;

/**
 * @brief Width (in bits) of the sub-fields in ::hsa_packet_header_t.
 */
typedef enum {
  HSA_PACKET_HEADER_WIDTH_TYPE = 8,
  HSA_PACKET_HEADER_WIDTH_BARRIER = 1,
  HSA_PACKET_HEADER_WIDTH_ACQUIRE_FENCE_SCOPE = 2,
  HSA_PACKET_HEADER_WIDTH_RELEASE_FENCE_SCOPE = 2
} hsa_packet_header_width_t;

/**
 * @brief Sub-fields of the kernel dispatch packet @a setup field. The offset
 * (with respect to the address of @a setup) of a sub-field is identical to its
 * enumeration constant. The width of each sub-field is determined by the
 * corresponding value in ::hsa_kernel_dispatch_packet_setup_width_t. The
 * offset and the width are expressed in bits.
 */
typedef enum {
  /**
   * Number of dimensions of the grid. Valid values are 1, 2, or 3.
   */
  HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0
} hsa_kernel_dispatch_packet_setup_t;

/**
 * @brief Width (in bits) of the sub-fields in
 * ::hsa_kernel_dispatch_packet_setup_t.
 */
typedef enum {
  HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2
} hsa_kernel_dispatch_packet_setup_width_t;

/**
 * @brief AQL kernel dispatch packet.
 */
typedef struct hsa_kernel_dispatch_packet_s {
  /**
   * Packet header. Used to configure multiple packet parameters such as the
   * packet type. The parameters are described by ::hsa_packet_header_t.
   */
  uint16_t header;
  /**
   * Dispatch setup parameters. Used to configure kernel dispatch parameters
   * such as the number of dimensions in the grid. The parameters are described
   * by ::hsa_kernel_dispatch_packet_setup_t.
   */
  uint16_t setup;
  /**
   * X dimension of work-group, in work-items. Must be greater than 0.
   */
  uint16_t workgroup_size_x;
  /**
   * Y dimension of work-group, in work-items. Must be greater than
   * 0. If the grid has 1 dimension, the only valid value is 1.
   */
  uint16_t workgroup_size_y;
  /**
   * Z dimension of work-group, in work-items. Must be greater than
   * 0. If the grid has 1 or 2 dimensions, the only valid value is 1.
   */
  uint16_t workgroup_size_z;
  /**
   * Reserved. Must be 0.
   */
  uint16_t reserved0;
  /**
   * X dimension of grid, in work-items. Must be greater than 0. Must
   * not be smaller than @a workgroup_size_x.
   */
  uint32_t grid_size_x;
  /**
   * Y dimension of grid, in work-items. Must be greater than 0. If the grid has
   * 1 dimension, the only valid value is 1. Must not be smaller than @a
   * workgroup_size_y.
   */
  uint32_t grid_size_y;
  /**
   * Z dimension of grid, in work-items. Must be greater than 0. If the grid has
   * 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a
   * workgroup_size_z.
   */
  uint32_t grid_size_z;
  /**
   * Size in bytes of private memory allocation request (per work-item).
   */
  uint32_t private_segment_size;
  /**
   * Size in bytes of group memory allocation request (per work-group). Must not
   * be less than the sum of the group memory used by the kernel (and the
   * functions it calls directly or indirectly) and the dynamically allocated
   * group segment variables.
   */
  uint32_t group_segment_size;
  /**
   * Opaque handle to a code object that includes an implementation-defined
   * executable code for the kernel.
   */
  uint64_t kernel_object;

#ifdef HSA_LARGE_MODEL
  /* HSA_LARGE_MODEL: kernarg_address is declared alone; this branch has no
   * reserved1 field. The field is documented in the HSA_LITTLE_ENDIAN branch
   * below. */
#ifdef DEVICE_COMPILER
  __global
#endif
  void *kernarg_address;
#elif defined HSA_LITTLE_ENDIAN
  /**
   * Pointer to a buffer containing the kernel arguments. May be NULL.
   *
   * The buffer must be allocated using ::hsa_memory_allocate, and must not be
   * modified once the kernel dispatch packet is enqueued until the dispatch has
   * completed execution.
   */
#ifdef DEVICE_COMPILER
  __global
#endif
  void *kernarg_address;
  /**
   * Reserved. Must be 0.
   */
  uint32_t reserved1;
#else
  /* Neither HSA_LARGE_MODEL nor HSA_LITTLE_ENDIAN is defined: reserved1 is
   * declared before kernarg_address, the reverse of the branch above. */
  uint32_t reserved1;
#ifdef DEVICE_COMPILER
  __global
#endif
  void *kernarg_address;
#endif
  /**
   * Reserved. Must be 0.
   */
  uint64_t reserved2;
  /**
   * Signal used to indicate completion of the job. The application can use the
   * special signal handle 0 to indicate that no signal is used.
   */
  hsa_signal_t completion_signal;
} hsa_kernel_dispatch_packet_t;

/**
 * @brief Agent dispatch packet.
 */
typedef struct hsa_agent_dispatch_packet_s {
  /**
   * Packet header. Used to configure multiple packet parameters such as the
   * packet type. The parameters are described by ::hsa_packet_header_t.
   */
  uint16_t header;
  /**
   * Application-defined function to be performed by the destination agent.
   */
  uint16_t type;
  /**
   * Reserved. Must be 0.
   */
  uint32_t reserved0;

#ifdef HSA_LARGE_MODEL
  /* HSA_LARGE_MODEL: return_address is declared alone; this branch has no
   * reserved1 field. The field is documented in the HSA_LITTLE_ENDIAN branch
   * below. */
#ifdef DEVICE_COMPILER
  __constant
#endif
  void *return_address;
#elif defined HSA_LITTLE_ENDIAN
  /**
   * Address where to store the function return values, if any.
   */
#ifdef DEVICE_COMPILER
  __constant
#endif
  void *return_address;
  /**
   * Reserved. Must be 0.
   */
  uint32_t reserved1;
#else
  /* Neither HSA_LARGE_MODEL nor HSA_LITTLE_ENDIAN is defined: reserved1 is
   * declared before return_address, the reverse of the branch above. */
  uint32_t reserved1;
#ifdef DEVICE_COMPILER
  __constant
#endif
  void *return_address;
#endif
  /**
   * Function arguments.
   */
  uint64_t arg[4];
  /**
   * Reserved. Must be 0.
   */
  uint64_t reserved2;
  /**
   * Signal used to indicate completion of the job. The application can use the
   * special signal handle 0 to indicate that no signal is used.
   */
  hsa_signal_t completion_signal;
} hsa_agent_dispatch_packet_t;

/**
 * @brief Barrier-AND packet.
 */
typedef struct hsa_barrier_and_packet_s {
  /**
   * Packet header. Used to configure multiple packet parameters such as the
   * packet type. The parameters are described by ::hsa_packet_header_t.
   */
  uint16_t header;
  /**
   * Reserved. Must be 0.
   */
  uint16_t reserved0;
  /**
   * Reserved. Must be 0.
   */
  uint32_t reserved1;
  /**
   * Array of dependent signal objects. Signals with a handle value of 0 are
   * allowed and are interpreted by the packet processor as satisfied
   * dependencies.
   */
  hsa_signal_t dep_signal[5];
  /**
   * Reserved. Must be 0.
   */
  uint64_t reserved2;
  /**
   * Signal used to indicate completion of the job. The application can use the
   * special signal handle 0 to indicate that no signal is used.
   */
  hsa_signal_t completion_signal;
} hsa_barrier_and_packet_t;

/**
 * @brief Barrier-OR packet.
*/
typedef struct hsa_barrier_or_packet_s {
  /**
   * Packet header. Used to configure multiple packet parameters such as the
   * packet type. The parameters are described by ::hsa_packet_header_t.
   */
  uint16_t header;
  /**
   * Reserved. Must be 0.
   */
  uint16_t reserved0;
  /**
   * Reserved. Must be 0.
   */
  uint32_t reserved1;
  /**
   * Array of dependent signal objects. Signals with a handle value of 0 are
   * allowed and are interpreted by the packet processor as dependencies not
   * satisfied.
   */
  hsa_signal_t dep_signal[5];
  /**
   * Reserved. Must be 0.
   */
  uint64_t reserved2;
  /**
   * Signal used to indicate completion of the job. The application can use the
   * special signal handle 0 to indicate that no signal is used.
   */
  hsa_signal_t completion_signal;
} hsa_barrier_or_packet_t;

/** @} */

/** \addtogroup memory Memory
 *  @{
 */

/**
 * @brief Memory segments associated with a region.
 */
typedef enum {
  /**
   * Global segment. Used to hold data that is shared by all agents.
   */
  HSA_REGION_SEGMENT_GLOBAL = 0,
  /**
   * Read-only segment. Used to hold data that remains constant during the
   * execution of a kernel.
   */
  HSA_REGION_SEGMENT_READONLY = 1,
  /**
   * Private segment. Used to hold data that is local to a single work-item.
   */
  HSA_REGION_SEGMENT_PRIVATE = 2,
  /**
   * Group segment. Used to hold data that is shared by the work-items of a
   * work-group.
   */
  HSA_REGION_SEGMENT_GROUP = 3
} hsa_region_segment_t;

/**
 * @brief Global region flags.
 */
typedef enum {
  /**
   * The application can use memory in the region to store kernel arguments, and
   * provide the values for the kernarg segment of a kernel dispatch. If this
   * flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set.
   */
  HSA_REGION_GLOBAL_FLAG_KERNARG = 1,
  /**
   * Updates to memory in this region are immediately visible to all the
   * agents under the terms of the HSA memory model. If this
   * flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set.
   */
  HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2,
  /**
   * Updates to memory in this region can be performed by a single agent at
   * a time. If a different agent in the system is allowed to access the
   * region, the application must explicitly invoke ::hsa_memory_assign_agent
   * in order to transfer ownership to that agent for a particular buffer.
   */
  HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4
} hsa_region_global_flag_t;

/**
 * @brief Attributes of a memory region.
 */
typedef enum {
  /**
   * Segment where memory in the region can be used. The type of this
   * attribute is ::hsa_region_segment_t.
   */
  HSA_REGION_INFO_SEGMENT = 0,
  /**
   * Flag mask. The value of this attribute is undefined if the value of
   * ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of
   * this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t
   * values.
   */
  HSA_REGION_INFO_GLOBAL_FLAGS = 1,
  /**
   * Size of this region, in bytes. The type of this attribute is size_t.
   */
  HSA_REGION_INFO_SIZE = 2,
  /* Note: the value 3 is not assigned to any attribute in this enumeration. */
  /**
   * Maximum allocation size in this region, in bytes. Must not exceed the value
   * of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t.
   *
   * If the region is in the global or readonly segments, this is the maximum
   * size that the application can pass to ::hsa_memory_allocate. If the region
   * is in the group segment, this is the maximum size (per work-group) that can
   * be requested for a given kernel dispatch. If the region is in the private
   * segment, this is the maximum size (per work-item) that can be requested for
   * a specific kernel dispatch.
   */
  HSA_REGION_INFO_ALLOC_MAX_SIZE = 4,
  /**
   * Indicates whether memory in this region can be allocated using
   * ::hsa_memory_allocate. The type of this attribute is bool.
   *
   * The value of this flag is always false for regions in the group and private
   * segments.
   */
  HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5,
  /**
   * Allocation granularity of buffers allocated by ::hsa_memory_allocate in
   * this region. The size of a buffer allocated in this region is a multiple of
   * the value of this attribute. The value of this attribute is only defined if
   * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type
   * of this attribute is size_t.
   */
  HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6,
  /**
   * Alignment of buffers allocated by ::hsa_memory_allocate in this region. The
   * value of this attribute is only defined if
   * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must
   * be a power of 2. The type of this attribute is size_t.
   */
  HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7
} hsa_region_info_t;

/**
 * @brief Get the current value of an attribute of a region.
 *
 * @param[in] region A valid region.
 *
 * @param[in] attribute Attribute to query.
 *
 * @param[out] value Pointer to an application-allocated buffer where to store
 * the value of the attribute. If the buffer passed by the application is not
 * large enough to hold the value of @p attribute, the behavior is undefined.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
 * region attribute, or @p value is NULL.
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_region_get_info(hsa_region_t region,
                                         hsa_region_info_t attribute,
                                         void *value);
#endif

/**
 * @brief Iterate over the memory regions associated with a given agent, and
 * invoke an application-defined callback on every iteration.
 *
 * @param[in] agent A valid agent.
 *
 * @param[in] callback Callback to be invoked once per region that is
 * accessible from the agent. The HSA runtime passes two arguments to the
 * callback, the region and the application data.
If @p callback returns a
 * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the
 * traversal stops and ::hsa_agent_iterate_regions returns that status value.
 *
 * @param[in] data Application data that is passed to @p callback on every
 * iteration. May be NULL.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_agent_iterate_regions(
    hsa_agent_t agent,
    hsa_status_t (*callback)(hsa_region_t region, void *data),
    void *data);
#endif

/**
 * @brief Allocate a block of memory in a given region.
 *
 * @param[in] region Region from which to allocate memory. The region must have
 * the ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED flag set.
 *
 * @param[in] size Allocation size, in bytes. Must not be zero. This value is
 * rounded up to the nearest multiple of ::HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE
 * in @p region.
 *
 * @param[out] ptr Pointer to the location where to store the base address of
 * the allocated block. The returned base address is aligned to the value of
 * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT in @p region. If the allocation
 * fails, the returned value is undefined.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES No memory is available.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to
 * allocate memory in @p region, or @p size is greater than the value of
 * ::HSA_REGION_INFO_ALLOC_MAX_SIZE in @p region.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0.
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_memory_allocate(hsa_region_t region, size_t size,
                                         void **ptr);
#endif

/**
 * @brief Deallocate a block of memory previously allocated using
 * ::hsa_memory_allocate.
 *
 * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value
 * previously returned by ::hsa_memory_allocate, the behavior is undefined.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_memory_free(void *ptr);
#endif

/**
 * @brief Copy a block of memory.
 *
 * @param[out] dst Buffer where the content is to be copied.
 *
 * @param[in] src A valid pointer to the source of data to be copied.
 *
 * @param[in] size Number of bytes to copy. If @p size is 0, no copy is
 * performed and the function returns success. Copying a number of bytes larger
 * than the size of the buffers pointed to by @p dst or @p src results in
 * undefined behavior.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination
 * pointers are NULL.
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_memory_copy(void *dst, const void *src, size_t size);
#endif

/**
 * @brief Change the ownership of a global, coarse-grained buffer.
 *
 * @details The contents of a coarse-grained buffer are visible to an agent
 * only after ownership has been explicitly transferred to that agent. Once the
 * operation completes, the previous owner can no longer access the data in the
 * buffer.
 *
 * An implementation of the HSA runtime is allowed, but not required, to change
 * the physical location of the buffer when ownership is transferred to a
 * different agent. In general the application must not assume this
 * behavior. The virtual location (address) of the passed buffer is never
 * modified.
 *
 * @param[in] ptr Base address of a global buffer. The pointer should match an
 * address previously returned by ::hsa_memory_allocate. The size of the buffer
 * affected by the ownership change is identical to the size of that previous
 * allocation. If @p ptr points to a fine-grained global buffer, no operation is
 * performed and the function returns success. If @p ptr does not point to
 * global memory, the behavior is undefined.
 *
 * @param[in] agent Agent that becomes the owner of the buffer. The
 * application is responsible for ensuring that @p agent has access to the
 * region that contains the buffer. It is allowed to change ownership to an
 * agent that is already the owner of the buffer, with the same or different
 * access permissions.
 *
 * @param[in] access Access permissions requested for the new owner.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
 *
 * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is unable to
 * acquire the resources required by the operation.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p access is
 * not a valid access value.
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_memory_assign_agent(void *ptr, hsa_agent_t agent,
                                             hsa_access_permission_t access);
#endif

/**
 *
 * @brief Register a global, fine-grained buffer.
 *
 * @details Registering a buffer serves as an indication to the HSA runtime that
 * the memory might be accessed from a kernel agent other than the
 * host. Registration is a performance hint that allows the HSA runtime
 * implementation to know which buffers will be accessed by some of the kernel
 * agents ahead of time.
*
 * Registration is only recommended for buffers in the global segment that have
 * not been allocated using the HSA allocator (::hsa_memory_allocate), but an OS
 * allocator instead.
 *
 * Registrations should not overlap.
 *
 * @param[in] ptr A buffer in global memory. If a NULL pointer is passed, no
 * operation is performed.
 *
 * @param[in] size Requested registration size in bytes. A size of 0 is
 * only allowed if @p ptr is NULL.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in
 * allocating the necessary resources.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 but @p ptr
 * is not NULL.
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_memory_register(void *ptr, size_t size);
#endif

/**
 *
 * @brief Deregister memory previously registered using ::hsa_memory_register.
 *
 * @details If the memory interval being deregistered does not match a previous
 * registration (start and end addresses), the behavior is undefined.
 *
 * @param[in] ptr A pointer to the base of the buffer to be deregistered. If
 * a NULL pointer is passed, no operation is performed.
 *
 * @param[in] size Size of the buffer to be deregistered.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_memory_deregister(void *ptr, size_t size);
#endif

/** @} */

/** \defgroup symbol-attributes Symbol Attributes
 *  @{
 */

/**
 * @brief Symbol type.
 */
typedef enum {
  /**
   * Variable.
   */
  HSA_SYMBOL_KIND_VARIABLE = 0,
  /**
   * Kernel.
   */
  HSA_SYMBOL_KIND_KERNEL = 1,
  /**
   * Indirect function.
   */
  HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2
} hsa_symbol_kind_t;

/**
 * @brief Allocation type of a variable.
 */
typedef enum {
  /**
   * Agent allocation.
   */
  HSA_VARIABLE_ALLOCATION_AGENT = 0,
  /**
   * Program allocation.
   */
  HSA_VARIABLE_ALLOCATION_PROGRAM = 1
} hsa_variable_allocation_t;

/**
 * @brief Linkage type of a symbol.
 */
typedef enum {
  /**
   * Module linkage.
   */
  HSA_SYMBOL_LINKAGE_MODULE = 0,
  /**
   * Program linkage.
   */
  HSA_SYMBOL_LINKAGE_PROGRAM = 1
} hsa_symbol_linkage_t;

/**
 * @brief Memory segment associated with a variable.
 */
typedef enum {
  /**
   * Global memory segment.
   */
  HSA_VARIABLE_SEGMENT_GLOBAL = 0,
  /**
   * Readonly memory segment.
   */
  HSA_VARIABLE_SEGMENT_READONLY = 1
} hsa_variable_segment_t;

/** @} */

/** \defgroup code-object Code Object
 *  @{
 */

/**
 * @brief Instruction set architecture.
 */
typedef struct hsa_isa_s {
  /**
   * Opaque handle.
   */
  uint64_t handle;
} hsa_isa_t;

/**
 * @brief Retrieve a reference to an ISA handle out of a symbolic name.
 *
 * @param[in] name Vendor-specific name associated with a particular instruction
 * set architecture. Must be a NUL-terminated string.
 *
 * @param[out] isa Memory location where the HSA runtime stores the ISA handle
 * corresponding to the given name. Must not be NULL.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p name is NULL, or @p isa is
 * NULL.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ISA_NAME The given name does not
 * correspond to any instruction set architecture.
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_isa_from_name(
    const char* name,
    hsa_isa_t* isa);
#endif

/**
 * @brief Instruction set architecture attributes.
 */
typedef enum {
  /**
   * The length of the ISA name. The type of this attribute is uint32_t.
   */
  HSA_ISA_INFO_NAME_LENGTH = 0,
  /**
   * Human-readable description. The type of this attribute is a character
   * array with the length equal to the value of the ::HSA_ISA_INFO_NAME_LENGTH
   * attribute.
   */
  HSA_ISA_INFO_NAME = 1,
  /**
   * Number of call conventions supported by the instruction set architecture.
   * The type of this attribute is uint32_t.
   */
  HSA_ISA_INFO_CALL_CONVENTION_COUNT = 2,
  /**
   * Number of work-items in a wavefront for a given call convention. Must be a
   * power of 2 in the range [1,256]. The type of this attribute is uint32_t.
   */
  HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONT_SIZE = 3,
  /**
   * Number of wavefronts per compute unit for a given call convention. In
   * practice, other factors (for example, the amount of group memory used by a
   * work-group) may further limit the number of wavefronts per compute
   * unit. The type of this attribute is uint32_t.
   */
  HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONTS_PER_COMPUTE_UNIT = 4
} hsa_isa_info_t;

/**
 * @brief Get the current value of an attribute for a given instruction set
 * architecture (ISA).
 *
 * @param[in] isa A valid instruction set architecture.
 *
 * @param[in] attribute Attribute to query.
 *
 * @param[in] index Call convention index. Used only for call convention
 * attributes, otherwise ignored. Must have a value between 0 (inclusive) and
 * the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT (not
 * inclusive) in @p isa.
 *
 * @param[out] value Pointer to an application-allocated buffer where to store
 * the value of the attribute. If the buffer passed by the application is not
 * large enough to hold the value of @p attribute, the behavior is undefined.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
 * invalid.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_INDEX @p index out of range.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
 * instruction set architecture attribute, or @p value is NULL.
*/
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_isa_get_info(
    hsa_isa_t isa,
    hsa_isa_info_t attribute,
    uint32_t index,
    void* value);
#endif

/**
 * @brief Check if the instruction set architecture of a code object can be
 * executed on an agent associated with another architecture.
 *
 * @param[in] code_object_isa Instruction set architecture associated with a
 * code object.
 *
 * @param[in] agent_isa Instruction set architecture associated with an agent.
 *
 * @param[out] result Pointer to a memory location where the HSA runtime stores
 * the result of the check. If the two architectures are compatible, the result
 * is true; if they are incompatible, the result is false.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p code_object_isa or @p agent_isa is
 * invalid.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_isa_compatible(
    hsa_isa_t code_object_isa,
    hsa_isa_t agent_isa,
    bool* result);
#endif

/**
 * @brief An opaque handle to a code object, which contains ISA for finalized
 * kernels and indirect functions together with information about the
 * global/readonly segment variables they reference.
 */
typedef struct hsa_code_object_s {
  /**
   * Opaque handle.
   */
  uint64_t handle;
} hsa_code_object_t;

/**
 * @brief Opaque handle to application data that is passed to the serialization
 * and deserialization functions.
 */
typedef struct hsa_callback_data_s {
  /**
   * Opaque handle.
   */
  uint64_t handle;
} hsa_callback_data_t;

/**
 * @brief Serialize a code object. Can be used for offline finalization,
 * install-time finalization, disk code caching, etc.
 *
 * @param[in] code_object Code object.
 *
 * @param[in] alloc_callback Callback function for memory allocation. Must not
 * be NULL.
The HSA runtime passes three arguments to the callback: the * allocation size, the application data, and a pointer to a memory location * where the application stores the allocation result. The HSA runtime invokes * @p alloc_callback once to allocate a buffer that contains the serialized * version of @p code_object. If the callback returns a status code other than * ::HSA_STATUS_SUCCESS, this function returns the same code. * * @param[in] callback_data Application data that is passed to @p * alloc_callback. May be NULL. * * @param[in] options Vendor-specific options. May be NULL. * * @param[out] serialized_code_object Memory location where the HSA runtime * stores a pointer to the serialized code object. Must not be NULL. * * @param[out] serialized_code_object_size Memory location where the HSA runtime * stores the size (in bytes) of @p serialized_code_object. The returned value * matches the allocation size passed by the HSA runtime to @p * alloc_callback. Must not be NULL. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate * resources required for the operation. * * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p alloc_callback, @p * serialized_code_object, or @p serialized_code_object_size is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_code_object_serialize( hsa_code_object_t code_object, hsa_status_t (*alloc_callback)(size_t size, hsa_callback_data_t data, void **address), hsa_callback_data_t callback_data, const char *options, void **serialized_code_object, size_t *serialized_code_object_size); #endif /** * @brief Deserialize a code object. * * @param[in] serialized_code_object A serialized code object. Must not be NULL.
* * @param[in] serialized_code_object_size The size (in bytes) of @p * serialized_code_object. Must not be 0. * * @param[in] options Vendor-specific options. May be NULL. * * @param[out] code_object Memory location where the HSA runtime stores the * deserialized code object. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate * resources required for the operation. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p serialized_code_object or @p * code_object is NULL, or @p serialized_code_object_size is 0. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_code_object_deserialize( void *serialized_code_object, size_t serialized_code_object_size, const char *options, hsa_code_object_t *code_object); #endif /** * @brief Destroy a code object. * * @details The lifetime of a code object must exceed that of any executable * where it has been loaded. If an executable that loaded @p code_object has not * been destroyed, the behavior is undefined. * * @param[in] code_object Code object. The handle becomes invalid after it has * been destroyed. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_code_object_destroy( hsa_code_object_t code_object); #endif /** * @brief Code object type. */ typedef enum { /** * Produces a code object that contains ISA for all kernels and indirect * functions in HSA source. */ HSA_CODE_OBJECT_TYPE_PROGRAM = 0 } hsa_code_object_type_t; /** * @brief Code object attributes. */ typedef enum { /** * The version of the code object. The type of this attribute is a * NUL-terminated char[64].
If the version of the code object uses fewer than * 63 characters, the rest of the array must be filled with NULs. */ HSA_CODE_OBJECT_INFO_VERSION = 0, /** * Type of code object. The type of this attribute is * ::hsa_code_object_type_t. */ HSA_CODE_OBJECT_INFO_TYPE = 1, /** * Instruction set architecture this code object is produced for. The type of * this attribute is ::hsa_isa_t. */ HSA_CODE_OBJECT_INFO_ISA = 2, /** * Machine model this code object is produced for. The type of this attribute * is ::hsa_machine_model_t. */ HSA_CODE_OBJECT_INFO_MACHINE_MODEL = 3, /** * Profile this code object is produced for. The type of this attribute is * ::hsa_profile_t. */ HSA_CODE_OBJECT_INFO_PROFILE = 4, /** * Default floating-point rounding mode used when the code object is * produced. The type of this attribute is * ::hsa_default_float_rounding_mode_t. */ HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5 } hsa_code_object_info_t; /** * @brief Get the current value of an attribute for a given code object. * * @param[in] code_object Code object. * * @param[in] attribute Attribute to query. * * @param[out] value Pointer to an application-allocated buffer where to store * the value of the attribute. If the buffer passed by the application is not * large enough to hold the value of @p attribute, the behavior is undefined. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid * code object attribute, or @p value is NULL. * * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_code_object_get_info( hsa_code_object_t code_object, hsa_code_object_info_t attribute, void *value); #endif /** * @brief Code object symbol. */ typedef struct hsa_code_symbol_s { /** * Opaque handle.
*/ uint64_t handle; } hsa_code_symbol_t; /** * @brief Get the symbol handle within a code object for a given symbol name. * * @param[in] code_object Code object. * * @param[in] symbol_name Symbol name. * * @param[out] symbol Memory location where the HSA runtime stores the symbol * handle. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name * that matches @p symbol_name. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or * @p symbol is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_code_object_get_symbol( hsa_code_object_t code_object, const char *symbol_name, hsa_code_symbol_t *symbol); #endif /** * @brief Code object symbol attributes. */ typedef enum { /** * The type of the symbol. The type of this attribute is ::hsa_symbol_kind_t. */ HSA_CODE_SYMBOL_INFO_TYPE = 0, /** * The length of the symbol name. The type of this attribute is uint32_t. */ HSA_CODE_SYMBOL_INFO_NAME_LENGTH = 1, /** * The name of the symbol. The type of this attribute is a character array with * the length equal to the value of the ::HSA_CODE_SYMBOL_INFO_NAME_LENGTH * attribute. */ HSA_CODE_SYMBOL_INFO_NAME = 2, /** * The length of the module name to which this symbol belongs if this symbol * has module linkage, otherwise 0 is returned. The type of this attribute is * uint32_t. */ HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, /** * The module name to which this symbol belongs if this symbol has module * linkage, otherwise an empty string is returned. The type of this attribute is * a character array with the length equal to the value of the * ::HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. */ HSA_CODE_SYMBOL_INFO_MODULE_NAME = 4, /** * The linkage kind of the symbol.
The type of this attribute is * ::hsa_symbol_linkage_t. */ HSA_CODE_SYMBOL_INFO_LINKAGE = 5, /** * Indicates whether the symbol corresponds to a definition. The type of this * attribute is bool. * * Note that the numeric value of this enumerator (17) does not follow the * declaration order of the enumeration. */ HSA_CODE_SYMBOL_INFO_IS_DEFINITION = 17, /** * The allocation kind of the variable. The value of this attribute is * undefined if the symbol is not a variable. The type of this attribute is * ::hsa_variable_allocation_t. */ HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, /** * The segment kind of the variable. The value of this attribute is * undefined if the symbol is not a variable. The type of this attribute is * ::hsa_variable_segment_t. */ HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, /** * Alignment of the variable. The value of this attribute is undefined if the * symbol is not a variable. The type of this attribute is uint32_t. */ HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, /** * Size of the variable. The value of this attribute is undefined if the * symbol is not a variable. The type of this attribute is uint32_t. * * A size of 0 is returned if the variable is an external variable and has an * unknown dimension. */ HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE = 9, /** * Indicates whether the variable is constant. The value of this attribute is * undefined if the symbol is not a variable. The type of this attribute is * bool. */ HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, /** * Size of kernarg segment memory that is required to hold the values of the * kernel arguments, in bytes. The value of this attribute is undefined if the * symbol is not a kernel. The type of this attribute is uint32_t. */ HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, /** * Alignment (in bytes) of the buffer used to pass arguments to the kernel, * which is the maximum of 16 and the maximum alignment of any of the kernel * arguments. The value of this attribute is undefined if the symbol is not a * kernel. The type of this attribute is uint32_t.
*/ HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, /** * Size of static group segment memory required by the kernel (per * work-group), in bytes. The value of this attribute is undefined * if the symbol is not a kernel. The type of this attribute is uint32_t. * * The reported amount does not include any dynamically allocated group * segment memory that may be requested by the application when a kernel is * dispatched. */ HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, /** * Size of static private, spill, and arg segment memory required by * this kernel (per work-item), in bytes. The value of this attribute is * undefined if the symbol is not a kernel. The type of this attribute is * uint32_t. * * If the value of ::HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is true, * the kernel may use more private memory than the reported value, and the * application must add the dynamic call stack usage to @a * private_segment_size when populating a kernel dispatch packet. */ HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, /** * Dynamic callstack flag. The value of this attribute is undefined if the * symbol is not a kernel. The type of this attribute is bool. * * If this flag is set (the value is true), the kernel uses a dynamically * sized call stack. This can happen if recursive calls, calls to indirect * functions, or the HSAIL alloca instruction are present in the kernel. */ HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, /** * Call convention of the indirect function. The value of this attribute is * undefined if the symbol is not an indirect function. The type of this * attribute is uint32_t. */ HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 } hsa_code_symbol_info_t; /** * @brief Get the current value of an attribute for a given code symbol. * * @param[in] code_symbol Code symbol. * * @param[in] attribute Attribute to query.
* * @param[out] value Pointer to an application-allocated buffer where to store * the value of the attribute. If the buffer passed by the application is not * large enough to hold the value of @p attribute, the behavior is undefined. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid * code symbol attribute, or @p value is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_code_symbol_get_info( hsa_code_symbol_t code_symbol, hsa_code_symbol_info_t attribute, void *value); #endif /** * @brief Iterate over the symbols in a code object, and invoke an * application-defined callback on every iteration. * * @param[in] code_object Code object. * * @param[in] callback Callback to be invoked once per code object symbol. The * HSA runtime passes three arguments to the callback: the code object, a * symbol, and the application data. If @p callback returns a status other than * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and * ::hsa_code_object_iterate_symbols returns that status value. * * @param[in] data Application data that is passed to @p callback on every * iteration. May be NULL. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
*/ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_code_object_iterate_symbols( hsa_code_object_t code_object, hsa_status_t (*callback)(hsa_code_object_t code_object, hsa_code_symbol_t symbol, void* data), void* data); #endif /** @} */ /** \defgroup executable Executable * @{ */ /** * @brief An opaque handle to an executable, which contains ISA for finalized * kernels and indirect functions together with the allocated global/readonly * segment variables they reference. */ typedef struct hsa_executable_s { /** * Opaque handle. */ uint64_t handle; } hsa_executable_t; /** * @brief Executable state. */ typedef enum { /** * Executable state, which allows the user to load code objects and define * external variables. Variable addresses, kernel code handles, and * indirect function code handles are not available in query operations until * the executable is frozen (zero is always returned). */ HSA_EXECUTABLE_STATE_UNFROZEN = 0, /** * Executable state, which allows the user to query variable addresses, * kernel code handles, and indirect function code handles using query * operations. Loading new code objects, as well as defining external variables, * is not allowed in this state. */ HSA_EXECUTABLE_STATE_FROZEN = 1 } hsa_executable_state_t; /** * @brief Create an empty executable. * * @param[in] profile Profile used in the executable. * * @param[in] executable_state Executable state. If the state is * ::HSA_EXECUTABLE_STATE_FROZEN, the resulting executable is useless because no * code objects can be loaded, and no variables can be defined. * * @param[in] options Vendor-specific options. May be NULL. * * @param[out] executable Memory location where the HSA runtime stores the newly * created executable handle. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized.
* * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate * resources required for the operation. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or * @p executable is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_create( hsa_profile_t profile, hsa_executable_state_t executable_state, const char *options, hsa_executable_t *executable); #endif /** * @brief Destroy an executable. * * @details The executable handle becomes invalid after the executable has been * destroyed. Code object handles that were loaded into this executable are * still valid after the executable has been destroyed, and can be used as * intended. Resources allocated outside and associated with this executable * (such as external global/readonly variables) can be released after the * executable has been destroyed. * * An executable should not be destroyed while kernels are in flight. * * @param[in] executable Executable. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_destroy( hsa_executable_t executable); #endif /** * @brief Load a code object into the executable. * * @details Every global/readonly variable that is external must be defined * using the define set of operations before loading code objects. An internal * global/readonly variable is allocated once the code object that is being * loaded references this variable and this variable is not already allocated. * * Any module linkage declaration must have been defined either by a define * variable or by loading a code object that has a symbol with module linkage * definition. * * @param[in] executable Executable. * * @param[in] agent Agent to load the code object for.
The agent must support the * default floating-point rounding mode used by @p code_object. * * @param[in] code_object Code object to load. The lifetime of the code object * must exceed that of the executable: if @p code_object is destroyed before @p * executable, the behavior is undefined. * * @param[in] options Vendor-specific options. May be NULL. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate * resources required for the operation. * * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. * * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p agent is not compatible * with @p code_object (for example, @p agent does not support the default * floating-point rounding mode specified by @p code_object), or @p code_object * is not compatible with @p executable (for example, @p code_object and @p * executable have different machine models or profiles). * * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_load_code_object( hsa_executable_t executable, hsa_agent_t agent, hsa_code_object_t code_object, const char *options); #endif /** * @brief Freeze the executable. * * @details No modifications to the executable can be made after freezing: no * code objects can be loaded into the executable, and no external variables can * be defined. Freezing the executable does not prevent querying the executable's * attributes. * * @param[in] executable Executable. * * @param[in] options Vendor-specific options. May be NULL. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
* * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. * * @retval ::HSA_STATUS_ERROR_VARIABLE_UNDEFINED One or more variables are * undefined in the executable. * * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is already frozen. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_freeze( hsa_executable_t executable, const char *options); #endif /** * @brief Executable attributes. Enumerator values start at 1; no attribute * uses the value 0. */ typedef enum { /** * Profile this executable is created for. The type of this attribute is * ::hsa_profile_t. */ HSA_EXECUTABLE_INFO_PROFILE = 1, /** * Executable state. The type of this attribute is ::hsa_executable_state_t. */ HSA_EXECUTABLE_INFO_STATE = 2 } hsa_executable_info_t; /** * @brief Get the current value of an attribute for a given executable. * * @param[in] executable Executable. * * @param[in] attribute Attribute to query. * * @param[out] value Pointer to an application-allocated buffer where to store * the value of the attribute. If the buffer passed by the application is not * large enough to hold the value of @p attribute, the behavior is undefined. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid * executable attribute, or @p value is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_get_info( hsa_executable_t executable, hsa_executable_info_t attribute, void *value); #endif /** * @brief Define an external global variable with program allocation. * * @details This function allows the application to provide the definition * of a variable in the global segment memory with program allocation.
The * variable must be defined before loading a code object into an executable. * In addition, code objects loaded must not define the variable. * * @param[in] executable Executable. * * @param[in] variable_name Name of the variable. * * @param[in] address Address where the variable is defined. The buffer pointed * to by @p address is owned by the application, and cannot be deallocated before * @p executable is destroyed. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate * resources required for the operation. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. * * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. * * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is * already defined. * * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the * name @p variable_name. * * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_global_variable_define( hsa_executable_t executable, const char *variable_name, void *address); #endif /** * @brief Define an external global variable with agent allocation. * * @details This function allows the application to provide the definition * of a variable in the global segment memory with agent allocation. The * variable must be defined before loading a code object into an executable. * In addition, code objects loaded must not define the variable. * * @param[in] executable Executable. * * @param[in] agent Agent for which the variable is being defined. * * @param[in] variable_name Name of the variable. * * @param[in] address Address where the variable is defined. The buffer pointed * to by @p address is owned by the application, and cannot be deallocated before * @p executable is destroyed.
* * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate * resources required for the operation. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. * * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. * * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is * already defined. * * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the * name @p variable_name. * * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_agent_global_variable_define( hsa_executable_t executable, hsa_agent_t agent, const char *variable_name, void *address); #endif /** * @brief Define an external readonly variable. * * @details This function allows the application to provide the definition * of a variable in the readonly segment memory. The variable must be defined * before loading a code object into an executable. In addition, code objects * loaded must not define the variable. * * @param[in] executable Executable. * * @param[in] agent Agent for which the variable is being defined. * * @param[in] variable_name Name of the variable. * * @param[in] address Address where the variable is defined. The buffer pointed * to by @p address is owned by the application, and cannot be deallocated before * @p executable is destroyed. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate * resources required for the operation. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
* * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. * * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is * already defined. * * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the * name @p variable_name. * * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_readonly_variable_define( hsa_executable_t executable, hsa_agent_t agent, const char *variable_name, void *address); #endif /** * @brief Validate an executable. Checks that all code objects have matching * machine model, profile, and default floating-point rounding mode. Checks that * all declarations have definitions. Checks declaration-definition * compatibility (see the HSA Programming Reference Manual for compatibility rules). * * @param[in] executable Executable. * * @param[out] result Memory location where the HSA runtime stores the * validation result. If the executable is valid, the result is 0. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_validate( hsa_executable_t executable, uint32_t* result); #endif /** * @brief Executable symbol. */ typedef struct hsa_executable_symbol_s { /** * Opaque handle. */ uint64_t handle; } hsa_executable_symbol_t; /** * @brief Get the symbol handle for a given symbol name. * * @param[in] executable Executable. * * @param[in] module_name Module name. Must be NULL if the symbol has * program linkage. * * @param[in] symbol_name Symbol name. * * @param[in] agent Agent associated with the symbol.
If the symbol is * independent of any agent (for example, a variable with program * allocation), this argument is ignored. * * @param[in] call_convention Call convention associated with the symbol. If the * symbol does not correspond to an indirect function, this argument is ignored. * * @param[out] symbol Memory location where the HSA runtime stores the symbol * handle. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. * * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name * that matches @p symbol_name. * * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or * @p symbol is NULL. */ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_get_symbol( hsa_executable_t executable, const char *module_name, const char *symbol_name, hsa_agent_t agent, int32_t call_convention, hsa_executable_symbol_t *symbol); #endif /** * @brief Executable symbol attributes. The numeric values of the enumerators do * not follow their declaration order. */ typedef enum { /** * The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t. */ HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0, /** * The length of the symbol name. The type of this attribute is uint32_t. */ HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1, /** * The name of the symbol. The type of this attribute is a character array with * the length equal to the value of the ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH * attribute. */ HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2, /** * The length of the module name to which this symbol belongs if this symbol * has module linkage, otherwise 0 is returned. The type of this attribute is * uint32_t. */ HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, /** * The module name to which this symbol belongs if this symbol has module * linkage, otherwise an empty string is returned.
The type of this attribute is * a character array with the length equal to the value of the * ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. */ HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4, /** * Agent associated with this symbol. If the symbol is a variable, the * value of this attribute is only defined if * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is * ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is ::hsa_agent_t. */ HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20, /** * The address of the variable. The value of this attribute is undefined if * the symbol is not a variable. The type of this attribute is uint64_t. * * If the executable's state is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 is * returned. */ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21, /** * The linkage kind of the symbol. The type of this attribute is * ::hsa_symbol_linkage_t. */ HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5, /** * Indicates whether the symbol corresponds to a definition. The type of this * attribute is bool. */ HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17, /** * The allocation kind of the variable. The value of this attribute is * undefined if the symbol is not a variable. The type of this attribute is * ::hsa_variable_allocation_t. */ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, /** * The segment kind of the variable. The value of this attribute is undefined * if the symbol is not a variable. The type of this attribute is * ::hsa_variable_segment_t. */ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, /** * Alignment of the variable. The value of this attribute is undefined if * the symbol is not a variable. The type of this attribute is uint32_t. */ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, /** * Size of the variable. The value of this attribute is undefined if * the symbol is not a variable. The type of this attribute is uint32_t. * * A value of 0 is returned if the variable is an external variable and has an * unknown dimension.
*/ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9, /** * Indicates whether the variable is constant. The value of this attribute is * undefined if the symbol is not a variable. The type of this attribute is * bool. */ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, /** * Kernel object handle, used in the kernel dispatch packet. The value of this * attribute is undefined if the symbol is not a kernel. The type of this * attribute is uint64_t. * * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 * is returned. */ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22, /** * Size of kernarg segment memory that is required to hold the values of the * kernel arguments, in bytes. The value of this attribute is undefined if the * symbol is not a kernel. The type of this attribute is uint32_t. */ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, /** * Alignment (in bytes) of the buffer used to pass arguments to the kernel, * which is the maximum of 16 and the maximum alignment of any of the kernel * arguments. The value of this attribute is undefined if the symbol is not a * kernel. The type of this attribute is uint32_t. */ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, /** * Size of static group segment memory required by the kernel (per * work-group), in bytes. The value of this attribute is undefined * if the symbol is not a kernel. The type of this attribute is uint32_t. * * The reported amount does not include any dynamically allocated group * segment memory that may be requested by the application when a kernel is * dispatched. */ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, /** * Size of static private, spill, and arg segment memory required by * this kernel (per work-item), in bytes. The value of this attribute is * undefined if the symbol is not a kernel. The type of this attribute is * uint32_t.
* * If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is * true, the kernel may use more private memory than the reported value, and * the application must add the dynamic call stack usage to @a * private_segment_size when populating a kernel dispatch packet. */ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, /** * Dynamic callstack flag. The value of this attribute is undefined if the * symbol is not a kernel. The type of this attribute is bool. * * If this flag is set (the value is true), the kernel uses a dynamically * sized call stack. This can happen if recursive calls, calls to indirect * functions, or the HSAIL alloca instruction are present in the kernel. */ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, /** * Indirect function object handle. The value of this attribute is undefined * if the symbol is not an indirect function, or the associated agent does * not support the Full Profile. The type of this attribute depends on the * machine model: if the machine model is small, then the type is uint32_t; if * the machine model is large, then the type is uint64_t. * * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 * is returned. */ HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23, /** * Call convention of the indirect function. The value of this attribute is * undefined if the symbol is not an indirect function, or the associated * agent does not support the Full Profile. The type of this attribute is * uint32_t. */ HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 } hsa_executable_symbol_info_t; /** * @brief Get the current value of an attribute for a given executable symbol. * * @param[in] executable_symbol Executable symbol. * * @param[in] attribute Attribute to query. * * @param[out] value Pointer to an application-allocated buffer where to store * the value of the attribute.
If the buffer passed by the application is not
 * large enough to hold the value of @p attribute, the behavior is undefined.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
 * executable symbol attribute, or @p value is NULL.
 */
#ifndef DEVICE_COMPILER
hsa_status_t HSA_API hsa_executable_symbol_get_info(
    hsa_executable_symbol_t executable_symbol,
    hsa_executable_symbol_info_t attribute,
    void *value);
#endif

/**
 * @brief Iterate over the symbols in an executable, and invoke an
 * application-defined callback on every iteration.
 *
 * @param[in] executable Executable.
 *
 * @param[in] callback Callback to be invoked once per executable symbol. The
 * HSA runtime passes three arguments to the callback: the executable, a symbol,
 * and the application data. If @p callback returns a status other than
 * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
 * ::hsa_executable_iterate_symbols returns that status value.
 *
 * @param[in] data Application data that is passed to @p callback on every
 * iteration. May be NULL.
 *
 * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
 *
 * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
 * initialized.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
 *
 * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
*/ #ifndef DEVICE_COMPILER hsa_status_t HSA_API hsa_executable_iterate_symbols( hsa_executable_t executable, hsa_status_t (*callback)(hsa_executable_t executable, hsa_executable_symbol_t symbol, void* data), void* data); #endif /** @} */ #ifdef __cplusplus } // end extern "C" block #endif #endif // header guard ROCm-Device-Libs-rocm-5.0.0/ockl/inc/ockl.h000066400000000000000000000715651415221260100201500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #ifndef OCKL_H #define OCKL_H // This C header declares the functions provided by the OCKL library // Aspects of this library's behavior can be controlled via the // oclc library. See the oclc header for further information #define _MANGLE3x(P,N,S) P##_##N##S #define MANGLE3x(P,N,S) _MANGLE3x(P,N,S) #define _MANGLE3(P,N,S) P##_##N##_##S #define MANGLE3(P,N,S) _MANGLE3(P,N,S) #define OCKL_MANGLE_T(N,T) MANGLE3(__ockl, N, T) #define OCKL_MANGLE_Tx(N,T) MANGLE3x(__ockl, N, T) #define OCKL_MANGLE_I32(N) OCKL_MANGLE_T(N, i32) #define OCKL_MANGLE_U32(N) OCKL_MANGLE_T(N, u32) #define OCKL_MANGLE_F32(N) OCKL_MANGLE_T(N, f32) #define OCKL_MANGLE_F16(N) OCKL_MANGLE_T(N, f16) #define OCKL_MANGLE_I64(N) OCKL_MANGLE_T(N, i64) #define OCKL_MANGLE_U64(N) OCKL_MANGLE_T(N, u64) #define DECL_OCKL_NULLARY_U32(N) extern uint OCKL_MANGLE_U32(N)(void); #define _DECL_X_OCKL_NULLARY_U32(A,N) extern __attribute__((A)) uint OCKL_MANGLE_U32(N)(void); #define DECL_PURE_OCKL_NULLARY_U32(N) _DECL_X_OCKL_NULLARY_U32(pure, N) #define DECL_CONST_OCKL_NULLARY_U32(N) _DECL_X_OCKL_NULLARY_U32(const, N) #define DECL_OCKL_NULLARY_U64(N) extern ulong OCKL_MANGLE_U64(N)(void); #define _DECL_X_OCKL_NULLARY_U64(A,N) extern __attribute__((A)) ulong OCKL_MANGLE_U64(N)(void); #define 
DECL_PURE_OCKL_NULLARY_U64(N) _DECL_X_OCKL_NULLARY_U64(pure, N) #define DECL_CONST_OCKL_NULLARY_U64(N) _DECL_X_OCKL_NULLARY_U64(const, N) #define DECL_OCKL_UNARY_I32(N) extern int OCKL_MANGLE_I32(N)(int); #define _DECL_X_OCKL_UNARY_I32(A,N) extern __attribute__((A)) int OCKL_MANGLE_I32(N)(int); #define DECL_PURE_OCKL_UNARY_I32(N) _DECL_X_OCKL_UNARY_I32(pure, N) #define DECL_CONST_OCKL_UNARY_I32(N) _DECL_X_OCKL_UNARY_I32(const, N) #define DECL_OCKL_UNARY_I64(N) extern long OCKL_MANGLE_I64(N)(long); #define _DECL_X_OCKL_UNARY_I64(A,N) extern __attribute__((A)) long OCKL_MANGLE_I64(N)(long); #define DECL_PURE_OCKL_UNARY_I64(N) _DECL_X_OCKL_UNARY_I64(pure, N) #define DECL_CONST_OCKL_UNARY_I64(N) _DECL_X_OCKL_UNARY_I64(const, N) #define DECL_OCKL_UNARY_U32(N) extern uint OCKL_MANGLE_U32(N)(uint); #define _DECL_X_OCKL_UNARY_U32(A,N) extern __attribute__((A)) uint OCKL_MANGLE_U32(N)(uint); #define DECL_PURE_OCKL_UNARY_U32(N) _DECL_X_OCKL_UNARY_U32(pure, N) #define DECL_CONST_OCKL_UNARY_U32(N) _DECL_X_OCKL_UNARY_U32(const, N) #define DECL_OCKL_UNARY_U64(N) extern ulong OCKL_MANGLE_U64(N)(ulong); #define _DECL_X_OCKL_UNARY_U64(A,N) extern __attribute__((A)) ulong OCKL_MANGLE_U64(N)(ulong); #define DECL_PURE_OCKL_UNARY_U64(N) _DECL_X_OCKL_UNARY_U64(pure, N) #define DECL_CONST_OCKL_UNARY_U64(N) _DECL_X_OCKL_UNARY_U64(const, N) #define DECL_OCKL_BINARY_I32(N) extern int OCKL_MANGLE_I32(N)(int,int); #define _DECL_X_OCKL_BINARY_I32(A,N) extern __attribute__((A)) int OCKL_MANGLE_I32(N)(int,int); #define DECL_PURE_OCKL_BINARY_I32(N) _DECL_X_OCKL_BINARY_I32(pure, N) #define DECL_CONST_OCKL_BINARY_I32(N) _DECL_X_OCKL_BINARY_I32(const, N) #define DECL_OCKL_BINARY_I64(N) extern long OCKL_MANGLE_I64(N)(long,long); #define _DECL_X_OCKL_BINARY_I64(A,N) extern __attribute__((A)) long OCKL_MANGLE_I64(N)(long,long); #define DECL_PURE_OCKL_BINARY_I64(N) _DECL_X_OCKL_BINARY_I64(pure, N) #define DECL_CONST_OCKL_BINARY_I64(N) _DECL_X_OCKL_BINARY_I64(const, N) #define DECL_OCKL_BINARY_U32(N) 
extern uint OCKL_MANGLE_U32(N)(uint,uint); #define _DECL_X_OCKL_BINARY_U32(A,N) extern __attribute__((A)) uint OCKL_MANGLE_U32(N)(uint,uint); #define DECL_PURE_OCKL_BINARY_U32(N) _DECL_X_OCKL_BINARY_U32(pure, N) #define DECL_CONST_OCKL_BINARY_U32(N) _DECL_X_OCKL_BINARY_U32(const, N) #define DECL_OCKL_BINARY_U64(N) extern ulong OCKL_MANGLE_U64(N)(ulong,ulong); #define _DECL_X_OCKL_BINARY_U64(A,N) extern __attribute__((A)) ulong OCKL_MANGLE_U64(N)(ulong,ulong); #define DECL_PURE_OCKL_BINARY_U64(N) _DECL_X_OCKL_BINARY_U64(pure, N) #define DECL_CONST_OCKL_BINARY_U64(N) _DECL_X_OCKL_BINARY_U64(const, N) #define DECL_OCKL_TERNARY_I32(N) extern int OCKL_MANGLE_I32(N)(int,int,int); #define _DECL_X_OCKL_TERNARY_I32(A,N) extern __attribute__((A)) int OCKL_MANGLE_I32(N)(int,int,int); #define DECL_PURE_OCKL_TERNARY_I32(N) _DECL_X_OCKL_TERNARY_I32(pure, N) #define DECL_CONST_OCKL_TERNARY_I32(N) _DECL_X_OCKL_TERNARY_I32(const, N) #define DECL_OCKL_TERNARY_F32(N) extern float OCKL_MANGLE_F32(N)(float,float,float); #define _DECL_X_OCKL_TERNARY_F32(A,N) extern __attribute__((A)) float OCKL_MANGLE_F32(N)(float,float,float); #define DECL_PURE_OCKL_TERNARY_F32(N) _DECL_X_OCKL_TERNARY_F32(pure, N) #define DECL_CONST_OCKL_TERNARY_F32(N) _DECL_X_OCKL_TERNARY_F32(const, N) #define DECL_OCKL_TERNARY_F16(N) extern half OCKL_MANGLE_F16(N)(half,half,half); #define _DECL_X_OCKL_TERNARY_F16(A,N) extern __attribute__((A)) half OCKL_MANGLE_F16(N)(half,half,half); #define DECL_PURE_OCKL_TERNARY_F16(N) _DECL_X_OCKL_TERNARY_F16(pure, N) #define DECL_CONST_OCKL_TERNARY_F16(N) _DECL_X_OCKL_TERNARY_F16(const, N) #define DECL_OCKL_TERNARY_I64(N) extern long OCKL_MANGLE_I64(N)(long,long,long); #define _DECL_X_OCKL_TERNARY_I64(A,N) extern __attribute__((A)) long OCKL_MANGLE_I64(N)(long,long,long); #define DECL_PURE_OCKL_TERNARY_I64(N) _DECL_X_OCKL_TERNARY_I64(pure, N) #define DECL_CONST_OCKL_TERNARY_I64(N) _DECL_X_OCKL_TERNARY_I64(const, N) #define DECL_OCKL_TERNARY_U32(N) extern uint 
OCKL_MANGLE_U32(N)(uint,uint,uint); #define _DECL_X_OCKL_TERNARY_U32(A,N) extern __attribute__((A)) uint OCKL_MANGLE_U32(N)(uint,uint,uint); #define DECL_PURE_OCKL_TERNARY_U32(N) _DECL_X_OCKL_TERNARY_U32(pure, N) #define DECL_CONST_OCKL_TERNARY_U32(N) _DECL_X_OCKL_TERNARY_U32(const, N) #define DECL_OCKL_TERNARY_U64(N) extern ulong OCKL_MANGLE_U64(N)(ulong,ulong,ulong); #define _DECL_X_OCKL_TERNARY_U64(A,N) extern __attribute__((A)) ulong OCKL_MANGLE_U64(N)(ulong,ulong,ulong); #define DECL_PURE_OCKL_TERNARY_U64(N) _DECL_X_OCKL_TERNARY_U64(pure, N) #define DECL_CONST_OCKL_TERNARY_U64(N) _DECL_X_OCKL_TERNARY_U64(const, N) #pragma OPENCL EXTENSION cl_khr_fp16 : enable extern __attribute__((const)) uchar OCKL_MANGLE_T(clz,u8)(uchar); extern __attribute__((const)) ushort OCKL_MANGLE_T(clz,u16)(ushort); DECL_CONST_OCKL_UNARY_U32(clz) DECL_CONST_OCKL_UNARY_U64(clz) extern __attribute__((const)) uchar OCKL_MANGLE_T(ctz,u8)(uchar); extern __attribute__((const)) ushort OCKL_MANGLE_T(ctz,u16)(ushort); DECL_CONST_OCKL_UNARY_U32(ctz) DECL_CONST_OCKL_UNARY_U64(ctz) DECL_CONST_OCKL_UNARY_U32(popcount) DECL_CONST_OCKL_UNARY_U64(popcount) DECL_CONST_OCKL_BINARY_I32(add_sat) DECL_CONST_OCKL_BINARY_U32(add_sat) DECL_CONST_OCKL_BINARY_I64(add_sat) DECL_CONST_OCKL_BINARY_U64(add_sat) DECL_CONST_OCKL_BINARY_I32(sub_sat) DECL_CONST_OCKL_BINARY_U32(sub_sat) DECL_CONST_OCKL_BINARY_I64(sub_sat) DECL_CONST_OCKL_BINARY_U64(sub_sat) DECL_CONST_OCKL_BINARY_I32(mul_hi) DECL_CONST_OCKL_BINARY_U32(mul_hi) DECL_CONST_OCKL_BINARY_I64(mul_hi) DECL_CONST_OCKL_BINARY_U64(mul_hi) DECL_CONST_OCKL_BINARY_I32(mul24) DECL_CONST_OCKL_BINARY_U32(mul24) DECL_OCKL_NULLARY_U32(lane) DECL_OCKL_NULLARY_U32(activelane) DECL_OCKL_NULLARY_U64(memtime) DECL_OCKL_NULLARY_U64(memrealtime) extern half OCKL_MANGLE_T(wfred_add,f16)(half x); extern float OCKL_MANGLE_T(wfred_add,f32)(float x); extern double OCKL_MANGLE_T(wfred_add,f64)(double x); extern int OCKL_MANGLE_T(wfred_add,i32)(int x); extern long 
OCKL_MANGLE_T(wfred_add,i64)(long x); extern uint OCKL_MANGLE_T(wfred_add,u32)(uint x); extern ulong OCKL_MANGLE_T(wfred_add,u64)(ulong x); extern int OCKL_MANGLE_T(wfred_and,i32)(int x); extern long OCKL_MANGLE_T(wfred_and,i64)(long x); extern uint OCKL_MANGLE_T(wfred_and,u32)(uint x); extern ulong OCKL_MANGLE_T(wfred_and,u64)(ulong x); extern half OCKL_MANGLE_T(wfred_max,f16)(half x); extern float OCKL_MANGLE_T(wfred_max,f32)(float x); extern double OCKL_MANGLE_T(wfred_max,f64)(double x); extern int OCKL_MANGLE_T(wfred_max,i32)(int x); extern long OCKL_MANGLE_T(wfred_max,i64)(long x); extern uint OCKL_MANGLE_T(wfred_max,u32)(uint x); extern ulong OCKL_MANGLE_T(wfred_max,u64)(ulong x); extern half OCKL_MANGLE_T(wfred_min,f16)(half x); extern float OCKL_MANGLE_T(wfred_min,f32)(float x); extern double OCKL_MANGLE_T(wfred_min,f64)(double x); extern int OCKL_MANGLE_T(wfred_min,i32)(int x); extern long OCKL_MANGLE_T(wfred_min,i64)(long x); extern uint OCKL_MANGLE_T(wfred_min,u32)(uint x); extern ulong OCKL_MANGLE_T(wfred_min,u64)(ulong x); extern int OCKL_MANGLE_T(wfred_or,i32)(int x); extern long OCKL_MANGLE_T(wfred_or,i64)(long x); extern uint OCKL_MANGLE_T(wfred_or,u32)(uint x); extern ulong OCKL_MANGLE_T(wfred_or,u64)(ulong x); extern int OCKL_MANGLE_T(wfred_xor,i32)(int x); extern long OCKL_MANGLE_T(wfred_xor,i64)(long x); extern uint OCKL_MANGLE_T(wfred_xor,u32)(uint x); extern ulong OCKL_MANGLE_T(wfred_xor,u64)(ulong x); extern half OCKL_MANGLE_T(wfscan_add,f16)(half x, bool inclusive); extern float OCKL_MANGLE_T(wfscan_add,f32)(float x, bool inclusive); extern double OCKL_MANGLE_T(wfscan_add,f64)(double x, bool inclusive); extern int OCKL_MANGLE_T(wfscan_add,i32)(int x, bool inclusive); extern long OCKL_MANGLE_T(wfscan_add,i64)(long x, bool inclusive); extern uint OCKL_MANGLE_T(wfscan_add,u32)(uint x, bool inclusive); extern ulong OCKL_MANGLE_T(wfscan_add,u64)(ulong x, bool inclusive); extern int OCKL_MANGLE_T(wfscan_and,i32)(int x, bool inclusive); extern long 
OCKL_MANGLE_T(wfscan_and,i64)(long x, bool inclusive); extern uint OCKL_MANGLE_T(wfscan_and,u32)(uint x, bool inclusive); extern ulong OCKL_MANGLE_T(wfscan_and,u64)(ulong x, bool inclusive); extern half OCKL_MANGLE_T(wfscan_max,f16)(half x, bool inclusive); extern float OCKL_MANGLE_T(wfscan_max,f32)(float x, bool inclusive); extern double OCKL_MANGLE_T(wfscan_max,f64)(double x, bool inclusive); extern int OCKL_MANGLE_T(wfscan_max,i32)(int x, bool inclusive); extern long OCKL_MANGLE_T(wfscan_max,i64)(long x, bool inclusive); extern uint OCKL_MANGLE_T(wfscan_max,u32)(uint x, bool inclusive); extern ulong OCKL_MANGLE_T(wfscan_max,u64)(ulong x, bool inclusive); extern half OCKL_MANGLE_T(wfscan_min,f16)(half x, bool inclusive); extern float OCKL_MANGLE_T(wfscan_min,f32)(float x, bool inclusive); extern double OCKL_MANGLE_T(wfscan_min,f64)(double x, bool inclusive); extern int OCKL_MANGLE_T(wfscan_min,i32)(int x, bool inclusive); extern long OCKL_MANGLE_T(wfscan_min,i64)(long x, bool inclusive); extern uint OCKL_MANGLE_T(wfscan_min,u32)(uint x, bool inclusive); extern ulong OCKL_MANGLE_T(wfscan_min,u64)(ulong x, bool inclusive); extern int OCKL_MANGLE_T(wfscan_or,i32)(int x, bool inclusive); extern long OCKL_MANGLE_T(wfscan_or,i64)(long x, bool inclusive); extern uint OCKL_MANGLE_T(wfscan_or,u32)(uint x, bool inclusive); extern ulong OCKL_MANGLE_T(wfscan_or,u64)(ulong x, bool inclusive); extern int OCKL_MANGLE_T(wfscan_xor,i32)(int x, bool inclusive); extern long OCKL_MANGLE_T(wfscan_xor,i64)(long x, bool inclusive); extern uint OCKL_MANGLE_T(wfscan_xor,u32)(uint x, bool inclusive); extern ulong OCKL_MANGLE_T(wfscan_xor,u64)(ulong x, bool inclusive); extern uint OCKL_MANGLE_U32(wfbcast)(uint x, uint i); extern ulong OCKL_MANGLE_U64(wfbcast)(ulong x, uint i); extern bool OCKL_MANGLE_I32(wfany)(int e); extern bool OCKL_MANGLE_I32(wfall)(int e); extern bool OCKL_MANGLE_I32(wfsame)(int e); DECL_CONST_OCKL_BINARY_U32(bfm) extern __attribute__((const)) int 
OCKL_MANGLE_I32(bfe)(int, uint, uint); DECL_CONST_OCKL_TERNARY_U32(bfe) DECL_CONST_OCKL_TERNARY_U32(bitalign) DECL_CONST_OCKL_TERNARY_U32(bytealign) DECL_CONST_OCKL_TERNARY_U32(lerp) DECL_CONST_OCKL_TERNARY_F32(max3) DECL_CONST_OCKL_TERNARY_F32(median3) DECL_CONST_OCKL_TERNARY_F32(min3) DECL_CONST_OCKL_TERNARY_F16(max3) DECL_CONST_OCKL_TERNARY_F16(median3) DECL_CONST_OCKL_TERNARY_F16(min3) DECL_CONST_OCKL_TERNARY_I32(max3) DECL_CONST_OCKL_TERNARY_I32(median3) DECL_CONST_OCKL_TERNARY_I32(min3) DECL_CONST_OCKL_TERNARY_U32(max3) DECL_CONST_OCKL_TERNARY_U32(median3) DECL_CONST_OCKL_TERNARY_U32(min3) extern __attribute__((const)) ulong OCKL_MANGLE_U64(mqsad)(ulong, uint, ulong); extern __attribute__((const)) uint OCKL_MANGLE_U32(pack)(float4); extern __attribute__((const)) ulong OCKL_MANGLE_U64(qsad)(ulong, uint, ulong); DECL_CONST_OCKL_TERNARY_U32(msad) DECL_CONST_OCKL_TERNARY_U32(sad) DECL_CONST_OCKL_TERNARY_U32(sadd) DECL_CONST_OCKL_TERNARY_U32(sadhi) DECL_CONST_OCKL_TERNARY_U32(sadw) extern __attribute__((const)) float OCKL_MANGLE_F32(unpack0)(uint); extern __attribute__((const)) float OCKL_MANGLE_F32(unpack1)(uint); extern __attribute__((const)) float OCKL_MANGLE_F32(unpack2)(uint); extern __attribute__((const)) float OCKL_MANGLE_F32(unpack3)(uint); #define SSHARP __constant uint * #define TSHARP __constant uint * extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load,1D)(TSHARP i, int c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load,1Da)(TSHARP i, int2 c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load,1Db)(TSHARP i, int c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load,2D)(TSHARP i, int2 c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load,2Da)(TSHARP i, int4 c); extern __attribute__((pure)) float OCKL_MANGLE_T(image_load,2Dad)(TSHARP i, int4 c); extern __attribute__((pure)) float OCKL_MANGLE_T(image_load,2Dd)(TSHARP i, int2 c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load,3D)(TSHARP 
i, int4 c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load,CM)(TSHARP i, int2 c, int f); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load,CMa)(TSHARP i, int4 c, int f); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load_mip,1D)(TSHARP i, int c, int l); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load_mip,1Da)(TSHARP i, int2 c, int l); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load_mip,2D)(TSHARP i, int2 c, int l); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load_mip,2Da)(TSHARP i, int4 c, int l); extern __attribute__((pure)) float OCKL_MANGLE_T(image_load_mip,2Dad)(TSHARP i, int4 c, int l); extern __attribute__((pure)) float OCKL_MANGLE_T(image_load_mip,2Dd)(TSHARP i, int2 c, int l); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load_mip,3D)(TSHARP i, int4 c, int l); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load_mip,CM)(TSHARP i, int2 c, int f, int l); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_load_mip,CMa)(TSHARP i, int4 c, int f, int l); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh,1D)(TSHARP i, int c); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh,1Da)(TSHARP i, int2 c); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh,1Db)(TSHARP i, int c); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh,2D)(TSHARP i, int2 c); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh,2Da)(TSHARP i, int4 c); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh,3D)(TSHARP i, int4 c); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh,CM)(TSHARP i, int2 c, int f); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh,CMa)(TSHARP i, int4 c, int f); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh_mip,1D)(TSHARP i, int c, int l); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh_mip,1Da)(TSHARP i, int2 c, int l); extern __attribute__((pure)) half4 
OCKL_MANGLE_T(image_loadh_mip,2D)(TSHARP i, int2 c, int l); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh_mip,2Da)(TSHARP i, int4 c, int l); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh_mip,3D)(TSHARP i, int4 c, int l); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh_mip,CM)(TSHARP i, int2 c, int f, int l); extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_loadh_mip,CMa)(TSHARP i, int4 c, int f, int l); extern void OCKL_MANGLE_T(image_store,1D)(TSHARP i, int c, float4 p); extern void OCKL_MANGLE_T(image_store,1Da)(TSHARP i, int2 c, float4 p); extern void OCKL_MANGLE_T(image_store,1Db)(TSHARP i, int c, float4 p); extern void OCKL_MANGLE_T(image_store,2D)(TSHARP i, int2 c, float4 p); extern void OCKL_MANGLE_T(image_store,2Da)(TSHARP i, int4 c, float4 p); extern void OCKL_MANGLE_T(image_store,2Dad)(TSHARP i, int4 c, float p); extern void OCKL_MANGLE_T(image_store,2Dd)(TSHARP i, int2 c, float p); extern void OCKL_MANGLE_T(image_store,3D)(TSHARP i, int4 c, float4 p); extern void OCKL_MANGLE_T(image_store,CM)(TSHARP i, int2 c, int f, float4 p); extern void OCKL_MANGLE_T(image_store,CMa)(TSHARP i, int4 c, int f, float4 p); extern void OCKL_MANGLE_T(image_store_lod,1D)(TSHARP i, int c, int l, float4 p); extern void OCKL_MANGLE_T(image_store_lod,1Da)(TSHARP i, int2 c, int l, float4 p); extern void OCKL_MANGLE_T(image_store_lod,2D)(TSHARP i, int2 c, int l, float4 p); extern void OCKL_MANGLE_T(image_store_lod,2Da)(TSHARP i, int4 c, int l, float4 p); extern void OCKL_MANGLE_T(image_store_lod,2Dad)(TSHARP i, int4 c, int l, float p); extern void OCKL_MANGLE_T(image_store_lod,2Dd)(TSHARP i, int2 c, int l, float p); extern void OCKL_MANGLE_T(image_store_lod,3D)(TSHARP i, int4 c, int l, float4 p); extern void OCKL_MANGLE_T(image_store_lod,CM)(TSHARP i, int2 c, int f, int l, float4 p); extern void OCKL_MANGLE_T(image_store_lod,CMa)(TSHARP i, int4 c, int f, int l, float4 p); extern void OCKL_MANGLE_T(image_storeh,1D)(TSHARP i, int c, 
half4 p); extern void OCKL_MANGLE_T(image_storeh,1Da)(TSHARP i, int2 c, half4 p); extern void OCKL_MANGLE_T(image_storeh,1Db)(TSHARP i, int c, half4 p); extern void OCKL_MANGLE_T(image_storeh,2D)(TSHARP i, int2 c, half4 p); extern void OCKL_MANGLE_T(image_storeh,2Da)(TSHARP i, int4 c, half4 p); extern void OCKL_MANGLE_T(image_storeh,3D)(TSHARP i, int4 c, half4 p); extern void OCKL_MANGLE_T(image_storeh,CM)(TSHARP i, int2 c, int f, half4 p); extern void OCKL_MANGLE_T(image_storeh,CMa)(TSHARP i, int4 c, int f, half4 p); extern void OCKL_MANGLE_T(image_storeh_lod,1D)(TSHARP i, int c, int l, half4 p); extern void OCKL_MANGLE_T(image_storeh_lod,1Da)(TSHARP i, int2 c, int l, half4 p); extern void OCKL_MANGLE_T(image_storeh_lod,2D)(TSHARP i, int2 c, int l, half4 p); extern void OCKL_MANGLE_T(image_storeh_lod,2Da)(TSHARP i, int4 c, int l, half4 p); extern void OCKL_MANGLE_T(image_storeh_lod,3D)(TSHARP i, int4 c, int l, half4 p); extern void OCKL_MANGLE_T(image_storeh_lod,CM)(TSHARP i, int2 c, int f, int l, half4 p); extern void OCKL_MANGLE_T(image_storeh_lod,CMa)(TSHARP i, int4 c, int f, int l, half4 p); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample,1D)(TSHARP i, SSHARP s, float c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample,1Da)(TSHARP i, SSHARP s, float2 c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample,2D)(TSHARP i, SSHARP s, float2 c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample,2Da)(TSHARP i, SSHARP s, float4 c); extern __attribute__((pure)) float OCKL_MANGLE_T(image_sample,2Dad)(TSHARP i, SSHARP s, float4 c); extern __attribute__((pure)) float OCKL_MANGLE_T(image_sample,2Dd)(TSHARP i, SSHARP s, float2 c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample,3D)(TSHARP i, SSHARP s, float4 c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample,CM)(TSHARP i, SSHARP s, float4 c); extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample,CMa)(TSHARP i, SSHARP s, float4 c); 
// Sampler-based image reads and image queries.
//
// Every sampling/gather function below takes a TSHARP `i` and an SSHARP `s`
// (presumably the image and sampler descriptors -- confirm against the
// implementations) followed by floating-point coordinates `c`, and is
// declared `pure`.
//
// NOTE(review): the token after the comma in OCKL_MANGLE_T (1D, 1Da, 1Db, 2D,
// 2Da, 2Dad, 2Dd, 3D, CM, CMa) becomes the suffix of the mangled name; it
// presumably selects the image kind (array / buffer / depth / cube-map
// variants) -- confirm before relying on this.

// image_sample_grad: sampling with additional `dx` / `dy` arguments
// (presumably explicit coordinate derivatives -- confirm). Returns float4,
// except the 2Dad and 2Dd variants, which return a single float.
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_grad,1D)(TSHARP i, SSHARP s, float c, float dx, float dy);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_grad,1Da)(TSHARP i, SSHARP s, float2 c, float dx, float dy);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_grad,2D)(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_grad,2Da)(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);
extern __attribute__((pure)) float OCKL_MANGLE_T(image_sample_grad,2Dad)(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);
extern __attribute__((pure)) float OCKL_MANGLE_T(image_sample_grad,2Dd)(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_grad,3D)(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy);

// image_sample_lod: sampling with an additional `float l` argument
// (presumably an explicit level of detail -- confirm). Returns float4, except
// the 2Dad and 2Dd variants, which return a single float.
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_lod,1D)(TSHARP i, SSHARP s, float c, float l);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_lod,1Da)(TSHARP i, SSHARP s, float2 c, float l);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_lod,2D)(TSHARP i, SSHARP s, float2 c, float l);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_lod,2Da)(TSHARP i, SSHARP s, float4 c, float l);
extern __attribute__((pure)) float OCKL_MANGLE_T(image_sample_lod,2Dad)(TSHARP i, SSHARP s, float4 c, float l);
extern __attribute__((pure)) float OCKL_MANGLE_T(image_sample_lod,2Dd)(TSHARP i, SSHARP s, float2 c, float l);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_lod,3D)(TSHARP i, SSHARP s, float4 c, float l);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_lod,CM)(TSHARP i, SSHARP s, float4 c, float l);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_sample_lod,CMa)(TSHARP i, SSHARP s, float4 c, float l);

// image_sampleh: same argument shapes as the float sampling functions, but
// returning half4. No 2Dad / 2Dd variants are declared for the half forms.
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh,1D)(TSHARP i, SSHARP s, float c);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh,1Da)(TSHARP i, SSHARP s, float2 c);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh,2D)(TSHARP i, SSHARP s, float2 c);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh,2Da)(TSHARP i, SSHARP s, float4 c);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh,3D)(TSHARP i, SSHARP s, float4 c);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh,CM)(TSHARP i, SSHARP s, float4 c);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh,CMa)(TSHARP i, SSHARP s, float4 c);

// image_sampleh_grad: half4-returning counterpart of image_sample_grad.
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_grad,1D)(TSHARP i, SSHARP s, float c, float dx, float dy);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_grad,1Da)(TSHARP i, SSHARP s, float2 c, float dx, float dy);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_grad,2D)(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_grad,2Da)(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_grad,3D)(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy);

// image_sampleh_lod: half4-returning counterpart of image_sample_lod.
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_lod,1D)(TSHARP i, SSHARP s, float c, float l);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_lod,1Da)(TSHARP i, SSHARP s, float2 c, float l);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_lod,2D)(TSHARP i, SSHARP s, float2 c, float l);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_lod,2Da)(TSHARP i, SSHARP s, float4 c, float l);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_lod,3D)(TSHARP i, SSHARP s, float4 c, float l);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_lod,CM)(TSHARP i, SSHARP s, float4 c, float l);
extern __attribute__((pure)) half4 OCKL_MANGLE_T(image_sampleh_lod,CMa)(TSHARP i, SSHARP s, float4 c, float l);

// image_gather4{r,g,b,a}: declared for 2D only; each returns a float4.
// NOTE(review): presumably gathers the named channel from four neighbouring
// texels -- confirm against the implementation.
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_gather4r,2D)(TSHARP i, SSHARP s, float2 c);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_gather4g,2D)(TSHARP i, SSHARP s, float2 c);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_gather4b,2D)(TSHARP i, SSHARP s, float2 c);
extern __attribute__((pure)) float4 OCKL_MANGLE_T(image_gather4a,2D)(TSHARP i, SSHARP s, float2 c);

// Image queries. Each takes only a TSHARP and returns an int; all are declared
// `const`.
// NOTE(review): GCC/Clang document `const` as unsuitable for functions that
// read through pointer arguments; this presumably relies on the TSHARP data
// being immutable `__constant` memory -- confirm.

// image_array_size: declared only for the array-suffixed variants.
extern __attribute__((const)) int OCKL_MANGLE_T(image_array_size,1Da)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_array_size,2Da)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_array_size,2Dad)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_array_size,CMa)(TSHARP i);

// image_channel_data_type
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_data_type,1D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_data_type,1Da)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_data_type,1Db)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_data_type,2D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_data_type,2Da)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_data_type,2Dad)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_data_type,2Dd)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_data_type,3D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_data_type,CM)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_data_type,CMa)(TSHARP i);

// image_channel_order
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_order,1D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_order,1Da)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_order,1Db)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_order,2D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_order,2Da)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_order,2Dad)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_order,2Dd)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_order,3D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_order,CM)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_channel_order,CMa)(TSHARP i);

// image_depth: declared for 3D only.
extern __attribute__((const)) int OCKL_MANGLE_T(image_depth,3D)(TSHARP i);

// image_height: no 1D / 1Da / 1Db variants are declared.
extern __attribute__((const)) int OCKL_MANGLE_T(image_height,2D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_height,2Da)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_height,2Dad)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_height,2Dd)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_height,3D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_height,CM)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_height,CMa)(TSHARP i);

// image_num_mip_levels: no 1Db variant is declared.
extern __attribute__((const)) int OCKL_MANGLE_T(image_num_mip_levels,1D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_num_mip_levels,1Da)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_num_mip_levels,2D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_num_mip_levels,2Da)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_num_mip_levels,2Dad)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_num_mip_levels,2Dd)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_num_mip_levels,3D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_num_mip_levels,CM)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_num_mip_levels,CMa)(TSHARP i);

// image_width
extern __attribute__((const)) int OCKL_MANGLE_T(image_width,1D)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_width,1Da)(TSHARP i);
extern __attribute__((const)) int OCKL_MANGLE_T(image_width,1Db)(TSHARP i); extern __attribute__((const)) int OCKL_MANGLE_T(image_width,2D)(TSHARP i); extern __attribute__((const)) int OCKL_MANGLE_T(image_width,2Da)(TSHARP i); extern __attribute__((const)) int OCKL_MANGLE_T(image_width,2Dad)(TSHARP i); extern __attribute__((const)) int OCKL_MANGLE_T(image_width,2Dd)(TSHARP i); extern __attribute__((const)) int OCKL_MANGLE_T(image_width,3D)(TSHARP i); extern __attribute__((const)) int OCKL_MANGLE_T(image_width,CM)(TSHARP i); extern __attribute__((const)) int OCKL_MANGLE_T(image_width,CMa)(TSHARP i); extern __attribute__((const)) size_t __ockl_get_global_offset(uint); extern __attribute__((const)) size_t __ockl_get_global_id(uint); extern __attribute__((const)) size_t __ockl_get_local_id(uint); extern __attribute__((const)) size_t __ockl_get_group_id(uint); extern __attribute__((const)) size_t __ockl_get_global_size(uint); extern __attribute__((const)) size_t __ockl_get_local_size(uint); extern __attribute__((const)) size_t __ockl_get_num_groups(uint); extern __attribute__((const)) uint __ockl_get_work_dim(void); extern __attribute__((const)) size_t __ockl_get_enqueued_local_size(uint); extern __attribute__((const)) size_t __ockl_get_global_linear_id(void); extern __attribute__((const)) size_t __ockl_get_local_linear_id(void); extern __attribute__((const)) int __ockl_readuplane_i32(int, int); extern __attribute__((const)) long __ockl_readuplane_i64(long, int); extern __attribute__((const)) bool OCKL_MANGLE_T(is_local,addr)(const void *); extern __attribute__((const)) bool OCKL_MANGLE_T(is_private,addr)(const void *); extern __attribute__((const)) __global void * OCKL_MANGLE_T(to,global)(void *); extern __attribute__((const)) __local void * OCKL_MANGLE_T(to,local)(void *); extern __attribute__((const)) __private void * OCKL_MANGLE_T(to,private)(void *); extern void OCKL_MANGLE_T(rtcwait,u32)(uint); extern void __ockl_sanitizer_report(ulong, ulong, ulong, ulong, ulong, 
ulong, ulong, ulong); #pragma OPENCL EXTENSION cl_khr_fp16 : disable #endif // OCKL_H ROCm-Device-Libs-rocm-5.0.0/ockl/inc/ockl_hsa.h000066400000000000000000000044471415221260100207760ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #ifndef OCKL_HSA_H #define OCKL_HSA_H #include "ockl.h" #include "device_amd_hsa.h" typedef enum __ockl_memory_order_e { __ockl_memory_order_relaxed = __ATOMIC_RELAXED, __ockl_memory_order_acquire = __ATOMIC_ACQUIRE, __ockl_memory_order_release = __ATOMIC_RELEASE, __ockl_memory_order_acq_rel = __ATOMIC_ACQ_REL, __ockl_memory_order_seq_cst = __ATOMIC_SEQ_CST, } __ockl_memory_order; extern ulong OCKL_MANGLE_T(hsa_queue,load_read_index)(const __global hsa_queue_t *queue, __ockl_memory_order mem_order); extern ulong OCKL_MANGLE_T(hsa_queue,load_write_index)(const __global hsa_queue_t *queue, __ockl_memory_order mem_order); extern ulong OCKL_MANGLE_T(hsa_queue,add_write_index)(__global hsa_queue_t *queue, ulong value, __ockl_memory_order mem_order); extern ulong OCKL_MANGLE_T(hsa_queue,cas_write_index)(__global hsa_queue_t *queue, ulong expected, ulong value, __ockl_memory_order mem_order); extern void OCKL_MANGLE_T(hsa_queue,store_write_index)(__global hsa_queue_t *queue, ulong value, __ockl_memory_order mem_order); extern long OCKL_MANGLE_T(hsa_signal,load)(const hsa_signal_t sig, __ockl_memory_order mem_order); extern void OCKL_MANGLE_T(hsa_signal,add)(hsa_signal_t sig, long value, __ockl_memory_order mem_order); extern void OCKL_MANGLE_T(hsa_signal,and)(hsa_signal_t sig, long value, __ockl_memory_order mem_order); extern void OCKL_MANGLE_T(hsa_signal,or)(hsa_signal_t sig, long value, __ockl_memory_order mem_order); extern void 
OCKL_MANGLE_T(hsa_signal,xor)(hsa_signal_t sig, long value, __ockl_memory_order mem_order); extern long OCKL_MANGLE_T(hsa_signal,exchange)(hsa_signal_t sig, long value, __ockl_memory_order mem_order); extern void OCKL_MANGLE_T(hsa_signal,subtract)(hsa_signal_t sig, long value, __ockl_memory_order mem_order); extern long OCKL_MANGLE_T(hsa_signal,cas)(hsa_signal_t sig, long expected, long value, __ockl_memory_order mem_order); extern void OCKL_MANGLE_T(hsa_signal,store)(hsa_signal_t sig, long value, __ockl_memory_order mem_order); #endif // OCKL_HSA_H ROCm-Device-Libs-rocm-5.0.0/ockl/inc/wgscratch.h000066400000000000000000000006101415221260100211640ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ extern __attribute__((const)) __local ulong *__get_scratch_lds(void); ROCm-Device-Libs-rocm-5.0.0/ockl/src/000077500000000000000000000000001415221260100170475ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/ockl/src/activelane.cl000066400000000000000000000012601415221260100215010ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "oclc.h" #include "ockl.h" uint OCKL_MANGLE_U32(activelane)(void) { if (__oclc_wavefrontsize64) { return __builtin_amdgcn_mbcnt_hi(__builtin_amdgcn_read_exec_hi(), __builtin_amdgcn_mbcnt_lo(__builtin_amdgcn_read_exec_lo(), 0u)); } else { return __builtin_amdgcn_mbcnt_lo(__builtin_amdgcn_read_exec_lo(), 0u); } } ROCm-Device-Libs-rocm-5.0.0/ockl/src/add_sat.cl000066400000000000000000000017011415221260100207650ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "irif.h" #include "ockl.h" int OCKL_MANGLE_I32(add_sat)(int x, int y) { int s; bool c = __builtin_sadd_overflow(x, y, &s); return c ? (x < 0 ? INT_MIN : INT_MAX) : s; } uint OCKL_MANGLE_U32(add_sat)(uint x, uint y) { uint s; bool c = __builtin_uadd_overflow(x, y, &s); return c ? UINT_MAX : s; } long OCKL_MANGLE_I64(add_sat)(long x, long y) { long s; bool c = __builtin_saddl_overflow(x, y, &s); return c ? (x < 0 ? LONG_MIN : LONG_MAX) : s; } ulong OCKL_MANGLE_U64(add_sat)(ulong x, ulong y) { ulong s; bool c = __builtin_uaddl_overflow(x, y, &s); return c ? ULONG_MAX : s; } ROCm-Device-Libs-rocm-5.0.0/ockl/src/cg.cl000066400000000000000000000075171415221260100177720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "irif.h" #include "ockl.h" #define AL(P) __opencl_atomic_load((__global atomic_uint *)P, memory_order_relaxed, memory_scope_all_svm_devices) #define AA(P,V) __opencl_atomic_fetch_add((__global atomic_uint *)P, V, memory_order_relaxed, memory_scope_all_svm_devices) // XXX do not change these two structs without changing the language runtime struct mg_sync { uint w0; uint w1; }; struct mg_info { __global struct mg_sync *mgs; uint grid_id; uint num_grids; ulong prev_sum; ulong all_sum; }; static inline size_t get_mg_info_arg(void) { return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[6]; } static inline bool choose_one_workgroup_workitem(void) { return (__builtin_amdgcn_workitem_id_x() | __builtin_amdgcn_workitem_id_y() | __builtin_amdgcn_workitem_id_z()) == 0; } static inline bool choose_one_grid_workitem(void) { return (__builtin_amdgcn_workitem_id_x() | __builtin_amdgcn_workgroup_id_x() | __builtin_amdgcn_workitem_id_y() | __builtin_amdgcn_workgroup_id_y() | __builtin_amdgcn_workitem_id_z() | __builtin_amdgcn_workgroup_id_z()) == 0; } static inline void multi_grid_sync(__global struct mg_sync *s, uint members) { // Assumes 255 or fewer GPUs in multi_grid uint v = AA(&s->w0, 1U); if ((v & 0xff) == members-1) { AA(&s->w0, 0x100 - members); } else { v &= ~0xff; do { __builtin_amdgcn_s_sleep(2); } while ((AL(&s->w0) & ~0xff) == v); } } __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid) { __builtin_amdgcn_ds_gws_init(nwm1, rid); } __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid) { __builtin_amdgcn_ds_gws_barrier(nwm1, rid); } __attribute__((const)) int __ockl_grid_is_valid(void) { return get_mg_info_arg() != 0UL; } __attribute__((convergent)) void __ockl_grid_sync(void) { __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); if (choose_one_workgroup_workitem()) { uint nwm1 = (uint)__ockl_get_num_groups(0) * 
(uint)__ockl_get_num_groups(1) * (uint)__ockl_get_num_groups(2) - 1; __ockl_gws_barrier(nwm1, 0); } __builtin_amdgcn_s_barrier(); } __attribute__((const)) uint __ockl_multi_grid_num_grids(void) { return ((__constant struct mg_info *)get_mg_info_arg())->num_grids; } __attribute__((const)) uint __ockl_multi_grid_grid_rank(void) { return ((__constant struct mg_info *)get_mg_info_arg())->grid_id; } __attribute__((const)) uint __ockl_multi_grid_size(void) { return ((__constant struct mg_info *)get_mg_info_arg())->all_sum; } __attribute__((const)) uint __ockl_multi_grid_thread_rank(void) { size_t r = ((__constant struct mg_info *)get_mg_info_arg())->prev_sum; r += __ockl_get_global_linear_id(); return r; } __attribute__((const)) int __ockl_multi_grid_is_valid(void) { size_t mi = get_mg_info_arg(); return (mi != 0UL) & (mi != 1UL); } __attribute__((convergent)) void __ockl_multi_grid_sync(void) { __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); uint nwm1 = (uint)__ockl_get_num_groups(0) * (uint)__ockl_get_num_groups(1) * (uint)__ockl_get_num_groups(2) - 1; bool cwwi = choose_one_workgroup_workitem(); if (cwwi) __ockl_gws_barrier(nwm1, 0); __builtin_amdgcn_s_barrier(); if (choose_one_grid_workitem()) { __constant struct mg_info *m = (__constant struct mg_info *)get_mg_info_arg(); multi_grid_sync(m->mgs, m->num_grids); } if (cwwi) __ockl_gws_barrier(nwm1, 0); __builtin_amdgcn_s_barrier(); } ROCm-Device-Libs-rocm-5.0.0/ockl/src/clz.cl000066400000000000000000000014371415221260100201640ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "irif.h" #include "ockl.h" __attribute__((always_inline, const)) uchar OCKL_MANGLE_T(clz,u8)(uchar i) { return __llvm_ctlz_i8(i); } __attribute__((always_inline, const)) ushort OCKL_MANGLE_T(clz,u16)(ushort i) { return __llvm_ctlz_i16(i); } __attribute__((always_inline, const)) uint OCKL_MANGLE_U32(clz)(uint i) { return __llvm_ctlz_i32(i); } __attribute__((always_inline, const)) ulong OCKL_MANGLE_U64(clz)(ulong i) { return __llvm_ctlz_i64(i); } ROCm-Device-Libs-rocm-5.0.0/ockl/src/ctz.cl000066400000000000000000000014401415221260100201660ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "irif.h" #include "ockl.h" __attribute__((always_inline, const)) uchar OCKL_MANGLE_T(ctz,u8)(uchar i) { return __llvm_cttz_i8(i); } __attribute__((always_inline, const)) ushort OCKL_MANGLE_T(ctz,u16)(ushort i) { return __llvm_cttz_i16(i); } __attribute__((always_inline, const)) uint OCKL_MANGLE_U32(ctz)(uint i) { return __llvm_cttz_i32(i); } __attribute__((always_inline, const)) ulong OCKL_MANGLE_U64(ctz)(ulong i) { return __llvm_cttz_i64(i); } ROCm-Device-Libs-rocm-5.0.0/ockl/src/dm.cl000066400000000000000000000670101415221260100177730ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "oclc.h" #include "irif.h" #include "ockl.h" #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable extern ulong __ockl_devmem_request(ulong addr, ulong size); // XXX from llvm/include/llvm/IR/InstrTypes.h #define ICMP_NE 33 // Define this to track user requested non-slab (i.e. "large") in-use // allocations. This adds the definition of a query function nna() that // returns a snapshot of the current value. #define NON_SLAB_TRACKING 1 // The number of kinds of blocks. Do not change. #define NUM_KINDS 16 // The size where we switch the large & slow mechanism. Do not change. #define ALLOC_THRESHOLD 3072 // This controls the size of the heap, and also how often // we need to expand the capacity of the array that tracks // the allocations that have been made. // // With the definition below, 256, one level can hold 256 // slabs (512 MiB), and two levels can hold (256+1)*256 = 65792 // slabs (131585 MiB) #define SDATA_SHIFT 8 #define NUM_SDATA (1 << SDATA_SHIFT) #define SDATA_MASK (NUM_SDATA - 1) #define MAX_RECORDABLE_SLABS ((NUM_SDATA + 1) * NUM_SDATA) // Type of variable use to hold a kind typedef uint kind_t; // Type of variable used to hold a sdata index typedef uint sid_t; // Various info about a given kind of block struct kind_info_s { uint num_blocks; uint num_usable_blocks; uint skip_threshold; uint block_offset; uint first_unusable; uint gap_unusable; uint pattern_unusable; uint spread_factor; }; static const __constant struct kind_info_s kinfo[NUM_KINDS] = { { /* 0: 16 */ 130054, 129546, 110114, 16288, 6, 256, 0x00000000, 4195 }, { /* 1: 24 */ 86927, 86758, 73744, 10904, 399, 512, 0x00000000, 2804 }, { /* 2: 32 */ 65280, 64770, 55054, 8192, 0, 128, 0x00000000, 2107 }, { /* 3: 48 */ 43576, 43406, 36895, 5504, 56, 256, 0x00000000, 1405 }, { /* 4: 64 */ 32703, 32193, 27364, 4160, 63, 64, 0x00000000, 1054 }, 
{ /* 5: 96 */ 21816, 21646, 18399, 2816, 56, 128, 0x00000000, 703 }, { /* 6: 128 */ 16367, 15856, 13477, 2176, 15, 32, 0x00008000, 527 }, { /* 7: 192 */ 10915, 10745, 9133, 1472, 35, 64, 0x00000000, 352 }, { /* 8: 256 */ 8187, 7676, 6524, 1280, 11, 16, 0x08000800, 265 }, { /* 9: 384 */ 5459, 5289, 4495, 896, 19, 32, 0x00080000, 176 }, { /* 10: 512 */ 4094, 3583, 3045, 1024, 6, 8, 0x40404040, 133 }, { /* 11: 768 */ 2730, 2560, 2176, 512, 10, 16, 0x04000400, 89 }, { /* 12: 1024 */ 2047, 1536, 1305, 1024, 3, 4, 0x88888888, 66 }, { /* 13: 1536 */ 1365, 1195, 1015, 512, 5, 8, 0x20202020, 44 }, { /* 14: 2048 */ 1023, 512, 435, 2048, 1, 2, 0xaaaaaaaa, 34 }, { /* 15: 3072 */ 682, 512, 435, 2048, 2, 4, 0x44444444, 35 }, }; // A slab is a chunk of memory used to provide "block"s whose addresses are // returned by malloc. The slab tracks which blocks are in use using a bit // array "bits". The blocks themselves start at offset "block_offset". typedef struct slab_s { kind_t k; // The kind of the blocks sid_t i; // The index of the slab in the heap atomic_uint start; // Used to guide the search for unused blocks uint pad; atomic_uint in_use[2*1024*1024 / 4 - 4]; // An array of per-block bits, followed by the blocks } slab_t; // The minimum number of ticks each slab allocation must be separated by #define SLAB_TICKS 20000 // This struct captures a little more information about a given slab // such as its address and its number of used blocks. 
There is another // member used to increase the number of slabs that can be recorded in // the heap typedef struct sdata_s { atomic_ulong array; // Address of an array of sdata_t atomic_ulong saddr; // Slab address is really a __global slab_t * atomic_uint num_used_blocks; } sdata_t; // The number of ulong that cover an sdata_t #define ULONG_PER_SDATA 3 // The length of a CAS loop sleep #define CAS_SLEEP 2 // This is used to communicate that a result is // not currently available due to a limit on how // fast we are allowed to create new slabs #define SDATA_BUSY (__global sdata_t *)1 // Possible results when trying to increase the number of recordable slabs #define GROW_SUCCESS 0 #define GROW_BUSY 1 #define GROW_FAILURE 2 // The minimum number of ticks each grow must be separated by #define GROW_TICKS 30000 // The number of ulong per cache line used to separate atomics #define ULONG_PER_CACHE_LINE 4 #define ATOMIC_PAD (ULONG_PER_CACHE_LINE-1) // Type used to hold a search start index typedef struct start_s { atomic_uint value; #if ATOMIC_PAD > 0 ulong pad[ATOMIC_PAD]; #endif } start_t; // Type used to hold the number of allocated slabs typedef struct nallocated_s { atomic_uint value; #if ATOMIC_PAD > 0 ulong pad[ATOMIC_PAD]; #endif } nallocated_t; // Type used to hold the number of recordable slabs typedef struct nrecordable_s { atomic_uint value; #if ATOMIC_PAD > 0 ulong pad[ATOMIC_PAD]; #endif } nrecordable_t; // Type used to hold a real-time clock sample typedef struct rtcsample_s { atomic_ulong value; #if ATOMIC_PAD > 0 ulong pad[ATOMIC_PAD]; #endif } rtcsample_t; // The management structure // All bits 0 is an acceptable state, and the expected initial state typedef struct heap_s { start_t start[NUM_KINDS]; // Used to guide the search for a slab to allocate from nallocated_t num_allocated_slabs[NUM_KINDS]; // The number of allocated slabs of a given kind nrecordable_t num_recordable_slabs[NUM_KINDS]; // The number of slabs that can be recorded (a multiple of 
NUM_SDATA) rtcsample_t salloc_time[NUM_KINDS]; // The time the most recent slab allocation was started rtcsample_t grow_time[NUM_KINDS]; // The time the most recent grow recordable was started sdata_t sdata[NUM_KINDS][NUM_SDATA]; // Information about all allocated slabs #if defined NON_SLAB_TRACKING atomic_ulong num_nonslab_allocations; // Count of number of non-slab allocations that have not been freed #endif } heap_t; // TODO: get the heap pointer from the language runtime static __global heap_t heap; #define HEAP_POINTER &heap // Inhibit control flow optimizations #define O0(X) X = o0(X) __attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; } __attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; } __attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; } // Atomics wrappers #define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device) #define AS(P, V, O) __opencl_atomic_store(P, V, O, memory_scope_device) #define AFA(P, V, O) __opencl_atomic_fetch_add(P, V, O, memory_scope_device) #define AFS(P, V, O) __opencl_atomic_fetch_sub(P, V, O, memory_scope_device) #define AFN(P, V, O) __opencl_atomic_fetch_and(P, V, O, memory_scope_device) #define AFO(P, V, O) __opencl_atomic_fetch_or (P, V, O, memory_scope_device) #define ACE(P, E, V, O) __opencl_atomic_compare_exchange_strong(P, E, V, O, O, memory_scope_device) // realtime __attribute__((target("s-memrealtime")))static ulong realtime(void) { return __builtin_amdgcn_s_memrealtime(); } // The actual number of blocks in a slab with blocks of kind k static uint num_blocks(kind_t k) { return kinfo[k].num_blocks; } // The usable number of blocks in a slab with blocks of kind k static uint num_usable_blocks(kind_t k) { return kinfo[k].num_usable_blocks; } // The number of used blocks in a slab of kind k triggering skipping 
while searching static uint skip_threshold(kind_t k) { return kinfo[k].skip_threshold; } // The offset to the first block in a slab of kind k static uint block_offset(kind_t k) { return kinfo[k].block_offset; } // The index of the first unusable block in a slab of kind k static uint first_unusable(kind_t k) { return kinfo[k].first_unusable; } // The gap or distance between indices of unusable blocks in a slab of kind k static uint gap_unusable(kind_t k) { return kinfo[k].gap_unusable; } // The pattern of unusable bits when the gap is less than 32 static uint pattern_unusable(kind_t k) { return kinfo[k].pattern_unusable; } // The multiplier used to spread out the probes of individual lanes while searching a slab of kind k static uint spread_factor(kind_t k) { return kinfo[k].spread_factor; } // The number of active lanes at this point static uint active_lane_count(void) { if (__oclc_wavefrontsize64) { return __builtin_popcountl(__builtin_amdgcn_read_exec()); } else { return __builtin_popcount(__builtin_amdgcn_read_exec_lo()); } } // Overloads to broadcast the value held by the first active lane // The result is known to be wave-uniform static __attribute__((overloadable)) uint first(uint v) { return __builtin_amdgcn_readfirstlane(v); } static __attribute__((overloadable)) ulong first(ulong v) { uint2 v2 = __builtin_astype(v, uint2); uint2 w2; w2.x = __builtin_amdgcn_readfirstlane(v2.x); w2.y = __builtin_amdgcn_readfirstlane(v2.y); return __builtin_astype(w2, ulong); } static __attribute__((overloadable)) __global void * first(__global void * v) { uint2 v2 = __builtin_astype(v, uint2); uint2 w2; w2.x = __builtin_amdgcn_readfirstlane(v2.x); w2.y = __builtin_amdgcn_readfirstlane(v2.y); return __builtin_astype(w2, __global void *); } // Read val from one active lane whose predicate is one. 
// If no lanes have the predicate set, return none // This is like first, except that first may not have its predicate set static uint elect_uint(int pred, uint val, uint none) { uint ret = none; if (__oclc_wavefrontsize64) { ulong mask = __llvm_amdgcn_icmp_i64_i32(pred, 0, ICMP_NE); if (mask != 0UL) { uint l = __ockl_ctz_u64(mask); ret = __builtin_amdgcn_ds_bpermute(l << 2, val); } } else { uint mask = __llvm_amdgcn_icmp_i32_i32(pred, 0, ICMP_NE); if (mask != 0U) { uint l = __ockl_ctz_u32(mask); ret = __builtin_amdgcn_ds_bpermute(l << 2, val); } } return ret; } // Count the number of nonzero arguments across the wave static uint countnz(ulong a) { if (__oclc_wavefrontsize64) { ulong mask = __llvm_amdgcn_icmp_i64_i64(a, 0UL, ICMP_NE); return __builtin_popcountl(mask); } else { uint mask = __llvm_amdgcn_icmp_i32_i64(a, 0UL, ICMP_NE); return __builtin_popcount(mask); } } // The kind of the smallest block that can hold sz bytes static uint size_to_kind(uint sz) { sz = sz < 16 ? 16 : sz; uint b = 31 - __ockl_clz_u32(sz); uint v = 1 << b; return ((b - 4) << 1) + (sz > v) + (sz > (v | (v >> 1))); } // The size of a block of kind k // Alternatively we could place this in kinfo static uint kind_to_size(kind_t k) { uint s = 1 << ((k >> 1) + 4); return s + ((k & 1) != 0 ? 
(s >> 1) : 0); } // Get the sdata pointer corresponding to kind k and index i // Assumes only 2 levels static __global sdata_t * sdata_for(__global heap_t *hp, kind_t k, sid_t i) { if (i >= NUM_SDATA) { i -= NUM_SDATA; __global sdata_t *sdp = &hp->sdata[k][i >> SDATA_SHIFT]; ulong array = AL(&sdp->array, memory_order_relaxed); __global sdata_t *sda = (__global sdata_t *)array; return &sda[i & SDATA_MASK]; } else { return &hp->sdata[k][i]; } } // Get the sdata parent pointer corresponding to kind k and index i // Also assumes only 2 levels, and i must be >= NUM_SDATA static __global sdata_t * sdata_parent_for(__global heap_t *hp, kind_t k, sid_t i) { return &hp->sdata[k][(i - NUM_SDATA) >> SDATA_SHIFT]; } // Free a non-slab allocation static void non_slab_free(ulong addr) { __ockl_devmem_request(addr, 0); #if defined NON_SLAB_TRACKING uint aid = __ockl_activelane_u32(); uint nactive = active_lane_count(); if (aid == 0) { __global heap_t *hp = HEAP_POINTER; AFS(&hp->num_nonslab_allocations, nactive, memory_order_relaxed); } #endif } // public dealloc() entrypoint __attribute__((noinline)) void __ockl_dm_dealloc(ulong addr) { // Check for non-block and handle elsewhere if ((addr & 0xfffUL) == 0UL) { non_slab_free(addr); return; } // This must be a slab block ulong saddr = addr & ~(ulong)0x1fffffUL; __global slab_t *sptr = (__global slab_t *)saddr; kind_t my_k = sptr->k; sid_t my_i = sptr->i; __global heap_t *hp = HEAP_POINTER; int go = 1; do { o0(go); if (go) { kind_t first_k = first(my_k); sid_t first_i = first(my_i); if (my_k == first_k && my_i == first_i) { uint aid = __ockl_activelane_u32(); uint nactive = active_lane_count(); __global sdata_t *sdp = 0; if (aid == 0) sdp = sdata_for(hp, first_k, first_i); sdp = first(sdp); uint b = (uint)(addr - (saddr + block_offset(first_k))) / kind_to_size(first_k); uint mask = ~(1 << (b & 0x1f)); AFN(&sptr->in_use[b >> 5], mask, memory_order_relaxed); if (aid == 0) AFS(&sdp->num_used_blocks, nactive, memory_order_relaxed); go 
= 0; } } } while (__ockl_wfany_i32(go)); } // The is the malloc implementation for sizes greater // than ALLOC_THRESHOLD static __global void * non_slab_malloc(size_t sz) { ulong addr = __ockl_devmem_request(0, sz); #if defined NON_SLAB_TRACKING if (addr != 0) { uint aid = __ockl_activelane_u32(); uint nactive = active_lane_count(); if (aid == 0) { __global heap_t *hp = HEAP_POINTER; AFA(&hp->num_nonslab_allocations, nactive, memory_order_relaxed); } } #endif return (__global void *)addr; } // Wait for a while to let a new slab of kind k to appear static void new_slab_wait(__global heap_t *hp, kind_t k) { uint aid = __ockl_activelane_u32(); if (aid == 0) { ulong expected = AL(&hp->salloc_time[k].value, memory_order_relaxed); ulong now = realtime(); ulong dt = now - expected; if (dt < SLAB_TICKS) __ockl_rtcwait_u32(SLAB_TICKS - (uint)dt); } } // Wait for a while to let the number of recordable slabs of kind k to grow static void grow_recordable_wait(__global heap_t *hp, kind_t k) { uint aid = __ockl_activelane_u32(); if (aid == 0) { ulong expected = AL(&hp->grow_time[k].value, memory_order_relaxed); ulong now = realtime(); ulong dt = now - expected; if (dt < GROW_TICKS) __ockl_rtcwait_u32(GROW_TICKS - (uint)dt); } } // Wait to let a CAS failure clear static void cas_wait(void) { __builtin_amdgcn_s_sleep(CAS_SLEEP); } // Obtain a new sdata array // Expect only one active lane here static ulong obtain_new_array(void) { return __ockl_devmem_request(0, sizeof(sdata_t) * NUM_SDATA); } // Clear an array of sdata static void clear_array(ulong a) { uint aid = __ockl_activelane_u32(); uint nactive = active_lane_count(); __global ulong *p = (__global ulong *)a; for (uint i = aid; i < NUM_SDATA*ULONG_PER_SDATA; i += nactive) p[i] = 0UL; } // Release an array // Expect only one active lane here static void release_array(ulong a) { __ockl_devmem_request(a, 0); } // Try to grow the number of recordable slabs // The arguments and result are uniform static uint 
try_grow_num_recordable_slabs(__global heap_t *hp, kind_t k) { uint aid = __ockl_activelane_u32(); O0(aid); uint nrs; if (aid == 0) nrs = AL(&hp->num_recordable_slabs[k].value, memory_order_relaxed); nrs = first(nrs); if (nrs == MAX_RECORDABLE_SLABS) return GROW_FAILURE; uint ret = GROW_BUSY; if (aid == 0) { ulong expected = AL(&hp->grow_time[k].value, memory_order_relaxed); ulong now = realtime(); if (now - expected >= GROW_TICKS && ACE(&hp->grow_time[k].value, &expected, now, memory_order_relaxed)) ret = GROW_FAILURE; } ret = first(ret); if (ret == GROW_BUSY) return ret; ulong sa; if (aid == 0) sa = obtain_new_array(); sa = first(sa); if (!sa) return ret; clear_array(sa); for (;;) { O0(aid); if (aid == 0) nrs = AL(&hp->num_recordable_slabs[k].value, memory_order_relaxed); nrs = first(nrs); if (nrs == MAX_RECORDABLE_SLABS) { if (aid == 0) release_array(sa); return ret; } if (aid == 0) { __global sdata_t *sdp = sdata_parent_for(hp, k, nrs); ulong expected = 0UL; bool done = ACE(&sdp->array, &expected, sa, memory_order_relaxed); ret = done ? 
GROW_SUCCESS : ret; if (done) AFA(&hp->num_recordable_slabs[k].value, NUM_SDATA, memory_order_release); } ret = first(ret); if (ret == GROW_SUCCESS) return ret; cas_wait(); } } // Obtain a new slab // Only expect one lane active here static ulong obtain_new_slab(void) { ulong ret = __ockl_devmem_request(0, 1UL << 21); return ret; } // Initialize a slab // Rely on the caller to release the changes static void initialize_slab(__global slab_t *s, kind_t k) { uint aid = __ockl_activelane_u32(); O0(aid); uint nactive = active_lane_count(); uint g = gap_unusable(k); uint m = num_blocks(k); uint n = (m + 31) >> 5; __global uint *p = (__global uint *)&s->in_use; if (g > 32) { for (uint i = aid; i < n; i += nactive) p[i] = 0; uint di = g * nactive; for (uint i = first_unusable(k) + aid*g; i < m; i += di) p[i >> 5] = 1 << (i & 0x1f); } else { uint v = pattern_unusable(k); for (uint i = aid; i < n; i += nactive) p[i] = v; } if (aid == 0) { uint l = m & 0x1f; if (l != 0) p[n-1] |= ~0 << l; *((__global uint4 *)s) = (uint4)(k, 0, 0, 0); } } // Release a slab // Only expect one lane active here static void release_slab(ulong saddr) { __ockl_devmem_request(saddr, 0); } // Try to allocate a new slab of kind k static __global sdata_t * try_allocate_new_slab(__global heap_t *hp, kind_t k) { uint aid = __ockl_activelane_u32(); for (;;) { O0(aid); uint nas, nrs; if (aid == 0) nas = AL(&hp->num_allocated_slabs[k].value, memory_order_relaxed); nas = first(nas); if (nas == MAX_RECORDABLE_SLABS) return (__global sdata_t *)0; if (aid == 0) { uint expected = 0; bool s = ACE(&hp->num_recordable_slabs[k].value, &expected, NUM_SDATA, memory_order_relaxed); nrs = s ? NUM_SDATA : expected; } nrs = first(nrs); if (nas == nrs) { uint result = try_grow_num_recordable_slabs(hp, k); if (result != GROW_SUCCESS) { grow_recordable_wait(hp, k); return result == GROW_FAILURE ? 
(__global sdata_t *)0 : SDATA_BUSY; } } __global sdata_t *ret = SDATA_BUSY; if (aid == 0) { ulong expected = AL(&hp->salloc_time[k].value, memory_order_relaxed); ulong now = realtime(); if (now - expected >= SLAB_TICKS && ACE(&hp->salloc_time[k].value, &expected, now, memory_order_relaxed)) ret = (__global sdata_t *)0; } ret = first(ret); if (ret) return ret; ulong saddr; if (aid == 0) saddr = obtain_new_slab(); saddr = first(saddr); if (!saddr) return (__global sdata_t *)0; initialize_slab((__global slab_t *)saddr, k); for (;;) { O0(aid); if (aid == 0) nas = AL(&hp->num_allocated_slabs[k].value, memory_order_relaxed); nas = first(nas); if (nas == MAX_RECORDABLE_SLABS) return (__global sdata_t *)0; if (aid == 0) nrs = AL(&hp->num_recordable_slabs[k].value, memory_order_relaxed); nrs = first(nrs); if (nas == nrs) { if (aid == 0) release_slab(saddr); break; } if (aid == 0) { ret = sdata_for(hp, k, nas); ((__global slab_t *)saddr)->i = nas; ulong expected = 0; bool done = ACE(&ret->saddr, &expected, saddr, memory_order_relaxed); ret = done ? ret : (__global sdata_t *)0; if (done) AFA(&hp->num_allocated_slabs[k].value, 1, memory_order_release); } ret = first(ret); if (ret) return ret; cas_wait(); } } } // Find a slab of kind k that can be searched for blocks using // the "normal" approach. 
The arguments and results are uniform static __global sdata_t * normal_slab_find(__global heap_t *hp, kind_t k, uint nas) { __global sdata_t *ret = (__global sdata_t *)0; uint aid = __ockl_activelane_u32(); uint nactive = active_lane_count(); for (;;) { O0(aid); if (nas > 0) { int nleft = nas; uint i; if (aid == 0) i = AL(&hp->start[k].value, memory_order_relaxed); i = (first(i) + aid) % nas; do { __global sdata_t *sdp = sdata_for(hp, k, i); uint nub = AL(&sdp->num_used_blocks, memory_order_relaxed); uint besti = first(elect_uint(nub < skip_threshold(k), i, ~0)); if (besti != ~0) return sdata_for(hp, k, besti); i = (i + nactive) % nas; if (aid == 0) AS(&hp->start[k].value, i, memory_order_relaxed); nleft -= nactive; } while (nleft > 0); } __global sdata_t *sdp = try_allocate_new_slab(hp, k); if (sdp != SDATA_BUSY) return sdp; new_slab_wait(hp, k); if (aid == 0) nas = AL(&hp->num_allocated_slabs[k].value, memory_order_relaxed); nas = first(nas); } } // Find a slab of kind k that can be searched for blocks using // the "final" approach. 
// The arguments and results are uniform
//
// Sweeps MAX_RECORDABLE_SLABS slab indices of kind k0 for a slab with
// fewer than num_usable_blocks(k) blocks in use.  If that sweep fails the
// search moves on, once only, to kind k + 2 - (k & 1); a null pointer is
// returned when that kind is out of range or its sweep fails as well.
static __global sdata_t *
final_slab_find(__global heap_t *hp, kind_t k0)
{
    uint aid = __ockl_activelane_u32();
    uint nactive = active_lane_count();

    for (kind_t k = k0;;) {
        O0(aid);

        int nleft = MAX_RECORDABLE_SLABS;

        // Lane 0 loads the shared start index; other lanes start from a
        // defined 0 before first() is applied.
        uint i = 0;
        if (aid == 0)
            i = AL(&hp->start[k].value, memory_order_relaxed);
        i = (first(i) + aid) % MAX_RECORDABLE_SLABS;

        do {
            __global sdata_t *sdp = sdata_for(hp, k, i);
            uint nub = AL(&sdp->num_used_blocks, memory_order_relaxed);
            uint besti = first(elect_uint(nub < num_usable_blocks(k), i, ~0));
            if (besti != ~0)
                return sdata_for(hp, k, besti);

            i = (i + nactive) % MAX_RECORDABLE_SLABS;
            if (aid == 0)
                AS(&hp->start[k].value, i, memory_order_relaxed);
            nleft -= nactive;
        } while (nleft > 0);

        uint nextk = k + 2 - (k & 1);
        if (k != k0 || nextk >= NUM_KINDS)
            return (__global sdata_t *)0;

        uint nas = 0;
        if (aid == 0)
            nas = AL(&hp->num_allocated_slabs[nextk].value, memory_order_relaxed);
        nas = first(nas);

        if (nas < MAX_RECORDABLE_SLABS)
            return normal_slab_find(hp, nextk, nas);

        k = nextk;
    }
}

// Find a slab of kind k that can be searched for blocks
// The arguments and results are uniform
static __global sdata_t *
slab_find(__global heap_t *hp, kind_t k)
{
    uint aid = __ockl_activelane_u32();
    O0(aid);

    uint nas = 0;
    if (aid == 0)
        nas = AL(&hp->num_allocated_slabs[k].value, memory_order_relaxed);
    nas = first(nas);

    // Below the recordable limit use the normal search, otherwise the
    // "final" one.
    if (nas < MAX_RECORDABLE_SLABS)
        return normal_slab_find(hp, k, nas);
    else
        return final_slab_find(hp, k);
}

// Find an empty block in a specific slab
// The argument is uniform, the result is not
//
// Each lane scans the slab's in_use bitmap one 32-bit word at a time,
// starting at a lane-specific word, and tries to claim the lowest clear
// bit of the first non-full word it sees.  Returns the address of the
// claimed block, or null if n words were visited without a successful
// claim.
static __global void *
block_find(__global sdata_t *sdp)
{
    uint aid = __ockl_activelane_u32();
    O0(aid);
    uint nactive = active_lane_count();

    __global slab_t *sp = (__global slab_t *)sdp->saddr;
    kind_t k = sp->k;

    uint i = 0;
    if (aid == 0)
        i = AFA(&sp->start, nactive, memory_order_relaxed);
    i = ((first(i) + aid) * spread_factor(k) % num_blocks(k)) >> 5;

    // Number of 32-bit words covering num_blocks(k) bits
    uint n = (num_blocks(k) + 31) >> 5;

    __global void *ret = (__global void *)0;

    // NOTE(review): this loop header and the declaration of p were damaged
    // in this copy of the file; restored as a bounded scan over the n
    // bitmap words, with the element type presumed to be atomic_uint.
    // Confirm against the upstream source.
    for (uint j = 0; j < n; ++j) {
        __global atomic_uint *p = sp->in_use + i;
        uint m = AL(p, memory_order_relaxed);
        if (m != ~0) {
            uint b = __ockl_ctz_u32(~m);
            // Unsigned shift: b may be 31
            uint bit = 1u << b;
            uint mm = AFO(p, bit, memory_order_relaxed);
            // The bit was clear before our update, so this lane owns it
            if ((mm & bit) == 0) {
                uint ii = (i << 5) + b;
                ret = (__global void *)((__global char *)sp + block_offset(k) + kind_to_size(k)*ii);
                break;
            }
        }
        i = (i + 1) % n;
    }

    uint done = countnz((ulong)ret);
    if (aid == 0)
        AFA(&sdp->num_used_blocks, done, memory_order_relaxed);

    return ret;
}

// This is the malloc implementation for sizes that fit in some kind of block
static __global void *
slab_malloc(int sz)
{
    kind_t my_k = size_to_kind(sz);
    __global void *ret = (__global void *)0;
    __global heap_t *hp = HEAP_POINTER;

    // Outer loop: one pass per distinct kind requested in the wave.
    // Inner loop: retried until every lane of that kind has a block or
    // slab_find() has returned null.
    int k_go = 1;
    do {
        O0(k_go);
        if (k_go) {
            kind_t first_k = first(my_k);
            if (first_k == my_k) {
                int s_go = 1;
                do {
                    O0(s_go);
                    if (s_go) {
                        __global sdata_t *sdp = first(slab_find(hp, first_k));
                        if (sdp != (__global sdata_t *)0) {
                            ret = block_find(sdp);
                            if (ret != (__global void *)0) {
                                k_go = 0;
                                s_go = 0;
                            }
                        } else {
                            k_go = 0;
                            s_go = 0;
                        }
                    }
                } while (__ockl_wfany_i32(s_go));
            }
        }
    } while (__ockl_wfany_i32(k_go));

    return ret;
}

// public alloc() entrypoint
__attribute__((noinline)) __global void *
__ockl_dm_alloc(ulong sz)
{
    if (sz == 0)
        return (__global void *)0;

    if (sz > ALLOC_THRESHOLD)
        return non_slab_malloc(sz);

    return slab_malloc(sz);
}

#if defined NON_SLAB_TRACKING
// return a snapshot of the current number of nonslab allocations
// which haven't been deallocated
ulong
__ockl_dm_nna(void)
{
    __global heap_t *hp = HEAP_POINTER;
    return AL(&hp->num_nonslab_allocations, memory_order_relaxed);
}
#endif

ROCm-Device-Libs-rocm-5.0.0/ockl/src/dots.cl000066400000000000000000000122161415221260100203420ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License.
See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "oclc.h"
#include "ockl.h"

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Each helper below forwards a runtime clamp flag to a dot-product builtin.
// Both arms of the conditional call the builtin with a literal clamp value,
// so the builtin itself never receives a runtime flag.

__attribute__((target("dot7-insts"), const))
static float
amdgcn_fdot2(half2 a, half2 b, float c, bool s)
{
    return s ? __builtin_amdgcn_fdot2(a, b, c, true)
             : __builtin_amdgcn_fdot2(a, b, c, false);
}

__attribute__((target("dot2-insts"), const))
static int
amdgcn_sdot2(short2 a, short2 b, int c, bool s)
{
    return s ? __builtin_amdgcn_sdot2(a, b, c, true)
             : __builtin_amdgcn_sdot2(a, b, c, false);
}

__attribute__((target("dot2-insts"), const))
static uint
amdgcn_udot2(ushort2 a, ushort2 b, uint c, bool s)
{
    return s ? __builtin_amdgcn_udot2(a, b, c, true)
             : __builtin_amdgcn_udot2(a, b, c, false);
}

__attribute__((target("dot1-insts"), const))
static int
amdgcn_sdot4(int a, int b, int c, bool s)
{
    return s ? __builtin_amdgcn_sdot4(a, b, c, true)
             : __builtin_amdgcn_sdot4(a, b, c, false);
}

__attribute__((target("dot7-insts"), const))
static uint
amdgcn_udot4(uint a, uint b, uint c, bool s)
{
    return s ? __builtin_amdgcn_udot4(a, b, c, true)
             : __builtin_amdgcn_udot4(a, b, c, false);
}

__attribute__((target("dot1-insts"), const))
static int
amdgcn_sdot8(int a, int b, int c, bool s)
{
    return s ? __builtin_amdgcn_sdot8(a, b, c, true)
             : __builtin_amdgcn_sdot8(a, b, c, false);
}

__attribute__((target("dot7-insts"), const))
static uint
amdgcn_udot8(uint a, uint b, uint c, bool s)
{
    return s ? __builtin_amdgcn_udot8(a, b, c, true)
             : __builtin_amdgcn_udot8(a, b, c, false);
}

// ISA versions for which the entry points take the software path
#define SWDOT __oclc_ISA_version < 9006 || __oclc_ISA_version == 9009 || __oclc_ISA_version == 10100
#define AS_INT(X) __builtin_astype(X, int)
#define AS_UINT(X) __builtin_astype(X, uint)
#define ATTR __attribute__((const))

// a * b + c with floating-point contraction enabled for the expression
ATTR static float
fmuladd(float a, float b, float c)
{
    #pragma OPENCL FP_CONTRACT ON
    return a * b + c;
}

// Two-element half dot product accumulated into c.
// The hardware path forwards the caller's clamp flag s, as every other
// entry point in this file does; previously it always requested clamping.
// NOTE(review): the software path applies no clamping for any value of s.
ATTR float
__ockl_fdot2(half2 a, half2 b, float c, bool s)
{
    if (SWDOT)
        return fmuladd((float)a.s1, (float)b.s1, fmuladd((float)a.s0, (float)b.s0, c));
    else
        return amdgcn_fdot2(a, b, c, s);
}

// Two-element signed 16-bit dot product accumulated into c.
// With s set, the software path saturates the 64-bit sum to the int range.
ATTR int
__ockl_sdot2(short2 a, short2 b, int c, bool s)
{
    if (SWDOT) {
        int p0 = (int)a.s0 * (int)b.s0;
        int p1 = (int)a.s1 * (int)b.s1;
        long r = (long)c + (long)p0 + (long)p1;
        if (s)
            return r < -2147483648L ? -2147483648 : (r > 2147483647L ? 2147483647 : (int)r);
        else
            return (int)r;
    } else {
        return amdgcn_sdot2(a, b, c, s);
    }
}

// Two-element unsigned 16-bit dot product accumulated into c.
// With s set, the software path saturates the 64-bit sum to 0xffffffff.
ATTR uint
__ockl_udot2(ushort2 a, ushort2 b, uint c, bool s)
{
    if (SWDOT) {
        uint p0 = (uint)a.s0 * (uint)b.s0;
        uint p1 = (uint)a.s1 * (uint)b.s1;
        ulong r = (ulong)c + (ulong)p0 + (ulong)p1;
        return (s & (r > (ulong)0xffffffff)) ? 0xffffffff : (uint)r;
    } else {
        return amdgcn_udot2(a, b, c, s);
    }
}

// Four-element signed 8-bit dot product accumulated into c
ATTR int
__ockl_sdot4(char4 a, char4 b, int c, bool s)
{
    if (SWDOT) {
        int t =
            (int)a.s0 * (int)b.s0 +
            (int)a.s1 * (int)b.s1 +
            (int)a.s2 * (int)b.s2 +
            (int)a.s3 * (int)b.s3;
        return s ? __ockl_add_sat_i32(t, c) : (t + c);
    } else {
        return amdgcn_sdot4(AS_INT(a), AS_INT(b), c, s);
    }
}

// Four-element unsigned 8-bit dot product accumulated into c
ATTR uint
__ockl_udot4(uchar4 a, uchar4 b, uint c, bool s)
{
    if (SWDOT) {
        uint t =
            (uint)a.s0 * (uint)b.s0 +
            (uint)a.s1 * (uint)b.s1 +
            (uint)a.s2 * (uint)b.s2 +
            (uint)a.s3 * (uint)b.s3;
        return s ? __ockl_add_sat_u32(t, c) : (t + c);
    } else {
        return amdgcn_udot4(AS_UINT(a), AS_UINT(b), c, s);
    }
}

// Eight-element signed 4-bit dot product accumulated into c.
// Each nibble is sign-extended by shifting it to the top of the word and
// arithmetic-shifting it back down.
ATTR int
__ockl_sdot8(int a, int b, int c, bool s)
{
    if (SWDOT) {
        int t =
            ((a << 28) >> 28) * ((b << 28) >> 28) +
            ((a << 24) >> 28) * ((b << 24) >> 28) +
            ((a << 20) >> 28) * ((b << 20) >> 28) +
            ((a << 16) >> 28) * ((b << 16) >> 28) +
            ((a << 12) >> 28) * ((b << 12) >> 28) +
            ((a <<  8) >> 28) * ((b <<  8) >> 28) +
            ((a <<  4) >> 28) * ((b <<  4) >> 28) +
            ( a        >> 28) * ( b        >> 28);
        return s ? __ockl_add_sat_i32(t, c) : (t + c);
    } else {
        return amdgcn_sdot8(a, b, c, s);
    }
}

// Eight-element unsigned 4-bit dot product accumulated into c
ATTR uint
__ockl_udot8(uint a, uint b, uint c, bool s)
{
    if (SWDOT) {
        uint t =
            ( a        & 0xf) * ( b        & 0xf) +
            ((a >>  4) & 0xf) * ((b >>  4) & 0xf) +
            ((a >>  8) & 0xf) * ((b >>  8) & 0xf) +
            ((a >> 12) & 0xf) * ((b >> 12) & 0xf) +
            ((a >> 16) & 0xf) * ((b >> 16) & 0xf) +
            ((a >> 20) & 0xf) * ((b >> 20) & 0xf) +
            ((a >> 24) & 0xf) * ((b >> 24) & 0xf) +
            ((a >> 28)      ) * ((b >> 28)      );
        return s ? __ockl_add_sat_u32(t, c) : (t + c);
    } else {
        return amdgcn_udot8(a, b, c, s);
    }
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/gaaf.cl000066400000000000000000000026771415221260100203010ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "oclc.h"
#include "ockl.h"

#define AS_FLOAT(X) __builtin_astype(X, float)
#define AS_UINT(X) __builtin_astype(X, uint)
#define AC(P, E, V, O, R, S) __opencl_atomic_compare_exchange_strong(P, E, V, O, R, S)
#define AL(P, O, S) __opencl_atomic_load(P, O, S)

extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32");

// Float add on a global address through the fadd intrinsic; the returned
// value is discarded.
__attribute__((target("atomic-fadd-insts"))) static void
global_atomic_fadd(__global float *p, float v)
{
    __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(p, v);
}

// Float add implemented as a compare-exchange loop on the 32-bit pattern.
// On failure AC refreshes e with the current contents, so the sum is
// recomputed on every retry.
static void
generic_atomic_fadd(float *p, float v)
{
    atomic_uint *t = (atomic_uint *)p;
    uint e = AL(t, memory_order_relaxed, memory_scope_device);
    while (!AC(t, &e, AS_UINT(v + AS_FLOAT(e)), memory_order_relaxed, memory_order_relaxed, memory_scope_device))
        ;
}

// Atomic float add that returns nothing.  The intrinsic path is taken only
// for ISA versions 9008 and 9010, and only when p is neither a local nor a
// private address.
void
__ockl_atomic_add_noret_f32(float *p, float v)
{
    if ((__oclc_ISA_version == 9008 || __oclc_ISA_version == 9010) &&
        !__ockl_is_local_addr(p) && !__ockl_is_private_addr(p)) {
global_atomic_fadd((__global float *)p, v);
    } else {
        // Every other case uses the compare-exchange loop
        generic_atomic_fadd(p, v);
    }
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/hostcall.cl000066400000000000000000000043551415221260100212070ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

/** \brief Internal implementation of hostcall.
 *
 *  *** INTERNAL USE ONLY ***
 *  Internal function, not safe for direct use in user
 *  code. Application kernels must only use __ockl_hostcall_preview()
 *  defined below.
 */
extern long2
__ockl_hostcall_internal(void *buffer, uint service_id,
                         ulong arg0, ulong arg1, ulong arg2, ulong arg3,
                         ulong arg4, ulong arg5, ulong arg6, ulong arg7);

/** \brief Submit a wave-wide hostcall packet.
 *  \param service_id The service to be invoked on the host.
 *  \param arg0 Up to eight parameters (arg0..arg7)
 *  \return Two 64-bit values.
 *
 *  The hostcall is executed for all active threads in the
 *  wave. #service_id must be uniform across the active threads,
 *  otherwise behaviour is undefined. The service parameters may be
 *  different for each active thread, and correspondingly, the
 *  returned values are also different.
 *
 *  The contents of the input parameters and the return values are
 *  defined by the service being invoked.
 *
 *  *** PREVIEW FEATURE ***
 *  This is a feature preview and considered alpha quality only;
 *  behaviour may vary between ROCm releases. Device code that invokes
 *  hostcall can be launched only on the ROCm release that it was
 *  compiled for, otherwise behaviour is undefined.
 */
long2
__ockl_hostcall_preview(uint service_id,
                        ulong arg0, ulong arg1, ulong arg2, ulong arg3,
                        ulong arg4, ulong arg5, ulong arg6, ulong arg7)
{
    // Retrieve the buffer pointer passed as an implicit kernel
    // argument.
// This is at offset 3, which is the same as the OpenCL
    // printf buffer.
    __constant size_t *argptr = (__constant size_t *)__builtin_amdgcn_implicitarg_ptr();
    void *buffer = (void *)argptr[3];

    return __ockl_hostcall_internal(buffer, service_id,
                                    arg0, arg1, arg2, arg3,
                                    arg4, arg5, arg6, arg7);
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/hostcall_impl.cl000066400000000000000000000235401415221260100222250ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "ockl_hsa.h"

#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable

#define AC(P, E, V, O, R, S) \
    __opencl_atomic_compare_exchange_strong(P, E, V, O, R, S)
#define AL(P, O, S) __opencl_atomic_load(P, O, S)
#define AF(K, P, V, O, S) __opencl_atomic_fetch_##K(P, V, O, S)

typedef enum { STATUS_SUCCESS, STATUS_BUSY } status_t;

// Bit position of each field in header_t::control
typedef enum {
    CONTROL_OFFSET_READY_FLAG = 0,
    CONTROL_OFFSET_RESERVED0 = 1,
} control_offset_t;

// Bit width of each field in header_t::control
typedef enum {
    CONTROL_WIDTH_READY_FLAG = 1,
    CONTROL_WIDTH_RESERVED0 = 31,
} control_width_t;

typedef struct {
    ulong next;
    ulong activemask;
    uint service;
    uint control;
} header_t;

typedef struct {
    // 64 slots of 8 ulongs each
    ulong slots[64][8];
} payload_t;

typedef struct {
    __global header_t *headers;
    __global payload_t *payloads;
    hsa_signal_t doorbell;
    ulong free_stack;
    ulong ready_stack;
    ulong index_mask;
} buffer_t;

// Adds 1 to the signal with release ordering
static void
send_signal(hsa_signal_t signal)
{
    __ockl_hsa_signal_add(signal, 1, __ockl_memory_order_release);
}

// Header addressed by the index bits of ptr (the bits kept by index_mask)
static __global header_t *
get_header(__global buffer_t *buffer, ulong ptr)
{
    return buffer->headers + (ptr & buffer->index_mask);
}

// Payload addressed by the index bits of ptr
static __global payload_t *
get_payload(__global buffer_t *buffer, ulong ptr)
{
    return buffer->payloads + (ptr & buffer->index_mask);
}

// Mask with the low `width` bits set.  Computed in unsigned arithmetic so
// that width 31 (CONTROL_WIDTH_RESERVED0) does not overflow a signed int,
// and width >= 32 does not shift by the full word size.
static uint
control_field_mask(uint width)
{
    return width >= 32 ? ~0u : (1u << width) - 1u;
}

// Extract the `width`-bit field starting at bit `offset` of control
static uint
get_control_field(uint control, uint offset, uint width)
{
    return (control >> offset) & control_field_mask(width);
}

static uint
get_ready_flag(uint control)
{
    return get_control_field(control, CONTROL_OFFSET_READY_FLAG,
                             CONTROL_WIDTH_READY_FLAG);
}

// Return control with the `width`-bit field at `offset` replaced by value.
// value is masked to the field width so that an oversized value cannot
// spill into neighbouring fields.
static uint
set_control_field(uint control, uint offset, uint width, uint value)
{
    uint field = control_field_mask(width);
    uint mask = ~(field << offset);
    return (control & mask) | ((value & field) << offset);
}

static uint
set_ready_flag(uint control)
{
    return set_control_field(control, CONTROL_OFFSET_READY_FLAG,
                             CONTROL_WIDTH_READY_FLAG, 1);
}

// Passes in_val through an empty volatile asm statement (input and output
// share a register) so the compiler cannot see through the value.
static uint
optimizationBarrierHack(uint in_val)
{
    uint out_val;
    __asm__ volatile("; ockl readfirstlane hoisting hack %0"
                     : "=v"(out_val)
                     : "0"(in_val));
    return out_val;
}

// Pop the top entry of the stack rooted at *top and return its pointer.
// On a failed compare-exchange F is refreshed with the current top and the
// loop retries after a short sleep.
static ulong
pop(__global ulong *top, __global buffer_t *buffer)
{
    ulong F = AL((__global atomic_ulong *)top, memory_order_acquire,
                 memory_scope_all_svm_devices);
    // F is guaranteed to be non-zero, since there are at least as
    // many packets as there are waves, and each wave can hold at most
    // one packet.
    while (true) {
        __global header_t *P = get_header(buffer, F);
        ulong N = AL((__global atomic_ulong *)&P->next, memory_order_relaxed,
                     memory_scope_all_svm_devices);
        if (AC((__global atomic_ulong *)top, &F, N, memory_order_acquire,
               memory_order_relaxed, memory_scope_all_svm_devices)) {
            break;
        }
        __builtin_amdgcn_s_sleep(1);
    }

    return F;
}

/** \brief Use the first active lane to get a free packet and
 *         broadcast to the whole wave.
*/
static ulong
pop_free_stack(__global buffer_t *buffer, uint me, uint low)
{
    ulong packet = 0;
    if (me == low)
        packet = pop(&buffer->free_stack, buffer);

    // readfirstlane works on 32-bit values, so the 64-bit pointer is
    // broadcast as two halves and reassembled.
    uint lo = packet;
    uint hi = packet >> 32;
    lo = __builtin_amdgcn_readfirstlane(lo);
    hi = __builtin_amdgcn_readfirstlane(hi);

    return ((ulong)hi << 32) | lo;
}

// Push the packet identified by ptr onto the stack rooted at *top.
// A failed compare-exchange refreshes the expected top, which is then
// written into the packet's next link before the retry.
static void
push(__global ulong *top, ulong ptr, __global buffer_t *buffer)
{
    ulong head = AL((__global const atomic_ulong *)top, memory_order_relaxed,
                    memory_scope_all_svm_devices);
    __global header_t *node = get_header(buffer, ptr);

    for (;;) {
        node->next = head;
        if (AC((__global atomic_ulong *)top, &head, ptr, memory_order_release,
               memory_order_relaxed, memory_scope_all_svm_devices))
            return;
        __builtin_amdgcn_s_sleep(1);
    }
}

/** \brief Use the first active lane in a wave to submit a ready
 *         packet and signal the host.
 */
static void
push_ready_stack(__global buffer_t *buffer, ulong ptr, uint me, uint low)
{
    if (me != low)
        return;

    push(&buffer->ready_stack, ptr, buffer);
    send_signal(buffer->doorbell);
}

static ulong
inc_ptr_tag(ulong ptr, ulong index_mask)
{
    // Unit step for the tag.
    ulong step = index_mask + 1;
    ulong bumped = ptr + step;

    // When the tag for index 0 wraps, increment the tag.
    if (bumped == 0)
        return step;
    return bumped;
}

/** \brief Return the packet after incrementing the ABA tag
 */
static void
return_free_packet(__global buffer_t *buffer, ulong ptr, uint me, uint low)
{
    if (me != low)
        return;

    ulong tagged = inc_ptr_tag(ptr, buffer->index_mask);
    push(&buffer->free_stack, tagged, buffer);
}

// Write the header fields from lane `low` and this lane's eight arguments
// into payload slot `me`.
static void
fill_packet(__global header_t *header, __global payload_t *payload,
            uint service_id, ulong arg0, ulong arg1, ulong arg2, ulong arg3,
            ulong arg4, ulong arg5, ulong arg6, ulong arg7, uint me, uint low)
{
    // Read the exec mask before entering the single-lane branch below
    ulong active = __builtin_amdgcn_read_exec();

    if (me == low) {
        header->service = service_id;
        header->activemask = active;
        header->control = set_ready_flag(0);
    }

    __global ulong *slot = payload->slots[me];
    slot[0] = arg0;
    slot[1] = arg1;
    slot[2] = arg2;
    slot[3] = arg3;
    slot[4] = arg4;
    slot[5] = arg5;
    slot[6] = arg6;
    slot[7] = arg7;
}

/** \brief Wait for the host response and return the first two ulong
 *         entries per workitem.
 *
 *  After the packet is submitted in READY state, the wave spins until
 *  the host changes the state to DONE. Each workitem reads the first
 *  two ulong elements in its slot and returns this.
 */
static long2
get_return_value(__global header_t *header, __global payload_t *payload,
                 uint me, uint low)
{
    // The while loop needs to be executed by all active
    // lanes. Otherwise, later reads from ptr are performed only by
    // the first thread, while other threads reuse a value cached from
    // previous operations. The use of readfirstlane in the while loop
    // prevents this reordering.
    //
    // In the absence of the readfirstlane, only one thread has a
    // sequenced-before relation from the atomic load on
    // header->control to the ordinary loads on ptr. As a result, the
    // compiler is free to reorder operations in such a way that the
    // ordinary loads are performed only by the first thread. The use
    // of readfirstlane provides a stronger code-motion barrier, and
    // it effectively "spreads out" the sequenced-before relation to
    // the ordinary stores in other threads too.
while (true) {
        // Lanes other than `low` keep the initial 1; readfirstlane below
        // replaces it with the value loaded by the first active lane.
        uint ready_flag = 1;
        if (me == low) {
            uint control =
                AL((__global const atomic_uint *)&header->control,
                   memory_order_acquire, memory_scope_all_svm_devices);
            ready_flag = get_ready_flag(control);
        }
        ready_flag = __builtin_amdgcn_readfirstlane(ready_flag);
        // The wait ends once the ready flag reads as zero
        if (ready_flag == 0)
            break;
        __builtin_amdgcn_s_sleep(1);
    }

    // Each lane returns the first two ulongs of its own payload slot
    __global ulong *ptr = (__global ulong *)(payload->slots + me);
    ulong value0 = *ptr++;
    ulong value1 = *ptr;
    long2 retval = {value0, value1};
    return retval;
}

/** \brief The implementation that should be hidden behind an ABI
 *
 *  The transaction is a wave-wide operation, where the service_id
 *  must be uniform, but the parameters are different for each
 *  workitem. Parameters from all active lanes are written into a
 *  hostcall packet. The hostcall blocks until the host processes the
 *  request, and returns the response it receives.
 *
 *  TODO: This function and everything above it should eventually move
 *  to a separate library that is loaded by the language runtime. The
 *  function itself will be exposed as an ordinary function symbol to
 *  be linked into kernel objects that are loaded after this library.
 *
 *  *** INTERNAL USE ONLY ***
 *  Internal function, not safe for direct use in user
 *  code. Application kernels must only use __ockl_hostcall_preview()
 *  defined elsewhere.
 *
 *  The function is marked noinline to preserve all calls in the
 *  kernel. This is required because the compiler backend includes a
 *  check for the presence of this function as a way to determine that
 *  hostcall is used.
 *
 *  FIXME: Additionally, the optnone attribute is required to ensure
 *  that the SelectAcceleratorCode pass in HCC does not forcibly
 *  inline this function. This should be removed when the SAC pass or
 *  HCC itself is removed.
*/
__attribute__((noinline)) __attribute__((optnone)) long2
__ockl_hostcall_internal(void *_buffer, uint service_id, ulong arg0, ulong arg1,
                         ulong arg2, ulong arg3, ulong arg4, ulong arg5,
                         ulong arg6, ulong arg7)
{
    // `me` is this lane's id, passed through the asm barrier; `low` is the
    // id held by the first active lane.
    uint me = __ockl_lane_u32();
    me = optimizationBarrierHack(me);
    uint low = __builtin_amdgcn_readfirstlane(me);

    __global buffer_t *buffer = (__global buffer_t *)_buffer;
    // One packet is taken for the whole wave
    ulong packet_ptr = pop_free_stack(buffer, me, low);
    __global header_t *header = get_header(buffer, packet_ptr);
    __global payload_t *payload = get_payload(buffer, packet_ptr);

    // Fill, submit, wait for the host, then recycle the packet
    fill_packet(header, payload, service_id, arg0, arg1, arg2, arg3, arg4, arg5,
                arg6, arg7, me, low);
    push_ready_stack(buffer, packet_ptr, me, low);

    long2 retval = get_return_value(header, payload, me, low);
    return_free_packet(buffer, packet_ptr, me, low);
    return retval;
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/hsaqs.cl000066400000000000000000000153411415221260100205120ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "oclc.h"
#include "ockl_hsa.h"

#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable

#define ATTR

// The T argument of AL, AF and AX is not used by the expansion
#define AL(T,P,O,S) __opencl_atomic_load(P,O,S)
#define AS(P,V,O,S) __opencl_atomic_store(P,V,O,S)
#define AF(T,K,P,V,O,S) __opencl_atomic_fetch_##K(P,V,O,S)
#define AX(T,P,V,O,S) __opencl_atomic_exchange(P,V,O,S)
#define AC(P,E,V,O,R,S) __opencl_atomic_compare_exchange_strong(P,E,V,O,R,S)

//
// HSA queue ops
//

// Atomically load read_dispatch_id of the queue
ATTR ulong
OCKL_MANGLE_T(hsa_queue,load_read_index)(const __global hsa_queue_t *queue, __ockl_memory_order mem_order)
{
    const __global amd_queue_t *amdq = (const __global amd_queue_t *)queue;
    __global atomic_ulong *idx = (__global atomic_ulong *)&amdq->read_dispatch_id;
    return AL(ulong, idx, mem_order, memory_scope_all_svm_devices);
}

// Atomically load write_dispatch_id of the queue
ATTR ulong
OCKL_MANGLE_T(hsa_queue,load_write_index)(const __global hsa_queue_t *queue, __ockl_memory_order mem_order)
{
    const __global amd_queue_t *amdq = (const __global amd_queue_t *)queue;
    __global atomic_ulong *idx = (__global atomic_ulong *)&amdq->write_dispatch_id;
    return AL(ulong, idx, mem_order, memory_scope_all_svm_devices);
}

// Atomically add value to write_dispatch_id; returns the fetch-add result
ATTR ulong
OCKL_MANGLE_T(hsa_queue,add_write_index)(__global hsa_queue_t *queue, ulong value, __ockl_memory_order mem_order)
{
    __global amd_queue_t *amdq = (__global amd_queue_t *)queue;
    __global atomic_ulong *idx = (__global atomic_ulong *)&amdq->write_dispatch_id;
    return AF(ulong, add, idx, value, mem_order, memory_scope_all_svm_devices);
}

// Compare-exchange on write_dispatch_id.  Returns the local copy of
// `expected` after the compare-exchange call has had the chance to
// update it.
ATTR ulong
OCKL_MANGLE_T(hsa_queue,cas_write_index)(__global hsa_queue_t *queue, ulong expected, ulong value, __ockl_memory_order mem_order)
{
    __global amd_queue_t *amdq = (__global amd_queue_t *)queue;
    __global atomic_ulong *idx = (__global atomic_ulong *)&amdq->write_dispatch_id;
    ulong seen = expected;
    AC(idx, &seen, value, mem_order, memory_order_relaxed, memory_scope_all_svm_devices);
    return seen;
}

// Atomically store value to write_dispatch_id
ATTR void
OCKL_MANGLE_T(hsa_queue,store_write_index)(__global hsa_queue_t *queue, ulong value, __ockl_memory_order mem_order)
{
    __global amd_queue_t *amdq = (__global amd_queue_t *)queue;
    __global atomic_ulong *idx = (__global atomic_ulong *)&amdq->write_dispatch_id;
    AS(idx, value, mem_order, memory_scope_all_svm_devices);
}

//
// HSA signal ops
//

// If the signal has an event mailbox, store the event id there and issue
// s_sendmsg with the low 8 bits of the id.
static ATTR void
update_mbox(const __global amd_signal_t *sig)
{
    __global atomic_ulong *mbox = (__global atomic_ulong *)sig->event_mailbox_ptr;
    if (!mbox)
        return;

    uint id = sig->event_id;
    AS(mbox, id, memory_order_release, memory_scope_all_svm_devices);
    __builtin_amdgcn_s_sendmsg(1 | (0 << 4), __builtin_amdgcn_readfirstlane(id) & 0xff);
}

// Atomically load the signal value
ATTR long
OCKL_MANGLE_T(hsa_signal,load)(const hsa_signal_t sig, __ockl_memory_order mem_order)
{
    const __global amd_signal_t *asig = (const __global amd_signal_t *)sig.handle;
    __global atomic_long *val = (__global atomic_long *)&asig->value;
    return AL(long, val, mem_order, memory_scope_all_svm_devices);
}

// The read-modify-write operations below update the signal value and then
// call update_mbox().

ATTR void
OCKL_MANGLE_T(hsa_signal,add)(hsa_signal_t sig, long value, __ockl_memory_order mem_order)
{
    __global amd_signal_t *asig = (__global amd_signal_t *)sig.handle;
    __global atomic_long *val = (__global atomic_long *)&asig->value;
    AF(long, add, val, value, mem_order, memory_scope_all_svm_devices);
    update_mbox(asig);
}

ATTR void
OCKL_MANGLE_T(hsa_signal,and)(hsa_signal_t sig, long value, __ockl_memory_order mem_order)
{
    __global amd_signal_t *asig = (__global amd_signal_t *)sig.handle;
    __global atomic_long *val = (__global atomic_long *)&asig->value;
    AF(long, and, val, value, mem_order, memory_scope_all_svm_devices);
    update_mbox(asig);
}

ATTR void
OCKL_MANGLE_T(hsa_signal,or)(hsa_signal_t sig, long value, __ockl_memory_order mem_order)
{
    __global amd_signal_t *asig = (__global amd_signal_t *)sig.handle;
    __global atomic_long *val = (__global atomic_long *)&asig->value;
    AF(long, or, val, value, mem_order, memory_scope_all_svm_devices);
    update_mbox(asig);
}

ATTR void
OCKL_MANGLE_T(hsa_signal,xor)(hsa_signal_t sig, long value, __ockl_memory_order mem_order)
{
    __global amd_signal_t *asig = (__global amd_signal_t *)sig.handle;
    __global atomic_long *val = (__global atomic_long *)&asig->value;
    AF(long, xor, val, value, mem_order, memory_scope_all_svm_devices);
    update_mbox(asig);
}

// Atomically exchange the signal value; returns the result of the exchange
ATTR long
OCKL_MANGLE_T(hsa_signal,exchange)(hsa_signal_t sig, long value, __ockl_memory_order mem_order)
{
    __global amd_signal_t *asig = (__global amd_signal_t *)sig.handle;
    __global atomic_long *val = (__global atomic_long *)&asig->value;
    long previous = AX(long, val, value, mem_order, memory_scope_all_svm_devices);
    update_mbox(asig);
    return previous;
}

ATTR void
OCKL_MANGLE_T(hsa_signal,subtract)(hsa_signal_t sig, long value, __ockl_memory_order mem_order)
{
    __global amd_signal_t *asig = (__global amd_signal_t *)sig.handle;
    __global atomic_long *val = (__global atomic_long *)&asig->value;
    AF(long, sub, val, value, mem_order, memory_scope_all_svm_devices);
    update_mbox(asig);
}

// Compare-exchange on the signal value.  update_mbox() is called only
// when the exchange succeeded.
ATTR long
OCKL_MANGLE_T(hsa_signal,cas)(hsa_signal_t sig, long expected, long value, __ockl_memory_order mem_order)
{
    __global amd_signal_t *asig = (__global amd_signal_t *)sig.handle;
    __global atomic_long *val = (__global atomic_long *)&asig->value;
    long seen = expected;
    bool swapped = AC(val, &seen, value, mem_order, memory_order_relaxed, memory_scope_all_svm_devices);
    if (swapped)
        update_mbox(asig);
    return seen;
}

// Store to a signal.  User signals get an atomic store plus update_mbox();
// other kinds write a doorbell, by a path that depends on the ISA version.
ATTR void
OCKL_MANGLE_T(hsa_signal,store)(hsa_signal_t sig, long value, __ockl_memory_order mem_order)
{
    __global amd_signal_t *s = (__global amd_signal_t *)sig.handle;
    if (s->kind == AMD_SIGNAL_KIND_USER) {
        AS((__global atomic_long *)&s->value, value, mem_order, memory_scope_all_svm_devices);
        update_mbox(s);
    } else if (__oclc_ISA_version >= 9000) {
        // Hardware doorbell supports AQL semantics.
AS((__global atomic_ulong *)s->hardware_doorbell_ptr, (ulong)value, memory_order_release, memory_scope_all_svm_devices);
    } else {
        {
            __global amd_queue_t * q = s->queue_ptr;
            __global atomic_uint *lp = (__global atomic_uint *)&q->legacy_doorbell_lock;
            // Spin until the lock word changes from 0 to 1.  A failed
            // compare-exchange overwrites e, so it is reset before the
            // next attempt.
            uint e = 0;
            while (!AC(lp, &e, (uint)1, memory_order_acquire, memory_order_relaxed, memory_scope_all_svm_devices)) {
                __builtin_amdgcn_s_sleep(1);
                e = 0;
            }

            ulong legacy_dispatch_id = value + 1;

            // The doorbell is written only when this id is larger than the
            // maximum recorded so far.
            if (legacy_dispatch_id > q->max_legacy_doorbell_dispatch_id_plus_1) {
                AS((__global atomic_ulong *)&q->max_legacy_doorbell_dispatch_id_plus_1, legacy_dispatch_id, memory_order_relaxed, memory_scope_all_svm_devices);

                // For ISA versions below 8000 the doorbell value is the
                // id masked with (2 * queue size - 1) and multiplied by 16.
                if (__oclc_ISA_version < 8000) {
                    legacy_dispatch_id = (ulong)(((uint)legacy_dispatch_id & ((q->hsa_queue.size << 1) - 1)) * 16);
                }

                *s->legacy_hardware_doorbell_ptr = (uint)legacy_dispatch_id;
            }

            // Release the lock
            AS(lp, 0, memory_order_release, memory_scope_all_svm_devices);
        }
    }
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/image.cl000066400000000000000000001416351415221260100204630ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/ #include "irif.h" #include "ockl.h" #include "oclc.h" #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define EII() __oclc_ISA_version != 9010 #define RATTR __attribute__((pure)) #define ERATTR __attribute__((pure, target("extended-image-insts"))) #define WATTR #define GATTR __attribute__((const)) // TSHARP/SSHARP access #define FIELD(P,B,W) ((P[B >> 5] >> (B & 0x1f)) & ((1 << W) - 1)) #define WORD(P,I) P[I] #define LOAD_TSHARP(I) *(__constant uint8 *)I #define LOAD_VSHARP(I) *(__constant uint4 *)I #define LOAD_SSHARP(S) *(__constant uint4 *)S // Adjustments for hardware precision limits #define ADJUST_X(C,I,S) do { \ float _w = (float)WORD(I,10); \ float _p = FIELD(S,15,1) ? 1.0f : _w; \ float _x = __builtin_floorf(C * _p) * __builtin_amdgcn_rcpf(_p); \ C = FIELD(S,84,1) ? C : _x; \ } while (0) #define ADJUST_XY(C,I,S) do { \ float _w = (float)WORD(I,10); \ float _h = (float)(FIELD(I,78,14) + 1U); \ bool _f = FIELD(S,15,1); \ float _p = _f ? 1.0f : _w; \ float _q = _f ? 1.0f : _h; \ float _x = __builtin_floorf(C.x * _p) * __builtin_amdgcn_rcpf(_p); \ float _y = __builtin_floorf(C.y * _q) * __builtin_amdgcn_rcpf(_q); \ bool _m = FIELD(S,84,1); \ C.x = _m ? C.x : _x; \ C.y = _m ? C.y : _y; \ } while (0) #define ADJUST_XYZ(C,I,S) do { \ float _w = (float)WORD(I,10); \ float _h = (float)(FIELD(I,78,14) + 1U); \ float _d = (float)(FIELD(I, 128, 13) + 1U); \ bool _f = FIELD(S,15,1); \ float _p = _f ? 1.0f : _w; \ float _q = _f ? 1.0f : _h; \ float _r = _f ? 1.0f : _d; \ float _x = __builtin_floorf(C.x * _p) * __builtin_amdgcn_rcpf(_p); \ float _y = __builtin_floorf(C.y * _q) * __builtin_amdgcn_rcpf(_q); \ float _z = __builtin_floorf(C.z * _r) * __builtin_amdgcn_rcpf(_r); \ bool _m = FIELD(S,84,1); \ C.x = _m ? C.x : _x; \ C.y = _m ? C.y : _y; \ C.z = _m ? 
C.z : _z; \ } while (0) GATTR static float fmuladd_f32(float a, float b, float c) { #pragma OPENCL FP_CONTRACT ON return a * b + c; } #define LS_ARRAY_FACE(I,F) (6 * (((I) << 8) >> 8) + (F)) #define SAMPLE_ARRAY_FACE(I, F) fmuladd_f32(__builtin_rintf(I), 8.0f, F) #define CUBE_PREP(C) do { \ float _vx = C.x; \ float _vy = C.y; \ float _vz = C.z; \ float _rl = __builtin_amdgcn_rcpf(__builtin_amdgcn_cubema(_vx, _vy, _vz)); \ C.x = fmuladd_f32(__builtin_amdgcn_cubesc(_vx, _vy, _vz), _rl, 0.5f); \ C.y = fmuladd_f32(__builtin_amdgcn_cubetc(_vx, _vy, _vz), _rl, 0.5f); \ C.z = __builtin_amdgcn_cubeid(_vx, _vy, _vz); \ } while (0) RATTR static float4 my_image_load_1d_v4f32_i32(uint ix, uint8 t) { return __llvm_amdgcn_image_load_1d_v4f32_i32(ix, t); } RATTR static float4 my_image_load_2d_v4f32_i32(uint ix, uint iy, uint8 t) { return __llvm_amdgcn_image_load_2d_v4f32_i32(ix, iy, t); } RATTR static float4 my_image_load_3d_v4f32_i32(uint ix, uint iy, uint iz, uint8 t) { return __llvm_amdgcn_image_load_3d_v4f32_i32(ix, iy, iz, t); } RATTR static float4 my_image_load_cube_v4f32_i32(uint ix, uint iy, uint iface, uint8 t) { return __llvm_amdgcn_image_load_cube_v4f32_i32(ix, iy, iface, t); } RATTR static float4 my_image_load_1darray_v4f32_i32(uint ix, uint islice, uint8 t) { return __llvm_amdgcn_image_load_1darray_v4f32_i32(ix, islice, t); } RATTR static float4 my_image_load_2darray_v4f32_i32(uint ix, uint iy, uint islice, uint8 t) { return __llvm_amdgcn_image_load_2darray_v4f32_i32(ix, iy, islice, t); } RATTR static float4 my_image_load_mip_1d_v4f32_i32(uint ix, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_1d_v4f32_i32(ix, imip, t); } RATTR static float4 my_image_load_mip_2d_v4f32_i32(uint ix, uint iy, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_2d_v4f32_i32(ix, iy, imip, t); } RATTR static float4 my_image_load_mip_3d_v4f32_i32(uint ix, uint iy, uint iz, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_3d_v4f32_i32(ix, iy, iz, imip, t); } RATTR 
static float4 my_image_load_mip_cube_v4f32_i32(uint ix, uint iy, uint iface, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_cube_v4f32_i32(ix, iy, iface, imip, t); }

// float4 mip-level loads for the array dimensions. Each wrapper passes its
// arguments through, unchanged and in order, to the same-named
// __llvm_amdgcn_image_load_mip_* function.
RATTR static float4 my_image_load_mip_1darray_v4f32_i32(uint ix, uint islice, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_1darray_v4f32_i32(ix, islice, imip, t); }
RATTR static float4 my_image_load_mip_2darray_v4f32_i32(uint ix, uint iy, uint islice, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_2darray_v4f32_i32(ix, iy, islice, imip, t); }

// half4 loads: the same set of dimensions as the float4 loads, calling the
// v4f16 functions instead. Arguments are forwarded unchanged; t is the image
// descriptor passed by value as a uint8.
RATTR static half4 my_image_load_1d_v4f16_i32(uint ix, uint8 t) { return __llvm_amdgcn_image_load_1d_v4f16_i32(ix, t); }
RATTR static half4 my_image_load_2d_v4f16_i32(uint ix, uint iy, uint8 t) { return __llvm_amdgcn_image_load_2d_v4f16_i32(ix, iy, t); }
RATTR static half4 my_image_load_3d_v4f16_i32(uint ix, uint iy, uint iz, uint8 t) { return __llvm_amdgcn_image_load_3d_v4f16_i32(ix, iy, iz, t); }
RATTR static half4 my_image_load_cube_v4f16_i32(uint ix, uint iy, uint iface, uint8 t) { return __llvm_amdgcn_image_load_cube_v4f16_i32(ix, iy, iface, t); }
RATTR static half4 my_image_load_1darray_v4f16_i32(uint ix, uint islice, uint8 t) { return __llvm_amdgcn_image_load_1darray_v4f16_i32(ix, islice, t); }
RATTR static half4 my_image_load_2darray_v4f16_i32(uint ix, uint iy, uint islice, uint8 t) { return __llvm_amdgcn_image_load_2darray_v4f16_i32(ix, iy, islice, t); }

// half4 mip-level loads: imip is passed after the coordinates and before the
// descriptor.
RATTR static half4 my_image_load_mip_1d_v4f16_i32(uint ix, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_1d_v4f16_i32(ix, imip, t); }
RATTR static half4 my_image_load_mip_2d_v4f16_i32(uint ix, uint iy, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_2d_v4f16_i32(ix, iy, imip, t); }
RATTR static half4 my_image_load_mip_3d_v4f16_i32(uint ix, uint iy, uint iz, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_3d_v4f16_i32(ix, iy, iz, imip, t); }
RATTR static half4 my_image_load_mip_cube_v4f16_i32(uint ix, uint iy, uint iface, uint imip, uint8
t) { return __llvm_amdgcn_image_load_mip_cube_v4f16_i32(ix, iy, iface, imip, t); }

// half4 mip-level loads for the array dimensions; arguments are forwarded
// unchanged to the same-named __llvm_amdgcn_image_load_mip_* function.
RATTR static half4 my_image_load_mip_1darray_v4f16_i32(uint ix, uint islice, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_1darray_v4f16_i32(ix, islice, imip, t); }
RATTR static half4 my_image_load_mip_2darray_v4f16_i32(uint ix, uint iy, uint islice, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_2darray_v4f16_i32(ix, iy, islice, imip, t); }

// Scalar-float loads. Only 2d and 2darray exist, each with a mip variant.
// NOTE(review): presumably these serve the depth image types - confirm.
RATTR static float my_image_load_2d_f32_i32(uint ix, uint iy, uint8 t) { return __llvm_amdgcn_image_load_2d_f32_i32(ix, iy, t); }
RATTR static float my_image_load_2darray_f32_i32(uint ix, uint iy, uint islice, uint8 t) { return __llvm_amdgcn_image_load_2darray_f32_i32(ix, iy, islice, t); }
RATTR static float my_image_load_mip_2d_f32_i32(uint ix, uint iy, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_2d_f32_i32(ix, iy, imip, t); }
RATTR static float my_image_load_mip_2darray_f32_i32(uint ix, uint iy, uint islice, uint imip, uint8 t) { return __llvm_amdgcn_image_load_mip_2darray_f32_i32(ix, iy, islice, imip, t); }

// float4 stores. These are declared with WATTR instead of RATTR and return
// void. The pixel value comes first, then the coordinates, then the
// descriptor, all forwarded unchanged.
WATTR static void my_image_store_1d_v4f32_i32(float4 pix, uint ix, uint8 t) { __llvm_amdgcn_image_store_1d_v4f32_i32(pix, ix, t); }
WATTR static void my_image_store_2d_v4f32_i32(float4 pix, uint ix, uint iy, uint8 t) { __llvm_amdgcn_image_store_2d_v4f32_i32(pix, ix, iy, t); }
WATTR static void my_image_store_3d_v4f32_i32(float4 pix, uint ix, uint iy, uint iz, uint8 t) { __llvm_amdgcn_image_store_3d_v4f32_i32(pix, ix, iy, iz, t); }
WATTR static void my_image_store_cube_v4f32_i32(float4 pix, uint ix, uint iy, uint iface, uint8 t) { __llvm_amdgcn_image_store_cube_v4f32_i32(pix, ix, iy, iface, t); }
WATTR static void my_image_store_1darray_v4f32_i32(float4 pix, uint ix, uint islice, uint8 t) { __llvm_amdgcn_image_store_1darray_v4f32_i32(pix, ix, islice, t); }
WATTR static void my_image_store_2darray_v4f32_i32(float4 pix, uint ix, uint iy, uint islice, uint8 t) {
__llvm_amdgcn_image_store_2darray_v4f32_i32(pix, ix, iy, islice, t); }

// float4 mip-level stores: pixel, coordinates, imip, descriptor - forwarded
// unchanged to the same-named __llvm_amdgcn_image_store_mip_* function.
WATTR static void my_image_store_mip_1d_v4f32_i32(float4 pix, uint ix, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_1d_v4f32_i32(pix, ix, imip, t); }
WATTR static void my_image_store_mip_2d_v4f32_i32(float4 pix, uint ix, uint iy, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_2d_v4f32_i32(pix, ix, iy, imip, t); }
WATTR static void my_image_store_mip_3d_v4f32_i32(float4 pix, uint ix, uint iy, uint iz, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_3d_v4f32_i32(pix, ix, iy, iz, imip, t); }
WATTR static void my_image_store_mip_cube_v4f32_i32(float4 pix, uint ix, uint iy, uint iface, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_cube_v4f32_i32(pix, ix, iy, iface, imip, t); }
WATTR static void my_image_store_mip_1darray_v4f32_i32(float4 pix, uint ix, uint islice, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_1darray_v4f32_i32(pix, ix, islice, imip, t); }
WATTR static void my_image_store_mip_2darray_v4f32_i32(float4 pix, uint ix, uint iy, uint islice, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_2darray_v4f32_i32(pix, ix, iy, islice, imip, t); }

// half4 stores: the same argument layout as the float4 stores, with a half4
// pixel and the v4f16 functions.
WATTR static void my_image_store_1d_v4f16_i32(half4 pix, uint ix, uint8 t) { __llvm_amdgcn_image_store_1d_v4f16_i32(pix, ix, t); }
WATTR static void my_image_store_2d_v4f16_i32(half4 pix, uint ix, uint iy, uint8 t) { __llvm_amdgcn_image_store_2d_v4f16_i32(pix, ix, iy, t); }
WATTR static void my_image_store_3d_v4f16_i32(half4 pix, uint ix, uint iy, uint iz, uint8 t) { __llvm_amdgcn_image_store_3d_v4f16_i32(pix, ix, iy, iz, t); }
WATTR static void my_image_store_cube_v4f16_i32(half4 pix, uint ix, uint iy, uint iface, uint8 t) { __llvm_amdgcn_image_store_cube_v4f16_i32(pix, ix, iy, iface, t); }
WATTR static void my_image_store_1darray_v4f16_i32(half4 pix, uint ix, uint islice, uint8 t) { __llvm_amdgcn_image_store_1darray_v4f16_i32(pix, ix, islice, t); }
WATTR static void my_image_store_2darray_v4f16_i32(half4 pix, uint ix, uint
iy, uint islice, uint8 t) { __llvm_amdgcn_image_store_2darray_v4f16_i32(pix, ix, iy, islice, t); }

// half4 mip-level stores: pixel, coordinates, imip, descriptor - forwarded
// unchanged to the same-named __llvm_amdgcn_image_store_mip_* function.
WATTR static void my_image_store_mip_1d_v4f16_i32(half4 pix, uint ix, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_1d_v4f16_i32(pix, ix, imip, t); }
WATTR static void my_image_store_mip_2d_v4f16_i32(half4 pix, uint ix, uint iy, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_2d_v4f16_i32(pix, ix, iy, imip, t); }
WATTR static void my_image_store_mip_3d_v4f16_i32(half4 pix, uint ix, uint iy, uint iz, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_3d_v4f16_i32(pix, ix, iy, iz, imip, t); }
WATTR static void my_image_store_mip_cube_v4f16_i32(half4 pix, uint ix, uint iy, uint iface, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_cube_v4f16_i32(pix, ix, iy, iface, imip, t); }
WATTR static void my_image_store_mip_1darray_v4f16_i32(half4 pix, uint ix, uint islice, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_1darray_v4f16_i32(pix, ix, islice, imip, t); }
WATTR static void my_image_store_mip_2darray_v4f16_i32(half4 pix, uint ix, uint iy, uint islice, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_2darray_v4f16_i32(pix, ix, iy, islice, imip, t); }

// Scalar-float stores. Only 2d and 2darray exist, each with a mip variant.
// NOTE(review): presumably these serve the depth image types - confirm.
WATTR static void my_image_store_2d_f32_i32(float pix, uint ix, uint iy, uint8 t) { __llvm_amdgcn_image_store_2d_f32_i32(pix, ix, iy, t); }
WATTR static void my_image_store_2darray_f32_i32(float pix, uint ix, uint iy, uint islice, uint8 t) { __llvm_amdgcn_image_store_2darray_f32_i32(pix, ix, iy, islice, t); }
WATTR static void my_image_store_mip_2d_f32_i32(float pix, uint ix, uint iy, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_2d_f32_i32(pix, ix, iy, imip, t); }
WATTR static void my_image_store_mip_2darray_f32_i32(float pix, uint ix, uint iy, uint islice, uint imip, uint8 t) { __llvm_amdgcn_image_store_mip_2darray_f32_i32(pix, ix, iy, islice, imip, t); }

// First of the sample wrappers: float coordinates, then the image descriptor
// t (uint8) and the sampler descriptor s (uint4), forwarded unchanged.
RATTR static float4 my_image_sample_1d_v4f32_f32(float x, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_1d_v4f32_f32(x, t, s); }
// float4 sample wrappers for 1d, 2d and 3d images. Each dimension has four
// flavours: plain (declared RATTR), and _lz, _l (extra lod argument) and _d
// (derivative arguments first), all declared ERATTR. Every wrapper forwards
// its arguments unchanged to the same-named __llvm_amdgcn_image_sample_*
// function; t is the image descriptor (uint8), s the sampler descriptor (uint4).
ERATTR static float4 my_image_sample_lz_1d_v4f32_f32(float x, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_1d_v4f32_f32(x, t, s); }
ERATTR static float4 my_image_sample_l_1d_v4f32_f32(float x, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_1d_v4f32_f32(x, lod, t, s); }
ERATTR static float4 my_image_sample_d_1d_v4f32_f32_f32(float dxdh, float dxdv, float x, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_1d_v4f32_f32_f32(dxdh, dxdv, x, t, s); }

RATTR static float4 my_image_sample_2d_v4f32_f32(float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_2d_v4f32_f32(x, y, t, s); }
ERATTR static float4 my_image_sample_lz_2d_v4f32_f32(float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_2d_v4f32_f32(x, y, t, s); }
ERATTR static float4 my_image_sample_l_2d_v4f32_f32(float x, float y, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_2d_v4f32_f32(x, y, lod, t, s); }
ERATTR static float4 my_image_sample_d_2d_v4f32_f32_f32(float dxdh, float dydh, float dxdv, float dydv, float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_2d_v4f32_f32_f32(dxdh, dydh, dxdv, dydv, x, y, t, s); }

RATTR static float4 my_image_sample_3d_v4f32_f32(float x, float y, float z, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_3d_v4f32_f32(x, y, z, t, s); }
ERATTR static float4 my_image_sample_lz_3d_v4f32_f32(float x, float y, float z, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_3d_v4f32_f32(x, y, z, t, s); }
ERATTR static float4 my_image_sample_l_3d_v4f32_f32(float x, float y, float z, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_3d_v4f32_f32(x, y, z, lod, t, s); }
ERATTR static float4 my_image_sample_d_3d_v4f32_f32_f32(float dxdh, float dydh, float dzdh, float dxdv, float dydv, float dzdv, float x, float y, float z, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_3d_v4f32_f32_f32(dxdh, dydh, dzdh, dxdv, dydv, dzdv, x, y, z, t, s); }
RATTR
static float4 my_image_sample_cube_v4f32_f32(float x, float y, float face, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_cube_v4f32_f32(x, y, face, t, s); }

// float4 cube sampling. Only the _lz and _l flavours accompany the plain one;
// there is no _d (derivative) cube wrapper here.
ERATTR static float4 my_image_sample_lz_cube_v4f32_f32(float x, float y, float face, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_cube_v4f32_f32(x, y, face, t, s); }
ERATTR static float4 my_image_sample_l_cube_v4f32_f32(float x, float y, float face, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_cube_v4f32_f32(x, y, face, lod, t, s); }

// float4 1darray sampling: the slice is passed as a float after the coordinate.
RATTR static float4 my_image_sample_1darray_v4f32_f32(float x, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_1darray_v4f32_f32(x, slice, t, s); }
ERATTR static float4 my_image_sample_lz_1darray_v4f32_f32(float x, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_1darray_v4f32_f32(x, slice, t, s); }
ERATTR static float4 my_image_sample_l_1darray_v4f32_f32(float x, float slice, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_1darray_v4f32_f32(x, slice, lod, t, s); }
ERATTR static float4 my_image_sample_d_1darray_v4f32_f32_f32(float dxdh, float dxdv, float x, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_1darray_v4f32_f32_f32(dxdh, dxdv, x, slice, t, s); }

// float4 2darray sampling: the slice follows x and y.
RATTR static float4 my_image_sample_2darray_v4f32_f32(float x, float y, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_2darray_v4f32_f32(x, y, slice, t, s); }
ERATTR static float4 my_image_sample_lz_2darray_v4f32_f32(float x, float y, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_2darray_v4f32_f32(x, y, slice, t, s); }
ERATTR static float4 my_image_sample_l_2darray_v4f32_f32(float x, float y, float slice, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_2darray_v4f32_f32(x, y, slice, lod, t, s); }
ERATTR static float4 my_image_sample_d_2darray_v4f32_f32_f32(float dxdh, float dydh, float dxdv, float dydv, float x, float y, float slice, uint8 t, uint4
s) { return __llvm_amdgcn_image_sample_d_2darray_v4f32_f32_f32(dxdh, dydh, dxdv, dydv, x, y, slice, t, s); }

// half4 sample wrappers for 1d, 2d and 3d images. They take the same
// parameters as the float4 versions (coordinates stay float) and call the
// v4f16 functions. Plain is RATTR; _lz, _l and _d are ERATTR. Arguments are
// forwarded unchanged.
RATTR static half4 my_image_sample_1d_v4f16_f32(float x, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_1d_v4f16_f32(x, t, s); }
ERATTR static half4 my_image_sample_lz_1d_v4f16_f32(float x, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_1d_v4f16_f32(x, t, s); }
ERATTR static half4 my_image_sample_l_1d_v4f16_f32(float x, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_1d_v4f16_f32(x, lod, t, s); }
ERATTR static half4 my_image_sample_d_1d_v4f16_f32_f32(float dxdh, float dxdv, float x, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_1d_v4f16_f32_f32(dxdh, dxdv, x, t, s); }

RATTR static half4 my_image_sample_2d_v4f16_f32(float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_2d_v4f16_f32(x, y, t, s); }
ERATTR static half4 my_image_sample_lz_2d_v4f16_f32(float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_2d_v4f16_f32(x, y, t, s); }
ERATTR static half4 my_image_sample_l_2d_v4f16_f32(float x, float y, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_2d_v4f16_f32(x, y, lod, t, s); }
ERATTR static half4 my_image_sample_d_2d_v4f16_f32_f32(float dxdh, float dydh, float dxdv, float dydv, float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_2d_v4f16_f32_f32(dxdh, dydh, dxdv, dydv, x, y, t, s); }

RATTR static half4 my_image_sample_3d_v4f16_f32(float x, float y, float z, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_3d_v4f16_f32(x, y, z, t, s); }
ERATTR static half4 my_image_sample_lz_3d_v4f16_f32(float x, float y, float z, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_3d_v4f16_f32(x, y, z, t, s); }
ERATTR static half4 my_image_sample_l_3d_v4f16_f32(float x, float y, float z, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_3d_v4f16_f32(x, y, z, lod, t, s); }
ERATTR static half4
my_image_sample_d_3d_v4f16_f32_f32(float dxdh, float dydh, float dzdh, float dxdv, float dydv, float dzdv, float x, float y, float z, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_3d_v4f16_f32_f32(dxdh, dydh, dzdh, dxdv, dydv, dzdv, x, y, z, t, s); }

// half4 cube sampling: plain, _lz and _l only; there is no _d wrapper here.
RATTR static half4 my_image_sample_cube_v4f16_f32(float x, float y, float face, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_cube_v4f16_f32(x, y, face, t, s); }
ERATTR static half4 my_image_sample_lz_cube_v4f16_f32(float x, float y, float face, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_cube_v4f16_f32(x, y, face, t, s); }
ERATTR static half4 my_image_sample_l_cube_v4f16_f32(float x, float y, float face, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_cube_v4f16_f32(x, y, face, lod, t, s); }

// half4 1darray sampling: the slice is passed as a float after the coordinate.
RATTR static half4 my_image_sample_1darray_v4f16_f32(float x, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_1darray_v4f16_f32(x, slice, t, s); }
ERATTR static half4 my_image_sample_lz_1darray_v4f16_f32(float x, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_1darray_v4f16_f32(x, slice, t, s); }
ERATTR static half4 my_image_sample_l_1darray_v4f16_f32(float x, float slice, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_1darray_v4f16_f32(x, slice, lod, t, s); }
ERATTR static half4 my_image_sample_d_1darray_v4f16_f32_f32(float dxdh, float dxdv, float x, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_1darray_v4f16_f32_f32(dxdh, dxdv, x, slice, t, s); }

// half4 2darray sampling: the slice follows x and y.
RATTR static half4 my_image_sample_2darray_v4f16_f32(float x, float y, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_2darray_v4f16_f32(x, y, slice, t, s); }
ERATTR static half4 my_image_sample_lz_2darray_v4f16_f32(float x, float y, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_2darray_v4f16_f32(x, y, slice, t, s); }
ERATTR static half4 my_image_sample_l_2darray_v4f16_f32(float x, float y, float slice, float
lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_2darray_v4f16_f32(x, y, slice, lod, t, s); }

// Last of the half4 wrappers: 2darray sampling with explicit derivatives.
ERATTR static half4 my_image_sample_d_2darray_v4f16_f32_f32(float dxdh, float dydh, float dxdv, float dydv, float x, float y, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_2darray_v4f16_f32_f32(dxdh, dydh, dxdv, dydv, x, y, slice, t, s); }

// Scalar-float sample wrappers. Only 2d and 2darray exist, each in plain
// (RATTR), _lz, _l and _d (ERATTR) flavours; arguments are forwarded
// unchanged.
// NOTE(review): presumably these serve the depth image types - confirm.
RATTR static float my_image_sample_2d_f32_f32(float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_2d_f32_f32(x, y, t, s); }
ERATTR static float my_image_sample_lz_2d_f32_f32(float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_2d_f32_f32(x, y, t, s); }
ERATTR static float my_image_sample_l_2d_f32_f32(float x, float y, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_2d_f32_f32(x, y, lod, t, s); }
ERATTR static float my_image_sample_d_2d_f32_f32_f32(float dxdh, float dydh, float dxdv, float dydv, float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_2d_f32_f32_f32(dxdh, dydh, dxdv, dydv, x, y, t, s); }

RATTR static float my_image_sample_2darray_f32_f32(float x, float y, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_2darray_f32_f32(x, y, slice, t, s); }
ERATTR static float my_image_sample_lz_2darray_f32_f32(float x, float y, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_lz_2darray_f32_f32(x, y, slice, t, s); }
ERATTR static float my_image_sample_l_2darray_f32_f32(float x, float y, float slice, float lod, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_l_2darray_f32_f32(x, y, slice, lod, t, s); }
ERATTR static float my_image_sample_d_2darray_f32_f32_f32(float dxdh, float dydh, float dxdv, float dydv, float x, float y, float slice, uint8 t, uint4 s) { return __llvm_amdgcn_image_sample_d_2darray_f32_f32_f32(dxdh, dydh, dxdv, dydv, x, y, slice, t, s); }
ERATTR static float4 my_image_gather4_lz_2d_v4f32_f32_r(float x, float y, uint8 t, uint4 s) { return
__llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_r(x, y, t, s); }

// gather4_lz wrappers for 2d images, one per function-name suffix (_g, _b, _a);
// arguments are forwarded unchanged.
// NOTE(review): the suffix presumably names the gathered channel - confirm.
ERATTR static float4 my_image_gather4_lz_2d_v4f32_f32_g(float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_g(x, y, t, s); }
ERATTR static float4 my_image_gather4_lz_2d_v4f32_f32_b(float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_b(x, y, t, s); }
ERATTR static float4 my_image_gather4_lz_2d_v4f32_f32_a(float x, float y, uint8 t, uint4 s) { return __llvm_amdgcn_image_gather4_lz_2d_v4f32_f32_a(x, y, t, s); }

// Public image_load entry points, named through OCKL_MANGLE_T with a
// dimension suffix. Each one loads the image descriptor from i with
// LOAD_TSHARP and calls the matching static wrapper with the components of
// the integer coordinate c.
RATTR float4
OCKL_MANGLE_T(image_load,1D)(TSHARP i, int c)
{
    return my_image_load_1d_v4f32_i32(c, LOAD_TSHARP(i));
}

// 1Da: c.x is the coordinate, c.y is passed in the islice position.
RATTR float4
OCKL_MANGLE_T(image_load,1Da)(TSHARP i, int2 c)
{
    return my_image_load_1darray_v4f32_i32(c.x, c.y, LOAD_TSHARP(i));
}

// 1Db: takes the buffer path. The descriptor is loaded as a uint4 via
// LOAD_VSHARP and the three trailing arguments are literal zeros.
RATTR float4
OCKL_MANGLE_T(image_load,1Db)(TSHARP i, int c)
{
    return __llvm_amdgcn_struct_buffer_load_format_v4f32(LOAD_VSHARP(i), c, 0, 0, 0);
}

RATTR float4
OCKL_MANGLE_T(image_load,2D)(TSHARP i, int2 c)
{
    return my_image_load_2d_v4f32_i32(c.x, c.y, LOAD_TSHARP(i));
}

// 2Da: c.z is passed in the islice position; c.w is not read.
RATTR float4
OCKL_MANGLE_T(image_load,2Da)(TSHARP i, int4 c)
{
    return my_image_load_2darray_v4f32_i32(c.x, c.y, c.z, LOAD_TSHARP(i));
}

// 2Dad / 2Dd: scalar-float variants of 2Da / 2D.
RATTR float
OCKL_MANGLE_T(image_load,2Dad)(TSHARP i, int4 c)
{
    return my_image_load_2darray_f32_i32(c.x, c.y, c.z, LOAD_TSHARP(i));
}

RATTR float
OCKL_MANGLE_T(image_load,2Dd)(TSHARP i, int2 c)
{
    return my_image_load_2d_f32_i32(c.x, c.y, LOAD_TSHARP(i));
}

// 3D: c.x, c.y, c.z are used; c.w is not read.
RATTR float4
OCKL_MANGLE_T(image_load,3D)(TSHARP i, int4 c)
{
    return my_image_load_3d_v4f32_i32(c.x, c.y, c.z, LOAD_TSHARP(i));
}

// CM: f is passed in the iface position of the cube load.
RATTR float4
OCKL_MANGLE_T(image_load,CM)(TSHARP i, int2 c, int f)
{
    return my_image_load_cube_v4f32_i32(c.x, c.y, f, LOAD_TSHARP(i));
}

// CMa: c.z and f are first merged into a single index by LS_ARRAY_FACE, and
// the result is passed to the same cube load used by CM.
RATTR float4
OCKL_MANGLE_T(image_load,CMa)(TSHARP i, int4 c, int f)
{
    f = LS_ARRAY_FACE(c.z, f);
    return my_image_load_cube_v4f32_i32(c.x, c.y, f, LOAD_TSHARP(i));
}

RATTR float4 OCKL_MANGLE_T(image_load_lod,1D)(TSHARP i, int c, int l) { return
my_image_load_mip_1d_v4f32_i32(c, l, LOAD_TSHARP(i)); }

// Public image_load_lod entry points: like image_load, with the extra int l
// passed in the imip position of the matching mip wrapper.
RATTR float4
OCKL_MANGLE_T(image_load_lod,1Da)(TSHARP i, int2 c, int l)
{
    return my_image_load_mip_1darray_v4f32_i32(c.x, c.y, l, LOAD_TSHARP(i));
}

RATTR float4
OCKL_MANGLE_T(image_load_lod,2D)(TSHARP i, int2 c, int l)
{
    return my_image_load_mip_2d_v4f32_i32(c.x, c.y, l, LOAD_TSHARP(i));
}

RATTR float4
OCKL_MANGLE_T(image_load_lod,2Da)(TSHARP i, int4 c, int l)
{
    return my_image_load_mip_2darray_v4f32_i32(c.x, c.y, c.z, l, LOAD_TSHARP(i));
}

// 2Dad / 2Dd: scalar-float variants.
RATTR float
OCKL_MANGLE_T(image_load_lod,2Dad)(TSHARP i, int4 c, int l)
{
    return my_image_load_mip_2darray_f32_i32(c.x, c.y, c.z, l, LOAD_TSHARP(i));
}

RATTR float
OCKL_MANGLE_T(image_load_lod,2Dd)(TSHARP i, int2 c, int l)
{
    return my_image_load_mip_2d_f32_i32(c.x, c.y, l, LOAD_TSHARP(i));
}

RATTR float4
OCKL_MANGLE_T(image_load_lod,3D)(TSHARP i, int4 c, int l)
{
    return my_image_load_mip_3d_v4f32_i32(c.x, c.y, c.z, l, LOAD_TSHARP(i));
}

RATTR float4
OCKL_MANGLE_T(image_load_lod,CM)(TSHARP i, int2 c, int f, int l)
{
    return my_image_load_mip_cube_v4f32_i32(c.x, c.y, f, l, LOAD_TSHARP(i));
}

// CMa: c.z and f are merged by LS_ARRAY_FACE before the cube mip load.
RATTR float4
OCKL_MANGLE_T(image_load_lod,CMa)(TSHARP i, int4 c, int f, int l)
{
    f = LS_ARRAY_FACE(c.z, f);
    return my_image_load_mip_cube_v4f32_i32(c.x, c.y, f, l, LOAD_TSHARP(i));
}

// Public image_loadh entry points: half4 results through the v4f16 wrappers.
RATTR half4
OCKL_MANGLE_T(image_loadh,1D)(TSHARP i, int c)
{
    return my_image_load_1d_v4f16_i32(c, LOAD_TSHARP(i));
}

RATTR half4
OCKL_MANGLE_T(image_loadh,1Da)(TSHARP i, int2 c)
{
    return my_image_load_1darray_v4f16_i32(c.x, c.y, LOAD_TSHARP(i));
}

// 1Db: buffer path; the descriptor is loaded as a uint4 via LOAD_VSHARP and
// the three trailing arguments are literal zeros.
RATTR half4
OCKL_MANGLE_T(image_loadh,1Db)(TSHARP i, int c)
{
    return __llvm_amdgcn_struct_buffer_load_format_v4f16(LOAD_VSHARP(i), c, 0, 0, 0);
}

RATTR half4
OCKL_MANGLE_T(image_loadh,2D)(TSHARP i, int2 c)
{
    return my_image_load_2d_v4f16_i32(c.x, c.y, LOAD_TSHARP(i));
}

RATTR half4
OCKL_MANGLE_T(image_loadh,2Da)(TSHARP i, int4 c)
{
    return my_image_load_2darray_v4f16_i32(c.x, c.y, c.z, LOAD_TSHARP(i));
}

RATTR half4 OCKL_MANGLE_T(image_loadh,3D)(TSHARP i, int4 c) {
return my_image_load_3d_v4f16_i32(c.x, c.y, c.z, LOAD_TSHARP(i)); }

// image_loadh for cube images: f is passed in the iface position.
RATTR half4
OCKL_MANGLE_T(image_loadh,CM)(TSHARP i, int2 c, int f)
{
    return my_image_load_cube_v4f16_i32(c.x, c.y, f, LOAD_TSHARP(i));
}

// CMa: c.z and f are merged by LS_ARRAY_FACE before the cube load.
RATTR half4
OCKL_MANGLE_T(image_loadh,CMa)(TSHARP i, int4 c, int f)
{
    f = LS_ARRAY_FACE(c.z, f);
    return my_image_load_cube_v4f16_i32(c.x, c.y, f, LOAD_TSHARP(i));
}

// Public image_loadh_lod entry points: half4 results, with int l passed in
// the imip position of the matching mip wrapper.
RATTR half4
OCKL_MANGLE_T(image_loadh_lod,1D)(TSHARP i, int c, int l)
{
    return my_image_load_mip_1d_v4f16_i32(c, l, LOAD_TSHARP(i));
}

RATTR half4
OCKL_MANGLE_T(image_loadh_lod,1Da)(TSHARP i, int2 c, int l)
{
    return my_image_load_mip_1darray_v4f16_i32(c.x, c.y, l, LOAD_TSHARP(i));
}

RATTR half4
OCKL_MANGLE_T(image_loadh_lod,2D)(TSHARP i, int2 c, int l)
{
    return my_image_load_mip_2d_v4f16_i32(c.x, c.y, l, LOAD_TSHARP(i));
}

RATTR half4
OCKL_MANGLE_T(image_loadh_lod,2Da)(TSHARP i, int4 c, int l)
{
    return my_image_load_mip_2darray_v4f16_i32(c.x, c.y, c.z, l, LOAD_TSHARP(i));
}

RATTR half4
OCKL_MANGLE_T(image_loadh_lod,3D)(TSHARP i, int4 c, int l)
{
    return my_image_load_mip_3d_v4f16_i32(c.x, c.y, c.z, l, LOAD_TSHARP(i));
}

RATTR half4
OCKL_MANGLE_T(image_loadh_lod,CM)(TSHARP i, int2 c, int f, int l)
{
    return my_image_load_mip_cube_v4f16_i32(c.x, c.y, f, l, LOAD_TSHARP(i));
}

// CMa: c.z and f are merged by LS_ARRAY_FACE before the cube mip load.
RATTR half4
OCKL_MANGLE_T(image_loadh_lod,CMa)(TSHARP i, int4 c, int f, int l)
{
    f = LS_ARRAY_FACE(c.z, f);
    return my_image_load_mip_cube_v4f16_i32(c.x, c.y, f, l, LOAD_TSHARP(i));
}

// Public image_store entry points: the pixel p is the last parameter here but
// is passed first to the static store wrapper, followed by the coordinate
// components and the descriptor loaded with LOAD_TSHARP.
WATTR void
OCKL_MANGLE_T(image_store,1D)(TSHARP i, int c, float4 p)
{
    my_image_store_1d_v4f32_i32(p, c, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_store,1Da)(TSHARP i, int2 c, float4 p)
{
    my_image_store_1darray_v4f32_i32(p, c.x, c.y, LOAD_TSHARP(i));
}

// 1Db: buffer path; the descriptor is loaded as a uint4 via LOAD_VSHARP and
// the three trailing arguments are literal zeros.
WATTR void
OCKL_MANGLE_T(image_store,1Db)(TSHARP i, int c, float4 p)
{
    __llvm_amdgcn_struct_buffer_store_format_v4f32(p, LOAD_VSHARP(i), c, 0, 0, 0);
}

WATTR void
OCKL_MANGLE_T(image_store,2D)(TSHARP i, int2 c, float4 p)
{
    my_image_store_2d_v4f32_i32(p, c.x, c.y, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_store,2Da)(TSHARP i, int4 c, float4 p) { my_image_store_2darray_v4f32_i32(p, c.x, c.y, c.z, LOAD_TSHARP(i)); }

// 2Dad / 2Dd: scalar-float stores.
WATTR void
OCKL_MANGLE_T(image_store,2Dad)(TSHARP i, int4 c, float p)
{
    my_image_store_2darray_f32_i32(p, c.x, c.y, c.z, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_store,2Dd)(TSHARP i, int2 c, float p)
{
    my_image_store_2d_f32_i32(p, c.x, c.y, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_store,3D)(TSHARP i, int4 c, float4 p)
{
    my_image_store_3d_v4f32_i32(p, c.x, c.y, c.z, LOAD_TSHARP(i));
}

// CM: f is passed in the iface position of the cube store.
WATTR void
OCKL_MANGLE_T(image_store,CM)(TSHARP i, int2 c, int f, float4 p)
{
    my_image_store_cube_v4f32_i32(p, c.x, c.y, f, LOAD_TSHARP(i));
}

// CMa: c.z and f are merged by LS_ARRAY_FACE before the cube store.
WATTR void
OCKL_MANGLE_T(image_store,CMa)(TSHARP i, int4 c, int f, float4 p)
{
    f = LS_ARRAY_FACE(c.z, f);
    my_image_store_cube_v4f32_i32(p, c.x, c.y, f, LOAD_TSHARP(i));
}

// Public image_store_lod entry points: like image_store, with int l passed in
// the imip position of the matching mip store wrapper.
WATTR void
OCKL_MANGLE_T(image_store_lod,1D)(TSHARP i, int c, int l, float4 p)
{
    my_image_store_mip_1d_v4f32_i32(p, c, l, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_store_lod,1Da)(TSHARP i, int2 c, int l, float4 p)
{
    my_image_store_mip_1darray_v4f32_i32(p, c.x, c.y, l, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_store_lod,2D)(TSHARP i, int2 c, int l, float4 p)
{
    my_image_store_mip_2d_v4f32_i32(p, c.x, c.y, l, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_store_lod,2Da)(TSHARP i, int4 c, int l, float4 p)
{
    my_image_store_mip_2darray_v4f32_i32(p, c.x, c.y, c.z, l, LOAD_TSHARP(i));
}

// 2Dad / 2Dd: scalar-float stores.
WATTR void
OCKL_MANGLE_T(image_store_lod,2Dad)(TSHARP i, int4 c, int l, float p)
{
    my_image_store_mip_2darray_f32_i32(p, c.x, c.y, c.z, l, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_store_lod,2Dd)(TSHARP i, int2 c, int l, float p)
{
    my_image_store_mip_2d_f32_i32(p, c.x, c.y, l, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_store_lod,3D)(TSHARP i, int4 c, int l, float4 p)
{
    my_image_store_mip_3d_v4f32_i32(p, c.x, c.y, c.z, l, LOAD_TSHARP(i));
}

WATTR void OCKL_MANGLE_T(image_store_lod,CM)(TSHARP i, int2 c, int f, int l, float4
p) { my_image_store_mip_cube_v4f32_i32(p, c.x, c.y, f, l, LOAD_TSHARP(i)); }

// CMa: c.z and f are merged by LS_ARRAY_FACE before the cube mip store.
WATTR void
OCKL_MANGLE_T(image_store_lod,CMa)(TSHARP i, int4 c, int f, int l, float4 p)
{
    f = LS_ARRAY_FACE(c.z, f);
    my_image_store_mip_cube_v4f32_i32(p, c.x, c.y, f, l, LOAD_TSHARP(i));
}

// Public image_storeh entry points: half4 pixel, v4f16 store wrappers. The
// pixel p is passed first to the wrapper, then the coordinate components,
// then the descriptor loaded with LOAD_TSHARP.
WATTR void
OCKL_MANGLE_T(image_storeh,1D)(TSHARP i, int c, half4 p)
{
    my_image_store_1d_v4f16_i32(p, c, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_storeh,1Da)(TSHARP i, int2 c, half4 p)
{
    my_image_store_1darray_v4f16_i32(p, c.x, c.y, LOAD_TSHARP(i));
}

// 1Db: buffer path; the descriptor is loaded as a uint4 via LOAD_VSHARP and
// the three trailing arguments are literal zeros.
WATTR void
OCKL_MANGLE_T(image_storeh,1Db)(TSHARP i, int c, half4 p)
{
    __llvm_amdgcn_struct_buffer_store_format_v4f16(p, LOAD_VSHARP(i), c, 0, 0, 0);
}

WATTR void
OCKL_MANGLE_T(image_storeh,2D)(TSHARP i, int2 c, half4 p)
{
    my_image_store_2d_v4f16_i32(p, c.x, c.y, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_storeh,2Da)(TSHARP i, int4 c, half4 p)
{
    my_image_store_2darray_v4f16_i32(p, c.x, c.y, c.z, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_storeh,3D)(TSHARP i, int4 c, half4 p)
{
    my_image_store_3d_v4f16_i32(p, c.x, c.y, c.z, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_storeh,CM)(TSHARP i, int2 c, int f, half4 p)
{
    my_image_store_cube_v4f16_i32(p, c.x, c.y, f, LOAD_TSHARP(i));
}

// CMa: c.z and f are merged by LS_ARRAY_FACE before the cube store.
WATTR void
OCKL_MANGLE_T(image_storeh,CMa)(TSHARP i, int4 c, int f, half4 p)
{
    f = LS_ARRAY_FACE(c.z, f);
    my_image_store_cube_v4f16_i32(p, c.x, c.y, f, LOAD_TSHARP(i));
}

// Public image_storeh_lod entry points: int l is passed in the imip position.
WATTR void
OCKL_MANGLE_T(image_storeh_lod,1D)(TSHARP i, int c, int l, half4 p)
{
    my_image_store_mip_1d_v4f16_i32(p, c, l, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_storeh_lod,1Da)(TSHARP i, int2 c, int l, half4 p)
{
    my_image_store_mip_1darray_v4f16_i32(p, c.x, c.y, l, LOAD_TSHARP(i));
}

WATTR void
OCKL_MANGLE_T(image_storeh_lod,2D)(TSHARP i, int2 c, int l, half4 p)
{
    my_image_store_mip_2d_v4f16_i32(p, c.x, c.y, l, LOAD_TSHARP(i));
}

WATTR void OCKL_MANGLE_T(image_storeh_lod,2Da)(TSHARP i, int4 c, int l, half4 p) { my_image_store_mip_2darray_v4f16_i32(p, c.x, c.y, c.z, l,
LOAD_TSHARP(i)); }

// image_storeh_lod, 3D: store half4 pixel p at (c.x, c.y, c.z) with l in the
// imip position.
WATTR void
OCKL_MANGLE_T(image_storeh_lod,3D)(TSHARP i, int4 c, int l, half4 p)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    my_image_store_mip_3d_v4f16_i32(p, c.x, c.y, c.z, l, tsharp);
}

// image_storeh_lod, CM: f is passed in the iface position.
WATTR void
OCKL_MANGLE_T(image_storeh_lod,CM)(TSHARP i, int2 c, int f, int l, half4 p)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    my_image_store_mip_cube_v4f16_i32(p, c.x, c.y, f, l, tsharp);
}

// image_storeh_lod, CMa: c.z and f are merged by LS_ARRAY_FACE first.
WATTR void
OCKL_MANGLE_T(image_storeh_lod,CMa)(TSHARP i, int4 c, int f, int l, half4 p)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const int face = LS_ARRAY_FACE(c.z, f);
    my_image_store_mip_cube_v4f16_i32(p, c.x, c.y, face, l, tsharp);
}

// image_sample entry points. The coordinate is first passed through the
// matching ADJUST_* macro; then the _lz wrapper is used when EII() is true,
// and the plain sample wrapper otherwise. Array variants round the slice
// coordinate with __builtin_rintf.
RATTR float4
OCKL_MANGLE_T(image_sample,1D)(TSHARP i, SSHARP s, float c)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    ADJUST_X(c, i, s);
    return EII() ? my_image_sample_lz_1d_v4f32_f32(c, tsharp, ssharp)
                 : my_image_sample_1d_v4f32_f32(c, tsharp, ssharp);
}

RATTR float4
OCKL_MANGLE_T(image_sample,1Da)(TSHARP i, SSHARP s, float2 c)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    const float slice = __builtin_rintf(c.y);
    ADJUST_X(c.x, i, s);
    return EII() ? my_image_sample_lz_1darray_v4f32_f32(c.x, slice, tsharp, ssharp)
                 : my_image_sample_1darray_v4f32_f32(c.x, slice, tsharp, ssharp);
}

RATTR float4
OCKL_MANGLE_T(image_sample,2D)(TSHARP i, SSHARP s, float2 c)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    ADJUST_XY(c, i, s);
    return EII() ? my_image_sample_lz_2d_v4f32_f32(c.x, c.y, tsharp, ssharp)
                 : my_image_sample_2d_v4f32_f32(c.x, c.y, tsharp, ssharp);
}

RATTR float4
OCKL_MANGLE_T(image_sample,2Da)(TSHARP i, SSHARP s, float4 c)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    const float slice = __builtin_rintf(c.z);
    ADJUST_XY(c, i, s);
    return EII() ? my_image_sample_lz_2darray_v4f32_f32(c.x, c.y, slice, tsharp, ssharp)
                 : my_image_sample_2darray_v4f32_f32(c.x, c.y, slice, tsharp, ssharp);
}

RATTR float OCKL_MANGLE_T(image_sample,2Dad)(TSHARP i, SSHARP s, float4 c) { ADJUST_XY(c, i, s); c.z = __builtin_rintf(c.z); if (EII()) return my_image_sample_lz_2darray_f32_f32(c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); else return my_image_sample_2darray_f32_f32(c.x, c.y, c.z, LOAD_TSHARP(i),
LOAD_SSHARP(s)); }

// image_sample, 2Dd: scalar-float result; _lz wrapper when EII() is true.
RATTR float
OCKL_MANGLE_T(image_sample,2Dd)(TSHARP i, SSHARP s, float2 c)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    ADJUST_XY(c, i, s);
    return EII() ? my_image_sample_lz_2d_f32_f32(c.x, c.y, tsharp, ssharp)
                 : my_image_sample_2d_f32_f32(c.x, c.y, tsharp, ssharp);
}

// image_sample, 3D: all three coordinates go through ADJUST_XYZ.
RATTR float4
OCKL_MANGLE_T(image_sample,3D)(TSHARP i, SSHARP s, float4 c)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    ADJUST_XYZ(c, i, s);
    return EII() ? my_image_sample_lz_3d_v4f32_f32(c.x, c.y, c.z, tsharp, ssharp)
                 : my_image_sample_3d_v4f32_f32(c.x, c.y, c.z, tsharp, ssharp);
}

// image_sample, CM: CUBE_PREP rewrites c.x, c.y and c.z in place before the
// cube sample.
RATTR float4
OCKL_MANGLE_T(image_sample,CM)(TSHARP i, SSHARP s, float4 c)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    CUBE_PREP(c);
    return EII() ? my_image_sample_lz_cube_v4f32_f32(c.x, c.y, c.z, tsharp, ssharp)
                 : my_image_sample_cube_v4f32_f32(c.x, c.y, c.z, tsharp, ssharp);
}

// image_sample, CMa: after CUBE_PREP, c.w and the new c.z are merged by
// SAMPLE_ARRAY_FACE into the face coordinate.
RATTR float4
OCKL_MANGLE_T(image_sample,CMa)(TSHARP i, SSHARP s, float4 c)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    CUBE_PREP(c);
    const float face = SAMPLE_ARRAY_FACE(c.w, c.z);
    return EII() ? my_image_sample_lz_cube_v4f32_f32(c.x, c.y, face, tsharp, ssharp)
                 : my_image_sample_cube_v4f32_f32(c.x, c.y, face, tsharp, ssharp);
}

// image_sample_grad entry points: the coordinate is adjusted as in
// image_sample, then the _d wrapper is called with the dx components
// followed by the dy components, ahead of the coordinates.
RATTR float4
OCKL_MANGLE_T(image_sample_grad,1D)(TSHARP i, SSHARP s, float c, float dx, float dy)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    ADJUST_X(c, i, s);
    return my_image_sample_d_1d_v4f32_f32_f32(dx, dy, c, tsharp, ssharp);
}

RATTR float4
OCKL_MANGLE_T(image_sample_grad,1Da)(TSHARP i, SSHARP s, float2 c, float dx, float dy)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    const float slice = __builtin_rintf(c.y);
    ADJUST_X(c.x, i, s);
    return my_image_sample_d_1darray_v4f32_f32_f32(dx, dy, c.x, slice, tsharp, ssharp);
}

RATTR float4
OCKL_MANGLE_T(image_sample_grad,2D)(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    ADJUST_XY(c, i, s);
    return my_image_sample_d_2d_v4f32_f32_f32(dx.x, dx.y, dy.x, dy.y, c.x, c.y, tsharp, ssharp);
}

RATTR float4 OCKL_MANGLE_T(image_sample_grad,2Da)(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy) { ADJUST_XY(c, i, s); c.z =
__builtin_rintf(c.z); return my_image_sample_d_2darray_v4f32_f32_f32(dx.x, dx.y, dy.x, dy.y, c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); }

// image_sample_grad, 2Dad: scalar-float result; slice rounded with rintf.
RATTR float
OCKL_MANGLE_T(image_sample_grad,2Dad)(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    const float slice = __builtin_rintf(c.z);
    ADJUST_XY(c, i, s);
    return my_image_sample_d_2darray_f32_f32_f32(dx.x, dx.y, dy.x, dy.y, c.x, c.y, slice, tsharp, ssharp);
}

// image_sample_grad, 2Dd: scalar-float result.
RATTR float
OCKL_MANGLE_T(image_sample_grad,2Dd)(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    ADJUST_XY(c, i, s);
    return my_image_sample_d_2d_f32_f32_f32(dx.x, dx.y, dy.x, dy.y, c.x, c.y, tsharp, ssharp);
}

// image_sample_grad, 3D: x, y, z of dx, then x, y, z of dy, then coordinates.
RATTR float4
OCKL_MANGLE_T(image_sample_grad,3D)(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    ADJUST_XYZ(c, i, s);
    return my_image_sample_d_3d_v4f32_f32_f32(dx.x, dx.y, dx.z, dy.x, dy.y, dy.z, c.x, c.y, c.z, tsharp, ssharp);
}

// image_sample_lod entry points: no ADJUST_* step is applied; l is passed in
// the lod position of the _l wrapper. Array variants round the slice
// coordinate with __builtin_rintf.
RATTR float4
OCKL_MANGLE_T(image_sample_lod,1D)(TSHARP i, SSHARP s, float c, float l)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    return my_image_sample_l_1d_v4f32_f32(c, l, tsharp, ssharp);
}

RATTR float4
OCKL_MANGLE_T(image_sample_lod,1Da)(TSHARP i, SSHARP s, float2 c, float l)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    const float slice = __builtin_rintf(c.y);
    return my_image_sample_l_1darray_v4f32_f32(c.x, slice, l, tsharp, ssharp);
}

RATTR float4
OCKL_MANGLE_T(image_sample_lod,2D)(TSHARP i, SSHARP s, float2 c, float l)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    return my_image_sample_l_2d_v4f32_f32(c.x, c.y, l, tsharp, ssharp);
}

RATTR float4
OCKL_MANGLE_T(image_sample_lod,2Da)(TSHARP i, SSHARP s, float4 c, float l)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    const float slice = __builtin_rintf(c.z);
    return my_image_sample_l_2darray_v4f32_f32(c.x, c.y, slice, l, tsharp, ssharp);
}

RATTR float
OCKL_MANGLE_T(image_sample_lod,2Dad)(TSHARP i, SSHARP s, float4 c, float l)
{
    const uint8 tsharp = LOAD_TSHARP(i);
    const uint4 ssharp = LOAD_SSHARP(s);
    const float slice = __builtin_rintf(c.z);
    return my_image_sample_l_2darray_f32_f32(c.x, c.y, slice, l, tsharp, ssharp);
}

RATTR float OCKL_MANGLE_T(image_sample_lod,2Dd)(TSHARP i, SSHARP s, float2 c, float l) { return
my_image_sample_l_2d_f32_f32(c.x, c.y, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR float4 OCKL_MANGLE_T(image_sample_lod,3D)(TSHARP i, SSHARP s, float4 c, float l) { return my_image_sample_l_3d_v4f32_f32(c.x, c.y, c.z, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR float4 OCKL_MANGLE_T(image_sample_lod,CM)(TSHARP i, SSHARP s, float4 c, float l) { CUBE_PREP(c); return my_image_sample_l_cube_v4f32_f32(c.x, c.y, c.z, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR float4 OCKL_MANGLE_T(image_sample_lod,CMa)(TSHARP i, SSHARP s, float4 c, float l) { CUBE_PREP(c); c.z = SAMPLE_ARRAY_FACE(c.w, c.z); return my_image_sample_l_cube_v4f32_f32(c.x, c.y, c.z, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh,1D)(TSHARP i, SSHARP s, float c) { ADJUST_X(c, i, s); if (EII()) return my_image_sample_lz_1d_v4f16_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s)); else return my_image_sample_1d_v4f16_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh,1Da)(TSHARP i, SSHARP s, float2 c) { ADJUST_X(c.x, i, s); c.y = __builtin_rintf(c.y); if (EII()) return my_image_sample_lz_1darray_v4f16_f32(c.x, c.y, LOAD_TSHARP(i), LOAD_SSHARP(s)); else return my_image_sample_1darray_v4f16_f32(c.x, c.y, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh,2D)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); if (EII()) return my_image_sample_lz_2d_v4f16_f32(c.x, c.y, LOAD_TSHARP(i), LOAD_SSHARP(s)); else return my_image_sample_2d_v4f16_f32(c.x, c.y, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh,2Da)(TSHARP i, SSHARP s, float4 c) { ADJUST_XY(c, i, s); c.z = __builtin_rintf(c.z); if (EII()) return my_image_sample_lz_2darray_v4f16_f32(c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); else return my_image_sample_2darray_v4f16_f32(c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh,3D)(TSHARP i, SSHARP s, float4 c) { ADJUST_XYZ(c, i, s); if (EII()) return 
my_image_sample_lz_3d_v4f16_f32(c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); else return my_image_sample_3d_v4f16_f32(c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh,CM)(TSHARP i, SSHARP s, float4 c) { CUBE_PREP(c); if (EII()) return my_image_sample_lz_cube_v4f16_f32(c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); else return my_image_sample_cube_v4f16_f32(c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh,CMa)(TSHARP i, SSHARP s, float4 c) { CUBE_PREP(c); c.z = SAMPLE_ARRAY_FACE(c.w, c.z); if (EII()) return my_image_sample_lz_cube_v4f16_f32(c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); else return my_image_sample_cube_v4f16_f32(c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_grad,1D)(TSHARP i, SSHARP s, float c, float dx, float dy) { ADJUST_X(c, i, s); return my_image_sample_d_1d_v4f16_f32_f32(dx, dy, c, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_grad,1Da)(TSHARP i, SSHARP s, float2 c, float dx, float dy) { ADJUST_X(c.x, i, s); c.y = __builtin_rintf(c.y); return my_image_sample_d_1darray_v4f16_f32_f32(dx, dy, c.x, c.y, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_grad,2D)(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy) { ADJUST_XY(c, i, s); return my_image_sample_d_2d_v4f16_f32_f32(dx.x, dx.y, dy.x, dy.y, c.x, c.y, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_grad,2Da)(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy) { ADJUST_XY(c, i, s); c.z = __builtin_rintf(c.z); return my_image_sample_d_2darray_v4f16_f32_f32(dx.x, dx.y, dy.x, dy.y, c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_grad,3D)(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy) { ADJUST_XYZ(c, i, s); return my_image_sample_d_3d_v4f16_f32_f32(dx.x, dx.y, dx.z, dy.x, dy.y, dy.z, c.x, c.y, c.z, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR 
half4 OCKL_MANGLE_T(image_sampleh_lod,1D)(TSHARP i, SSHARP s, float c, float l) { return my_image_sample_l_1d_v4f16_f32(c, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_lod,1Da)(TSHARP i, SSHARP s, float2 c, float l) { c.y = __builtin_rintf(c.y); return my_image_sample_l_1darray_v4f16_f32(c.x, c.y, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_lod,2D)(TSHARP i, SSHARP s, float2 c, float l) { return my_image_sample_l_2d_v4f16_f32(c.x, c.y, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_lod,2Da)(TSHARP i, SSHARP s, float4 c, float l) { c.z = __builtin_rintf(c.z); return my_image_sample_l_2darray_v4f16_f32(c.x, c.y, c.z, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_lod,3D)(TSHARP i, SSHARP s, float4 c, float l) { return my_image_sample_l_3d_v4f16_f32(c.x, c.y, c.z, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_lod,CM)(TSHARP i, SSHARP s, float4 c, float l) { CUBE_PREP(c); return my_image_sample_l_cube_v4f16_f32(c.x, c.y, c.z, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR half4 OCKL_MANGLE_T(image_sampleh_lod,CMa)(TSHARP i, SSHARP s, float4 c, float l) { CUBE_PREP(c); c.z = SAMPLE_ARRAY_FACE(c.w, c.z); return my_image_sample_l_cube_v4f16_f32(c.x, c.y, c.z, l, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR float4 OCKL_MANGLE_T(image_gather4r,2D)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); return my_image_gather4_lz_2d_v4f32_f32_r(c.x, c.y, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR float4 OCKL_MANGLE_T(image_gather4g,2D)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); return my_image_gather4_lz_2d_v4f32_f32_g(c.x, c.y, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR float4 OCKL_MANGLE_T(image_gather4b,2D)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); return my_image_gather4_lz_2d_v4f32_f32_b(c.x, c.y, LOAD_TSHARP(i), LOAD_SSHARP(s)); } RATTR float4 OCKL_MANGLE_T(image_gather4a,2D)(TSHARP i, SSHARP s, float2 c) { 
ADJUST_XY(c, i, s);
    return my_image_gather4_lz_2d_v4f32_f32_a(c.x, c.y, LOAD_TSHARP(i), LOAD_SSHARP(s));
}

// We rely on the fact that the runtime allocates 12 words for the T# or V#
// and fills words 8, 9, and 10 with the data we need to answer all of the queries

// ARRAY_SIZE expands to a complete if/else whose branches both return, so it
// is used as the entire body of the image_array_size functions below. The
// 13-bit field is read at bit offset 173 when __oclc_ISA_version < 9000 and
// at bit offset 128 otherwise; the result is the field value plus one.
// NOTE(review): FIELD(desc, offset, width) is defined earlier in the file --
// the offset/width reading of its arguments is inferred from the comment on
// image_width below; confirm against the macro.
#define ARRAY_SIZE(I) \
    if (__oclc_ISA_version < 9000) { \
        return FIELD(I, 173, 13) + 1U; \
    } else { \
        return FIELD(I, 128, 13) + 1U; \
    }

GATTR int OCKL_MANGLE_T(image_array_size,1Da)(TSHARP i)  { ARRAY_SIZE(i) }
GATTR int OCKL_MANGLE_T(image_array_size,2Da)(TSHARP i)  { ARRAY_SIZE(i) }
GATTR int OCKL_MANGLE_T(image_array_size,2Dad)(TSHARP i) { ARRAY_SIZE(i) }
GATTR int OCKL_MANGLE_T(image_array_size,CMa)(TSHARP i)  { ARRAY_SIZE(i) }

// Channel data type: returned verbatim from word 8 for every image kind.
GATTR int OCKL_MANGLE_T(image_channel_data_type,1D)(TSHARP i)   { return WORD(i, 8); }
GATTR int OCKL_MANGLE_T(image_channel_data_type,1Da)(TSHARP i)  { return WORD(i, 8); }
GATTR int OCKL_MANGLE_T(image_channel_data_type,1Db)(TSHARP i)  { return WORD(i, 8); }
GATTR int OCKL_MANGLE_T(image_channel_data_type,2D)(TSHARP i)   { return WORD(i, 8); }
GATTR int OCKL_MANGLE_T(image_channel_data_type,2Da)(TSHARP i)  { return WORD(i, 8); }
GATTR int OCKL_MANGLE_T(image_channel_data_type,2Dad)(TSHARP i) { return WORD(i, 8); }
GATTR int OCKL_MANGLE_T(image_channel_data_type,2Dd)(TSHARP i)  { return WORD(i, 8); }
GATTR int OCKL_MANGLE_T(image_channel_data_type,3D)(TSHARP i)   { return WORD(i, 8); }
GATTR int OCKL_MANGLE_T(image_channel_data_type,CM)(TSHARP i)   { return WORD(i, 8); }
GATTR int OCKL_MANGLE_T(image_channel_data_type,CMa)(TSHARP i)  { return WORD(i, 8); }

// Channel order: returned verbatim from word 9 for every image kind.
GATTR int OCKL_MANGLE_T(image_channel_order,1D)(TSHARP i)   { return WORD(i, 9); }
GATTR int OCKL_MANGLE_T(image_channel_order,1Da)(TSHARP i)  { return WORD(i, 9); }
GATTR int OCKL_MANGLE_T(image_channel_order,1Db)(TSHARP i)  { return WORD(i, 9); }
GATTR int OCKL_MANGLE_T(image_channel_order,2D)(TSHARP i)   { return WORD(i, 9); }
GATTR int OCKL_MANGLE_T(image_channel_order,2Da)(TSHARP i)  { return WORD(i, 9); }
GATTR int OCKL_MANGLE_T(image_channel_order,2Dad)(TSHARP i) { return WORD(i, 9); }
GATTR int OCKL_MANGLE_T(image_channel_order,2Dd)(TSHARP i)  { return WORD(i, 9); }
GATTR int OCKL_MANGLE_T(image_channel_order,3D)(TSHARP i)   { return WORD(i, 9); }
GATTR int OCKL_MANGLE_T(image_channel_order,CM)(TSHARP i)   { return WORD(i, 9); }
GATTR int OCKL_MANGLE_T(image_channel_order,CMa)(TSHARP i)  { return WORD(i, 9); }

// Depth: FIELD(i, 128, 13) plus one. Unlike ARRAY_SIZE there is no
// ISA-version check here.
GATTR int OCKL_MANGLE_T(image_depth,3D)(TSHARP i) { return FIELD(i, 128, 13) + 1U; }

// Height: FIELD(i, 78, 14) plus one for every image kind that has a height query.
GATTR int OCKL_MANGLE_T(image_height,2D)(TSHARP i)   { return FIELD(i, 78, 14) + 1U; }
GATTR int OCKL_MANGLE_T(image_height,2Da)(TSHARP i)  { return FIELD(i, 78, 14) + 1U; }
GATTR int OCKL_MANGLE_T(image_height,2Dad)(TSHARP i) { return FIELD(i, 78, 14) + 1U; }
GATTR int OCKL_MANGLE_T(image_height,2Dd)(TSHARP i)  { return FIELD(i, 78, 14) + 1U; }
GATTR int OCKL_MANGLE_T(image_height,3D)(TSHARP i)   { return FIELD(i, 78, 14) + 1U; }
GATTR int OCKL_MANGLE_T(image_height,CM)(TSHARP i)   { return FIELD(i, 78, 14) + 1U; }
GATTR int OCKL_MANGLE_T(image_height,CMa)(TSHARP i)  { return FIELD(i, 78, 14) + 1U; }

// Mip level count: FIELD(i, 112, 4) returned as-is (no "+ 1" adjustment).
GATTR int OCKL_MANGLE_T(image_num_mip_levels,1D)(TSHARP i)   { return FIELD(i, 112, 4); }
GATTR int OCKL_MANGLE_T(image_num_mip_levels,1Da)(TSHARP i)  { return FIELD(i, 112, 4); }
GATTR int OCKL_MANGLE_T(image_num_mip_levels,2D)(TSHARP i)   { return FIELD(i, 112, 4); }
GATTR int OCKL_MANGLE_T(image_num_mip_levels,2Da)(TSHARP i)  { return FIELD(i, 112, 4); }
GATTR int OCKL_MANGLE_T(image_num_mip_levels,2Dad)(TSHARP i) { return FIELD(i, 112, 4); }
GATTR int OCKL_MANGLE_T(image_num_mip_levels,2Dd)(TSHARP i)  { return FIELD(i, 112, 4); }
GATTR int OCKL_MANGLE_T(image_num_mip_levels,3D)(TSHARP i)   { return FIELD(i, 112, 4); }
GATTR int OCKL_MANGLE_T(image_num_mip_levels,CM)(TSHARP i)   { return FIELD(i, 112, 4); }
GATTR int OCKL_MANGLE_T(image_num_mip_levels,CMa)(TSHARP i)  { return FIELD(i, 112, 4); }

// In FIELD(i, 64, 14) but also copied into word 10 of the 12 that are
// allocated, which is what is read here.
// NOTE(review): this comment used to say "word 11"; the code reads
// WORD(i, 10), matching the "words 8, 9, and 10" note above -- confirm the
// numbering against the runtime.
GATTR int OCKL_MANGLE_T(image_width,1D)(TSHARP i)   { return WORD(i, 10); }
GATTR int OCKL_MANGLE_T(image_width,1Da)(TSHARP i)  { return WORD(i, 10); }
GATTR int OCKL_MANGLE_T(image_width,2D)(TSHARP i)   { return WORD(i, 10); }
GATTR int OCKL_MANGLE_T(image_width,2Da)(TSHARP i)  { return WORD(i, 10); }
GATTR int OCKL_MANGLE_T(image_width,2Dad)(TSHARP i) { return WORD(i, 10); }
GATTR int OCKL_MANGLE_T(image_width,2Dd)(TSHARP i)  { return WORD(i, 10); }
GATTR int OCKL_MANGLE_T(image_width,3D)(TSHARP i)   { return WORD(i, 10); }
GATTR int OCKL_MANGLE_T(image_width,CM)(TSHARP i)   { return WORD(i, 10); }
GATTR int OCKL_MANGLE_T(image_width,CMa)(TSHARP i)  { return WORD(i, 10); }

// This would be a bit trickier since we actually have a V# here and need to look at const_num_records and const_stride
GATTR int OCKL_MANGLE_T(image_width,1Db)(TSHARP i)  { return WORD(i, 10); }

ROCm-Device-Libs-rocm-5.0.0/ockl/src/lane.cl000066400000000000000000000011461415221260100203100ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "oclc.h"
#include "ockl.h"

// Returns the result of the mbcnt builtins applied to an all-ones mask.
// When __oclc_wavefrontsize64 is set, mbcnt_lo is chained into mbcnt_hi;
// otherwise only mbcnt_lo is used.
// NOTE(review): per the function name this is the caller's lane index within
// its wavefront -- confirm against the AMDGPU builtin documentation.
__attribute__((always_inline)) uint
OCKL_MANGLE_U32(lane)(void)
{
    if (__oclc_wavefrontsize64) {
        return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
    } else {
        return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
    }
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/media.cl000066400000000000000000000107211415221260100204470ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "irif.h"
#include "ockl.h"
#include "oclc.h"

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define CATTR __attribute__((const))

#define AS_UCHAR4(X) __builtin_astype(X, uchar4)

/* Build a mask of w one-bits and shift it left by s. */
CATTR uint
OCKL_MANGLE_U32(bfm)(uint w, uint s)
{
    // TODO check that this results in v_bfm_b32
    const uint ones = (1U << w) - 1U;
    return ones << s;
}

/* Signed bit-field extract; forwards to the sbfe builtin. */
CATTR int
OCKL_MANGLE_I32(bfe)(int a, uint s, uint w)
{
    return __builtin_amdgcn_sbfe(a, s, w);
}

/* Unsigned bit-field extract; forwards to the ubfe builtin. */
CATTR uint
OCKL_MANGLE_U32(bfe)(uint a, uint s, uint w)
{
    return __builtin_amdgcn_ubfe(a, s, w);
}

/* Forwards to the alignbit builtin. */
CATTR uint
OCKL_MANGLE_U32(bitalign)(uint a, uint b, uint c)
{
    return __builtin_amdgcn_alignbit(a, b, c);
}

/* Forwards to the alignbyte builtin. */
CATTR uint
OCKL_MANGLE_U32(bytealign)(uint a, uint b, uint c)
{
    return __builtin_amdgcn_alignbyte(a, b, c);
}

/* Forwards to the lerp builtin. */
CATTR uint
OCKL_MANGLE_U32(lerp)(uint a, uint b, uint c)
{
    return __builtin_amdgcn_lerp(a, b, c);
}

/* Largest of three floats: fmaxf(fmaxf(a, b), c). */
CATTR float
OCKL_MANGLE_F32(max3)(float a, float b, float c)
{
    const float ab = __builtin_fmaxf(a, b);
    return __builtin_fmaxf(ab, c);
}

/* Median of three floats; forwards to the fmed3f builtin. */
CATTR float
OCKL_MANGLE_F32(median3)(float a, float b, float c)
{
    return __builtin_amdgcn_fmed3f(a, b, c);
}

/* Smallest of three floats: fminf(fminf(a, b), c). */
CATTR float
OCKL_MANGLE_F32(min3)(float a, float b, float c)
{
    const float ab = __builtin_fminf(a, b);
    return __builtin_fminf(ab, c);
}

/* Largest of three halves: fmaxf16(fmaxf16(a, b), c). */
CATTR half
OCKL_MANGLE_F16(max3)(half a, half b, half c)
{
    const half ab = __builtin_fmaxf16(a, b);
    return __builtin_fmaxf16(ab, c);
}

/* Half median through the fmed3h builtin; only called when the ISA check
 * in the public entry point below passes. */
REQUIRES_GFX9_INSTS
static inline half
median3_f16_gfx9_impl(half a, half b, half c)
{
    return __builtin_amdgcn_fmed3h(a, b, c);
}

/* Median of three halves. Uses the fmed3h helper when
 * __oclc_ISA_version >= 9000, otherwise min(max(a, b), max(min(a, b), c)). */
CATTR half
OCKL_MANGLE_F16(median3)(half a, half b, half c)
{
    if (__oclc_ISA_version >= 9000) {
        return median3_f16_gfx9_impl(a, b, c);
    }

    const half lo = __builtin_fminf16(a, b);
    const half hi = __builtin_fmaxf16(a, b);
    return __builtin_fminf16(hi, __builtin_fmaxf16(lo, c));
}

/* Smallest of three halves: fminf16(fminf16(a, b), c). */
CATTR half
OCKL_MANGLE_F16(min3)(half a, half b, half c)
{
    const half ab = __builtin_fminf16(a, b);
    return __builtin_fminf16(ab, c);
}

/* Largest of three signed integers. */
CATTR int
OCKL_MANGLE_I32(max3)(int a, int b, int c)
{
    int best = b;
    if (a > b) {
        best = a;
    }
    if (c >= best) {
        best = c;
    }
    return best;
}

/* Median of three signed integers: min(max(a, b), max(min(a, b), c)). */
CATTR int
OCKL_MANGLE_I32(median3)(int a, int b, int c)
{
    const int lo = a < b ? a : b;
    const int hi = a > b ? a : b;
    const int floor_c = lo > c ? lo : c;
    return hi < floor_c ? hi : floor_c;
}

/* Smallest of three signed integers. */
CATTR int
OCKL_MANGLE_I32(min3)(int a, int b, int c)
{
    int best = b;
    if (a < b) {
        best = a;
    }
    if (c <= best) {
        best = c;
    }
    return best;
}

/* Largest of three unsigned integers. */
CATTR uint
OCKL_MANGLE_U32(max3)(uint a, uint b, uint c)
{
    uint best = b;
    if (a > b) {
        best = a;
    }
    if (c >= best) {
        best = c;
    }
    return best;
}

/* Median of three unsigned integers: min(max(a, b), max(min(a, b), c)). */
CATTR uint
OCKL_MANGLE_U32(median3)(uint a, uint b, uint c)
{
    const uint lo = a < b ? a : b;
    const uint hi = a > b ? a : b;
    const uint floor_c = lo > c ? lo : c;
    return hi < floor_c ? hi : floor_c;
}

/* Smallest of three unsigned integers. */
CATTR uint
OCKL_MANGLE_U32(min3)(uint a, uint b, uint c)
{
    uint best = b;
    if (a < b) {
        best = a;
    }
    if (c <= best) {
        best = c;
    }
    return best;
}

/* Forwards to the msad_u8 builtin. */
CATTR uint
OCKL_MANGLE_U32(msad)(uint a, uint b, uint c)
{
    return __builtin_amdgcn_msad_u8(a, b, c);
}

/* Forwards to the mqsad_pk_u16_u8 builtin. */
CATTR ulong
OCKL_MANGLE_U64(mqsad)(ulong a, uint b, ulong c)
{
    return __builtin_amdgcn_mqsad_pk_u16_u8(a, b, c);
}

/* Pack the four components of a into one uint by chaining cvt_pk_u8_f32,
 * component s0 into slot 0 first and s3 into slot 3 last. */
CATTR uint
OCKL_MANGLE_U32(pack)(float4 a)
{
    uint packed = __builtin_amdgcn_cvt_pk_u8_f32(a.s0, 0, 0);
    packed = __builtin_amdgcn_cvt_pk_u8_f32(a.s1, 1, packed);
    packed = __builtin_amdgcn_cvt_pk_u8_f32(a.s2, 2, packed);
    return __builtin_amdgcn_cvt_pk_u8_f32(a.s3, 3, packed);
}

/* Forwards to the qsad_pk_u16_u8 builtin. */
CATTR ulong
OCKL_MANGLE_U64(qsad)(ulong a, uint b, ulong c)
{
    return __builtin_amdgcn_qsad_pk_u16_u8(a, b, c);
}

/* Forwards to the sad_u8 builtin. */
CATTR uint
OCKL_MANGLE_U32(sad)(uint a, uint b, uint c)
{
    return __builtin_amdgcn_sad_u8(a, b, c);
}

/* |a - b| + c on unsigned 32-bit values. */
CATTR uint
OCKL_MANGLE_U32(sadd)(uint a, uint b, uint c)
{
    // TODO check that this results in v_sad_u32
    const uint larger = a > b ? a : b;
    const uint smaller = a < b ? a : b;
    return larger - smaller + c;
}

/* Forwards to the sad_hi_u8 builtin. */
CATTR uint
OCKL_MANGLE_U32(sadhi)(uint a, uint b, uint c)
{
    return __builtin_amdgcn_sad_hi_u8(a, b, c);
}

/* Forwards to the sad_u16 builtin. */
CATTR uint
OCKL_MANGLE_U32(sadw)(uint a, uint b, uint c)
{
    return __builtin_amdgcn_sad_u16(a, b, c);
}

/* unpack0..unpack3: reinterpret a as uchar4 and convert component
 * s0..s3 respectively to float. */
CATTR float
OCKL_MANGLE_F32(unpack0)(uint a)
{
    const uchar4 bytes = AS_UCHAR4(a);
    return (float)bytes.s0;
}

CATTR float
OCKL_MANGLE_F32(unpack1)(uint a)
{
    const uchar4 bytes = AS_UCHAR4(a);
    return (float)bytes.s1;
}

CATTR float
OCKL_MANGLE_F32(unpack2)(uint a)
{
    const uchar4 bytes = AS_UCHAR4(a);
    return (float)bytes.s2;
}

CATTR float
OCKL_MANGLE_F32(unpack3)(uint a)
{
    const uchar4 bytes = AS_UCHAR4(a);
    return (float)bytes.s3;
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/mtime.cl000066400000000000000000000011251415221260100205010ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "ockl.h"

/* Returns the value of the s_memtime builtin. */
__attribute__((target("s-memtime-inst"))) ulong
OCKL_MANGLE_U64(memtime)(void)
{
    const ulong ticks = __builtin_amdgcn_s_memtime();
    return ticks;
}

/* Returns the value of the s_memrealtime builtin. */
__attribute__((target("s-memrealtime"))) ulong
OCKL_MANGLE_U64(memrealtime)(void)
{
    const ulong ticks = __builtin_amdgcn_s_memrealtime();
    return ticks;
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/mul24.cl000066400000000000000000000010751415221260100203350ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "ockl.h"

/* Multiply the low 24 bits of x and y, each sign-extended by shifting up
 * and back down by 8. */
__attribute__((const)) int
OCKL_MANGLE_I32(mul24)(int x, int y)
{
    const int x24 = (x << 8) >> 8;
    const int y24 = (y << 8) >> 8;
    return x24 * y24;
}

/* Multiply the low 24 bits of x and y, each zero-extended by shifting up
 * and back down by 8. */
__attribute__((const)) uint
OCKL_MANGLE_U32(mul24)(uint x, uint y)
{
    const uint x24 = (x << 8) >> 8;
    const uint y24 = (y << 8) >> 8;
    return x24 * y24;
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/mul_hi.cl000066400000000000000000000024251415221260100206470ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "ockl.h"

/* High 32 bits of the signed 64-bit product of x and y. */
__attribute__((const)) int
OCKL_MANGLE_I32(mul_hi)(int x, int y)
{
    const long product = (long)x * (long)y;
    return (int)(product >> 32);
}

/* High 32 bits of the unsigned 64-bit product of x and y. */
__attribute__((const)) uint
OCKL_MANGLE_U32(mul_hi)(uint x, uint y)
{
    const ulong product = (ulong)x * (ulong)y;
    return (uint)(product >> 32);
}

/* High 64 bits of the signed product of x and y, built from 32-bit halves:
 * the low halves are treated as unsigned, the high halves keep their sign. */
__attribute__((const)) long
OCKL_MANGLE_I64(mul_hi)(long x, long y)
{
    const ulong x_lo = (ulong)x & 0xffffffffUL;
    const long x_hi = x >> 32;
    const ulong y_lo = (ulong)y & 0xffffffffUL;
    const long y_hi = y >> 32;

    const ulong lo_lo = x_lo*y_lo;
    const long cross = x_hi*y_lo + (lo_lo >> 32);
    long mid = cross & 0xffffffffL;
    const long carry = cross >> 32;
    mid = x_lo*y_hi + mid;
    return x_hi*y_hi + carry + (mid >> 32);
}

/* High 64 bits of the unsigned product of x and y, built from 32-bit halves. */
__attribute__((const)) ulong
OCKL_MANGLE_U64(mul_hi)(ulong x, ulong y)
{
    const ulong x_lo = x & 0xffffffffUL;
    const ulong x_hi = x >> 32;
    const ulong y_lo = y & 0xffffffffUL;
    const ulong y_hi = y >> 32;

    const ulong lo_lo = x_lo*y_lo;
    const ulong cross = x_hi*y_lo + (lo_lo >> 32);
    ulong mid = cross & 0xffffffffUL;
    const ulong carry = cross >> 32;
    mid = x_lo*y_hi + mid;
    return x_hi*y_hi + carry + (mid >> 32);
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/popcount.cl000066400000000000000000000011361415221260100212370ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "irif.h"
#include "ockl.h"

/* Number of set bits in a 32-bit value. */
__attribute__((always_inline, const)) uint
OCKL_MANGLE_U32(popcount)(uint i)
{
    return (uint)__builtin_popcount(i);
}

/* Number of set bits in a 64-bit value. */
__attribute__((always_inline, const)) ulong
OCKL_MANGLE_U64(popcount)(ulong i)
{
    return (ulong)__builtin_popcountl(i);
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/readuplane.cl000066400000000000000000000030521415221260100215070ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "ockl.h"
#include "oclc.h"

#define ATTR __attribute__((convergent))

// Compute the lane to read from for a given offset.
// Returns lane_id + offset when that lane lies inside the wavefront, and the
// caller's own lane otherwise. The wavefront width follows
// __oclc_wavefrontsize64 (64 when set, 32 otherwise) instead of assuming 64,
// so the bounds check is also correct for 32-wide wavefronts.
static int
readuplane_source_lane(int offset)
{
    const uint lane_id = __ockl_lane_u32();
    const uint wave_size = __oclc_wavefrontsize64 ? 64u : 32u;

    // The unsigned comparison rejects both "past the last lane" and
    // "before lane 0" (a negative sum wraps to a large unsigned value).
    if ((uint)((lane_id & (wave_size - 1)) + offset) >= wave_size) {
        return (int)lane_id;
    }
    return (int)(lane_id + offset);
}

// Function to exchange data between different lanes
// var: value to return if the index is outside the bounds of the wave
// offset: To be added to the lane id to obtain final index
// return an int value corresponding to the lane
ATTR int
__ockl_readuplane_i32(int var, int offset)
{
    const int index = readuplane_source_lane(offset);
    // ds_bpermute takes a byte address, hence the shift by 2.
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}

// Function to exchange data between different lanes
// var: value to return if the index is outside the bounds of the wave
// offset: To be added to the lane id to obtain final index
// return a long value corresponding to the lane
ATTR long
__ockl_readuplane_i64(long var, int offset)
{
    const int index = readuplane_source_lane(offset);

    // The 64-bit value is exchanged as two independent 32-bit halves.
    int2 var_64 = __builtin_astype(var, int2);
    var_64.x = __builtin_amdgcn_ds_bpermute(index << 2, var_64.x);
    var_64.y = __builtin_amdgcn_ds_bpermute(index << 2, var_64.y);
    return __builtin_astype(var_64, long);
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/services.cl000066400000000000000000000351721415221260100212220ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "ockl.h"

#define WEAK_ATTR __attribute__((weak))

// This must match the enumeration defined by the runtime in
// ROCclr/device/devhcmessages.hpp

typedef enum {
    SERVICE_RESERVED = 0,
    SERVICE_FUNCTION_CALL = 1,
    SERVICE_PRINTF = 2,
    SERVICE_FPRINTF = SERVICE_PRINTF,
    SERVICE_DEVMEM = 3,
    SERVICE_SANITIZER = 4
} service_id_t;

extern long2 __ockl_hostcall_preview(uint service_id, ulong arg0, ulong arg1,
                                     ulong arg2, ulong arg3, ulong arg4,
                                     ulong arg5, ulong arg6, ulong arg7);

/*===---  FUNCTION CALL  -----------------------------------------------------*/

/* Issue a SERVICE_FUNCTION_CALL hostcall: fptr is sent as the first payload
 * word, followed by the seven arguments. Returns the hostcall's two results. */
long2
__ockl_call_host_function(ulong fptr, ulong arg0, ulong arg1, ulong arg2,
                          ulong arg3, ulong arg4, ulong arg5, ulong arg6)
{
    return __ockl_hostcall_preview(SERVICE_FUNCTION_CALL, fptr, arg0, arg1,
                                   arg2, arg3, arg4, arg5, arg6);
}

/*===---  MESSAGES  ----------------------------------------------------------*/

/** \brief Concatenating hostcalls into a message
 *
 *  A message is a stream of 64-bit integers transmitted as a series
 *  of hostcall invocations by the device code. Although the hostcall
 *  is "warp-wide", the message for each workitem is distinct.
 *
 *  Of the eight uint64_t arguments in hostcall, the first argument is
 *  used as the message descriptor, while the rest are used for
 *  message contents.
The descriptor consists of the following fields: * * - Bit 0 is the BEGIN flag. * - Bit 1 is the END flag. * - Bits 2-4 are reserved and must be zero. * - Bits 5-7 indicate the number of elements being transmitted. * - Bits 8-63 contain a 56-bit message ID. * * A hostcall with the BEGIN flag set in the descriptor indicates the * start of a new message. A hostcall with the END flag set indicates * the end of a message. A single hostcall can have both flags set if * the message fits in the payload of a single hostcall. Each * hostcall indicates the number of uint64_t elements in the payload * that contain data to be appended to the message. * * When the accumulator receives a hostcall with the BEGIN flag set, * it allocates a new message ID, which is transmitted to the device * via the first return value in the hostcall. Every subsequent * hostcall containing the same message ID appends its payload to * that message. The message is said to be "active" until a * corresponding END hostcall is received. * * When the accumulator receives a hostcall with the END flag set, it * invokes the corresponding message handler on the contents of the * accumulated message, and then discards the message. The handler * may return up to two uint64_t values, that are transmitted to the * device via the return value of the last hostcall. * * Behaviour is undefined in each of the following cases: * - An END packet is received with a non-existent message ID, or with * the ID of a message that has previously been END'ed. * - No END packet is received for an active message. * - Any of the reserved bits are non-zero. * - Different hostcalls indicate the same active message ID but a * different service. */ /** Enums that describe the message descriptor fields. 
*/
typedef enum {
    DESCRIPTOR_OFFSET_FLAG_BEGIN = 0,
    DESCRIPTOR_OFFSET_FLAG_END = 1,
    DESCRIPTOR_OFFSET_RESERVED0 = 2,
    DESCRIPTOR_OFFSET_LEN = 5,
    DESCRIPTOR_OFFSET_ID = 8
} descriptor_offset_t;

typedef enum {
    DESCRIPTOR_WIDTH_FLAG_BEGIN = 1,
    DESCRIPTOR_WIDTH_FLAG_END = 1,
    DESCRIPTOR_WIDTH_RESERVED0 = 3,
    DESCRIPTOR_WIDTH_LEN = 3,
    DESCRIPTOR_WIDTH_ID = 56
} descriptor_width_t;

/* Replace the LEN field of descriptor pd with len. */
static ulong
msg_set_len(ulong pd, uint len)
{
    const ulong len_field = ((1UL << DESCRIPTOR_WIDTH_LEN) - 1)
                            << DESCRIPTOR_OFFSET_LEN;
    const ulong cleared = pd & ~len_field;
    return cleared | ((ulong)len << DESCRIPTOR_OFFSET_LEN);
}

/* Return pd with the BEGIN flag set. */
static ulong
msg_set_begin_flag(ulong pd)
{
    const ulong begin_bit = 1UL << DESCRIPTOR_OFFSET_FLAG_BEGIN;
    return pd | begin_bit;
}

/* Return pd with the BEGIN flag cleared. */
static ulong
msg_reset_begin_flag(ulong pd)
{
    const ulong begin_bit = 1UL << DESCRIPTOR_OFFSET_FLAG_BEGIN;
    return pd & (~begin_bit);
}

/* Return the END flag of pd, still in its descriptor bit position. */
static ulong
msg_get_end_flag(ulong pd)
{
    const ulong end_bit = 1UL << DESCRIPTOR_OFFSET_FLAG_END;
    return pd & end_bit;
}

/* Return pd with the END flag cleared. */
static ulong
msg_reset_end_flag(ulong pd)
{
    const ulong end_bit = 1UL << DESCRIPTOR_OFFSET_FLAG_END;
    return pd & (~end_bit);
}

/* Return pd with the END flag set. */
static ulong
msg_set_end_flag(ulong pd)
{
    const ulong end_bit = 1UL << DESCRIPTOR_OFFSET_FLAG_END;
    return pd | end_bit;
}

/* Send one hostcall carrying up to seven payload words taken from data.
 * The descriptor's LEN field is set to the number of 8-byte words needed
 * for len bytes. Bytes are packed little-endian, eight per word; a trailing
 * partial word and any unused words are zero. */
static long2
append_bytes(uint service_id, ulong msg_desc, const uchar *data, uint len)
{
    msg_desc = msg_set_len(msg_desc, (len + 7) / 8);

    ulong payload[7];
    for (uint word = 0; word != 7; ++word) {
        // Consume a full word while at least eight bytes remain, otherwise
        // whatever is left (possibly nothing).
        const uint take = len >= 8 ? 8 : len;
        ulong packed = 0;
        for (uint ii = 0; ii != take; ++ii) {
            packed |= (ulong)data[ii] << (ii * 8);
        }
        payload[word] = packed;
        data += take;
        len -= take;
    }

    return __ockl_hostcall_preview(service_id, msg_desc, payload[0],
                                   payload[1], payload[2], payload[3],
                                   payload[4], payload[5], payload[6]);
}

/** \brief Append an array of bytes to a message.
 *  \param service_id Identifier for the target host-side service.
 *  \param msg_desc   Message descriptor for a new or existing message.
*  \param data       Pointer to an array of bytes.
 *  \param len        Length of the array.
 *  \return Values depend on the state of the message.
 *
 *  The function can transmit a byte array of arbitrary length, but
 *  during transmission, the array is padded with zeroes until the
 *  length is a multiple of eight bytes. Only the array contents are
 *  transmitted, and not the length.
 *
 *  If the END flag is set, the function returns two long values
 *  received from the host message handler. Otherwise, the first
 *  return value is the message descriptor to be used for a subsequent
 *  message call, while the second return value is not defined.
 */
static long2
message_append_bytes(uint service_id, ulong msg_desc, const uchar *data,
                     ulong len)
{
    // The caller's END flag is held back and only re-applied to the hostcall
    // that carries the final chunk.
    const ulong end_flag = msg_get_end_flag(msg_desc);
    ulong desc = msg_reset_end_flag(msg_desc);
    long2 reply;

    for (;;) {
        // A single hostcall carries at most 56 bytes (seven 8-byte words).
        const bool is_final = len <= 56;
        const uint chunk = is_final ? (uint)len : 56u;
        if (is_final) {
            desc |= end_flag;
        }

        reply = append_bytes(service_id, desc, data, chunk);
        // The first return value is the descriptor for the next hostcall.
        desc = (ulong)reply.x;

        data += chunk;
        len -= chunk;
        if (len == 0) {
            break;
        }
    }

    return reply;
}

/** \brief Append up to seven ulong values to a message.
 *  \param service_id Identifier for the target host-side service.
 *  \param msg_desc   Message descriptor for a new or existing message.
 *  \param num_args   Number of arguments to be appended (maximum seven).
 *  \param arg[0..6]  Arguments to be appended.
 *  \return Values depend on the state of the message.
 *
 *  Only the first #num_args arguments are appended to the
 *  message. The remaining arguments are ignored. Behaviour is
 *  undefined if #num_args is greater than seven.
 *
 *  If the END flag is set, the function returns two uint64_t values
 *  received from the host message handler. Otherwise, the first
 *  return value is the message descriptor to be used for a subsequent
 *  message call, while the second return value is not defined.
*/
static long2
message_append_args(uint service_id, ulong msg_desc, uint num_args,
                    ulong arg0, ulong arg1, ulong arg2, ulong arg3,
                    ulong arg4, ulong arg5, ulong arg6)
{
    // Record how many of the seven payload words are meaningful.
    const ulong desc = msg_set_len(msg_desc, num_args);
    return __ockl_hostcall_preview(service_id, desc, arg0, arg1, arg2,
                                   arg3, arg4, arg5, arg6);
}

/*===---  FPRINTF  -----------------------------------------------------------*/

typedef enum {
    FPRINTF_CTRL_STDOUT = 0,
    FPRINTF_CTRL_STDERR = 1
} fprintf_ctrl_t;

/* Start a new fprintf message whose single payload word is the control
 * qword, and return the descriptor handed back by the host. */
static inline ulong
begin_fprintf(fprintf_ctrl_t flags)
{
    // The two standard output streams stderr and stdout are indicated
    // using the lowest bits in the control qword. For now, all other
    // bits are required to be zero.
    const long2 reply = message_append_args(SERVICE_FPRINTF,
                                            msg_set_begin_flag(0),
                                            /* num_args = */ 1,
                                            (ulong)flags, 0, 0, 0, 0, 0, 0);
    return reply.x;
}

/** \brief Begin a new fprintf message for stdout.
 *  \return Message descriptor for a new printf invocation.
 */
ulong
__ockl_fprintf_stdout_begin()
{
    return begin_fprintf(FPRINTF_CTRL_STDOUT);
}

/** \brief Begin a new fprintf message for stderr.
 *  \return Message descriptor for a new printf invocation.
 */
ulong
__ockl_fprintf_stderr_begin()
{
    return begin_fprintf(FPRINTF_CTRL_STDERR);
}

/** \brief Append up to seven arguments to the fprintf message.
 *  \param msg_desc  Message descriptor for the current fprintf.
 *  \param num_args  Number of arguments to be appended (maximum seven).
 *  \param value0... The argument values to be appended.
 *  \param is_last   If non-zero, this causes the fprintf to be completed.
 *  \return Value depends on #is_last.
 *
 *  Only the first #num_args arguments are appended to the
 *  message. The remaining arguments are ignored. Behaviour is
 *  undefined if #num_args is greater than seven.
 *
 *  If #is_last is zero, the function returns a message descriptor that
 *  must be used by a subsequent call to any __ockl_fprintf*
 *  function.
If #is_last is non-zero, the function causes the current
 *  fprintf to be completed on the host-side, and returns the value
 *  returned by that fprintf.
 */
ulong
__ockl_fprintf_append_args(ulong msg_desc, uint num_args, ulong value0,
                           ulong value1, ulong value2, ulong value3,
                           ulong value4, ulong value5, ulong value6,
                           uint is_last)
{
    // Mark the message as complete only when the caller says this is the
    // final append.
    const ulong desc = is_last ? msg_set_end_flag(msg_desc) : msg_desc;

    const long2 reply = message_append_args(SERVICE_FPRINTF, desc, num_args,
                                            value0, value1, value2, value3,
                                            value4, value5, value6);
    return reply.x;
}

/** \brief Append a null-terminated string to the fprintf message.
 *  \param msg_desc Message descriptor for the current fprintf.
 *  \param data     Pointer to the string.
 *  \param length   Number of bytes, including the null terminator.
 *  \param is_last  If non-zero, this causes the fprintf to be completed.
 *  \return Value depends on #is_last.
 *
 *  The function appends a single null-terminated string to a current
 *  fprintf message, including the final null character. The host-side
 *  can use the bytes as a null-terminated string in place, without
 *  having to first copy the string and then append the null
 *  terminator.
 *
 *  #length itself is not transmitted. Behaviour is undefined if
 *  #length does not include the final null character. #data may
 *  be a null pointer, in which case, #length is ignored and a single
 *  zero is transmitted. This makes the nullptr indistinguishable from
 *  an empty string to the host-side receiver.
 *
 *  The call to message_append_bytes() ensures that during
 *  transmission, the string is null-padded to a multiple of eight.
 *
 *  If #is_last is zero, the function returns a message descriptor that
 *  must be used by a subsequent call to any __ockl_fprintf*
 *  function. If #is_last is non-zero, the function causes the current
 *  fprintf to be completed on the host-side, and returns the value
 *  returned by that fprintf.
*/
ulong
__ockl_fprintf_append_string_n(ulong msg_desc, const char *data, ulong length,
                               uint is_last)
{
    const ulong desc = is_last ? msg_set_end_flag(msg_desc) : msg_desc;
    long2 reply;

    if (data) {
        reply = message_append_bytes(SERVICE_FPRINTF, desc,
                                     (const uchar *)data, length);
    } else {
        // A null pointer is sent as a single zero word; length is ignored.
        reply = message_append_args(SERVICE_FPRINTF, desc, 1, 0, 0, 0, 0, 0,
                                    0, 0);
    }
    return reply.x;
}

/*===---  PRINTF  ------------------------------------------------------------*/

/* DEPRECATED. Wrappers that should be removed eventually. */

/* Forwards to __ockl_fprintf_stdout_begin(); the argument is not used. */
ulong
__ockl_printf_begin(ulong ignored /* used to be version */)
{
    return __ockl_fprintf_stdout_begin();
}

/* Forwards every argument unchanged to __ockl_fprintf_append_args(). */
ulong
__ockl_printf_append_args(ulong msg_desc, uint num_args, ulong value0,
                          ulong value1, ulong value2, ulong value3,
                          ulong value4, ulong value5, ulong value6,
                          uint is_last)
{
    return __ockl_fprintf_append_args(msg_desc, num_args, value0, value1,
                                      value2, value3, value4, value5, value6,
                                      is_last);
}

/* Forwards every argument unchanged to __ockl_fprintf_append_string_n(). */
ulong
__ockl_printf_append_string_n(ulong msg_desc, const char *data, ulong length,
                              uint is_last)
{
    return __ockl_fprintf_append_string_n(msg_desc, data, length, is_last);
}

/*----------------  SANITIZER SERVICE ---------------------------------*/

/* Send the eight report words to the host as one SERVICE_SANITIZER
 * hostcall. The hostcall's return value is discarded. */
WEAK_ATTR void
__ockl_sanitizer_report(ulong addr, ulong pc, ulong wgidx, ulong wgidy,
                        ulong wgidz, ulong wave_id, ulong is_read,
                        ulong access_size)
{
    (void)__ockl_hostcall_preview(SERVICE_SANITIZER, addr, pc, wgidx, wgidy,
                                  wgidz, wave_id, is_read, access_size);
}

/*===---  DEVMEM  ----------------------------------------------------------*/

/* Send addr and size to the host as a SERVICE_DEVMEM hostcall (remaining
 * payload words are zero) and return the first result word. */
WEAK_ATTR ulong
__ockl_devmem_request(ulong addr, ulong size)
{
    const long2 reply = __ockl_hostcall_preview(SERVICE_DEVMEM, addr, size,
                                                0, 0, 0, 0, 0, 0);
    return (ulong)reply.x;
}

ROCm-Device-Libs-rocm-5.0.0/ockl/src/sub_sat.cl000066400000000000000000000017771415221260100210430ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed
under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "ockl.h"

// Saturating subtraction, one overload per integer width/signedness.
// Each uses the compiler's overflow-checking subtract builtin and replaces
// the wrapped result when overflow is reported.

// Signed 32-bit: on overflow the result clamps to INT_MIN when x is negative
// and to INT_MAX otherwise.
__attribute__((const)) int
OCKL_MANGLE_I32(sub_sat)(int x, int y)
{
    int s;
    bool c = __builtin_ssub_overflow(x, y, &s);
    return c ? (x < 0 ? INT_MIN : INT_MAX) : s;
}

// Unsigned 32-bit: a borrow clamps the result to 0.
__attribute__((const)) uint
OCKL_MANGLE_U32(sub_sat)(uint x, uint y)
{
    uint s;
    bool c = __builtin_usub_overflow(x, y, &s);
    return c ? 0U : s;
}

// Signed 64-bit: same rule as the 32-bit version with LONG_MIN / LONG_MAX.
__attribute__((const)) long
OCKL_MANGLE_I64(sub_sat)(long x, long y)
{
    long s;
    bool c = __builtin_ssubl_overflow(x, y, &s);
    return c ? (x < 0 ? LONG_MIN : LONG_MAX) : s;
}

// Unsigned 64-bit: a borrow clamps the result to 0.
__attribute__((const)) ulong
OCKL_MANGLE_U64(sub_sat)(ulong x, ulong y)
{
    ulong s;
    bool c = __builtin_usubl_overflow(x, y, &s);
    return c ? 0UL : s;
}
ROCm-Device-Libs-rocm-5.0.0/ockl/src/toas.cl000066400000000000000000000022701415221260100203360ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "ockl.h"

// Address-space queries and conversions for generic (flat) pointers.

// Reports whether the pointer is classified as shared (OpenCL __local) by
// __builtin_amdgcn_is_shared.
__attribute__((const, target("flat-address-space"))) bool
OCKL_MANGLE_T(is_local,addr)(const void *a)
{
    return __builtin_amdgcn_is_shared(a);
}

// Reports whether the pointer is classified as private by
// __builtin_amdgcn_is_private.
__attribute__((const, target("flat-address-space"))) bool
OCKL_MANGLE_T(is_private,addr)(const void *a)
{
    return __builtin_amdgcn_is_private(a);
}

// Returns the pointer cast to __global, or a null __global pointer when the
// address is local or private.
__attribute__((const)) __global void *
OCKL_MANGLE_T(to,global)(void *a)
{
    return (OCKL_MANGLE_T(is_local,addr)(a) | OCKL_MANGLE_T(is_private,addr)(a)) ?
           (__global void *)0 : (__global void*)a;
}

// Returns the pointer cast to __local when it is a local address, else null.
__attribute__((const)) __local void *
OCKL_MANGLE_T(to,local)(void *a)
{
    return OCKL_MANGLE_T(is_local,addr)(a) ? (__local void *)a : (__local void *)0;
}

// Returns the pointer cast to __private when it is a private address, else null.
__attribute__((const)) __private void *
OCKL_MANGLE_T(to,private)(void *a)
{
    return OCKL_MANGLE_T(is_private,addr)(a) ? (__private void *)a : (__private void *)0;
}
ROCm-Device-Libs-rocm-5.0.0/ockl/src/wait.cl000066400000000000000000000027621415221260100203420ustar00rootroot00000000000000 /*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "irif.h"
#include "ockl.h"
#include "oclc.h"

// Waits until the s_memrealtime counter has advanced by at least `ticks`
// (taken from the first active lane via readfirstlane).
//
// The wait is a ladder of s_sleep loops: each loop runs while the remaining
// count is above its threshold and sleeps with a progressively smaller
// argument (127, 63, 31, 15, 7, 3, 1), re-reading the counter after every
// sleep. The three longest sleeps are only used when the ISA version is
// 9000 or higher.
// NOTE(review): the thresholds (1625, 806, ...) look like calibrated
// per-sleep costs in counter ticks - confirm against hardware documentation.
__attribute__((target("s-memrealtime"))) void
OCKL_MANGLE_T(rtcwait,u32)(uint ticks)
{
    ulong now = __builtin_amdgcn_s_memrealtime();
    ulong end = now + __builtin_amdgcn_readfirstlane(ticks);

    if (__oclc_ISA_version >= 9000) {
        while (end > now + 1625) {
            __builtin_amdgcn_s_sleep(127);
            now = __builtin_amdgcn_s_memrealtime();
        }

        while (end > now + 806) {
            __builtin_amdgcn_s_sleep(63);
            now = __builtin_amdgcn_s_memrealtime();
        }

        while (end > now + 396) {
            __builtin_amdgcn_s_sleep(31);
            now = __builtin_amdgcn_s_memrealtime();
        }
    }

    while (end > now + 192) {
        __builtin_amdgcn_s_sleep(15);
        now = __builtin_amdgcn_s_memrealtime();
    }

    while (end > now + 89) {
        __builtin_amdgcn_s_sleep(7);
        now = __builtin_amdgcn_s_memrealtime();
    }

    while (end > now + 38) {
        __builtin_amdgcn_s_sleep(3);
        now = __builtin_amdgcn_s_memrealtime();
    }

    // Finest step: sleep(1) until the counter reaches the target.
    while (end > now) {
        __builtin_amdgcn_s_sleep(1);
        now = __builtin_amdgcn_s_memrealtime();
    }
}
ROCm-Device-Libs-rocm-5.0.0/ockl/src/wfaas.cl000066400000000000000000000034621415221260100204750ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "oclc.h"
#include "irif.h"
#include "ockl.h"

#define ATTR __attribute__((always_inline))

// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

// Hack to prevent incorrect hoisting of the operation. There
// currently is no proper way in llvm to prevent hoisting of
// operations control flow dependent results.
//
// The value is passed through an empty volatile asm statement that ties
// input and output to the same vector register, so the compiler must treat
// the result as opaque.
ATTR static int
optimizationBarrierHack(int in_val)
{
    int out_val;
    __asm__ volatile ("; ockl ballot hoisting hack %0" :
                      "=v"(out_val) : "0"(in_val));
    return out_val;
}

// Wavefront "any": true when e is non-zero in at least one active lane.
// The icmp intrinsic yields a per-lane mask of the comparison result; any
// set bit means some lane satisfied e != 0.
ATTR bool
OCKL_MANGLE_I32(wfany)(int e)
{
    e = optimizationBarrierHack(e);
    if (__oclc_wavefrontsize64) {
        return __llvm_amdgcn_icmp_i64_i32(e, 0, ICMP_NE) != 0UL;
    } else {
        return __llvm_amdgcn_icmp_i32_i32(e, 0, ICMP_NE) != 0U;
    }
}

// Wavefront "all": true when e is non-zero in every active lane, i.e. the
// comparison mask equals the EXEC mask.
ATTR bool
OCKL_MANGLE_I32(wfall)(int e)
{
    e = optimizationBarrierHack(e);
    if (__oclc_wavefrontsize64) {
        return __llvm_amdgcn_icmp_i64_i32(e, 0, ICMP_NE) == __builtin_amdgcn_read_exec();
    } else {
        return __llvm_amdgcn_icmp_i32_i32(e, 0, ICMP_NE) == __builtin_amdgcn_read_exec_lo();
    }
}

// Wavefront "same": true when the predicate (e != 0) has the same truth
// value in every active lane, i.e. the comparison mask is either empty or
// equal to the EXEC mask.
//
// The raw lane mask must be kept as-is. Comparing it against zero first
// would collapse it to 0 or 1, and the comparison with EXEC would then only
// succeed when EXEC itself is 1, so an all-true wavefront would wrongly
// report false.
ATTR bool
OCKL_MANGLE_I32(wfsame)(int e)
{
    e = optimizationBarrierHack(e);
    if (__oclc_wavefrontsize64) {
        ulong u = __llvm_amdgcn_icmp_i64_i32(e, 0, ICMP_NE);
        return (u == 0UL) | (u == __builtin_amdgcn_read_exec());
    } else {
        uint u = __llvm_amdgcn_icmp_i32_i32(e, 0, ICMP_NE);
        return (u == 0U) | (u == __builtin_amdgcn_read_exec_lo());
    }
}
ROCm-Device-Libs-rocm-5.0.0/ockl/src/wfbc.cl000066400000000000000000000006151415221260100203120ustar00rootroot00000000000000
#include "ockl.h"

// Wavefront broadcast: every lane receives the value of `a` held by lane j,
// where j is the index `i` as seen by the first active lane (readfirstlane).
uint
OCKL_MANGLE_U32(wfbcast)(uint a, uint i)
{
    uint j = __builtin_amdgcn_readfirstlane(i);
    return __builtin_amdgcn_readlane(a, j);
}

// 64-bit variant: the high and low 32-bit halves are read from lane j
// separately and recombined.
ulong
OCKL_MANGLE_U64(wfbcast)(ulong a, uint i)
{
    uint j = __builtin_amdgcn_readfirstlane(i);
    return ((ulong)__builtin_amdgcn_readlane((uint)(a >> 32), j) << 32) |
           (ulong)__builtin_amdgcn_readlane((uint)a, j);
}
ROCm-Device-Libs-rocm-5.0.0/ockl/src/wfredscan.cl000066400000000000000000000435111415221260100213470ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "irif.h"
#include "ockl.h"
#include "oclc.h"

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Bit-pattern reinterpretation helpers (no value conversion).
#define AS_USHORT(X) __builtin_astype(X, ushort)
#define AS_INT(X) __builtin_astype(X, int)
#define AS_UINT(X) __builtin_astype(X, uint)
#define AS_UINT2(X) __builtin_astype(X, uint2)
#define AS_LONG(X) __builtin_astype(X, long)
#define AS_ULONG(X) __builtin_astype(X, ulong)
#define AS_DOUBLE(X) __builtin_astype(X, double)
#define AS_FLOAT(X) __builtin_astype(X, float)
#define AS_HALF(X) __builtin_astype(X, half)

// Token pasting with argument expansion.
#define _C(X,Y) X##Y
#define C(X,Y) _C(X,Y)

// Swizzle offset macros
#define SWIZZLE_QUAD_PERM(S0,S1,S2,S3) (uint)(0x8000 | (S3 << 6) | (S2 << 4) | (S1 << 2) | S0)
#define SWIZZLE_32_LIMITED(ANDM,ORM,XORM) (uint)((XORM << 10) | (ORM << 5) | ANDM)

// DPP 9 bit control macros
#define DPP_QUAD_PERM(S0,S1,S2,S3) (uint)((S3 << 6) | (S2 << 4) | (S1 << 2) | S0)
#define DPP_ROW_SL(N) (uint)(0x100 | N)
#define DPP_ROW_SR(N) (uint)(0x110 | N)
#define DPP_ROW_RR(N) (uint)(0x120 | N)
#define DPP_WF_SL1 (uint)0x130
#define DPP_WF_RL1 (uint)0x134
#define DPP_WF_SR1 (uint)0x138
#define DPP_WF_RR1 (uint)0x13c
#define DPP_ROW_MIRROR (uint)0x140
#define DPP_ROW_HALF_MIRROR (uint)0x141
#define DPP_ROW_BCAST15 (uint)0x142
#define DPP_ROW_BCAST31 (uint)0x143
#define DPP_ROW_SHARE(N) (uint)(0x150 | N)
#define DPP_ROW_XMASK(N) (uint)(0x160 | N)

// Per-type wrappers for the cross-lane primitives follow. In every family
// the uint form maps directly onto the builtin, the ulong form applies the
// uint form to the low and high 32-bit halves separately, and the remaining
// types are implemented by reinterpreting to and from uint/ulong (half goes
// through ushort widened to uint). The generator macros further down select
// a wrapper by pasting the type name: T##_swizzle, T##_dpp, and so on.

// Swizzle
#define uint_swizzle(X,Y) __builtin_amdgcn_ds_swizzle(X, Y)
#define ulong_swizzle(X,Y) ({ \
    uint2 __x = AS_UINT2(X); \
    uint2 __r; \
    __r.lo = uint_swizzle(__x.lo, Y); \
    __r.hi = uint_swizzle(__x.hi, Y); \
    AS_ULONG(__r); \
})
#define int_swizzle(X,Y) AS_INT(uint_swizzle(AS_UINT(X),Y))
#define long_swizzle(X,Y) AS_LONG(ulong_swizzle(AS_ULONG(X),Y))
#define float_swizzle(X,Y) AS_FLOAT(uint_swizzle(AS_UINT(X),Y))
#define double_swizzle(X,Y) AS_DOUBLE(ulong_swizzle(AS_ULONG(X),Y))
#define half_swizzle(X,Y) AS_HALF((ushort)uint_swizzle((uint)AS_USHORT(X),Y))

// DPP16
#define uint_dpp(ID,X,C,R,B,W) __builtin_amdgcn_update_dpp(ID,X,C,R,B,W)
#define ulong_dpp(ID,X,C,R,B,W) ({ \
    uint2 __x = AS_UINT2(X); \
    uint2 __r; \
    __r.lo = uint_dpp((uint)ID, __x.lo, C, R, B, W); \
    __r.hi = uint_dpp((uint)(ID >> 32), __x.hi, C, R, B, W); \
    AS_ULONG(__r); \
})
#define int_dpp(ID,X,C,R,B,W) AS_INT(uint_dpp(AS_UINT(ID),AS_UINT(X),C,R,B,W))
#define long_dpp(ID,X,C,R,B,W) AS_LONG(ulong_dpp(AS_ULONG(ID),AS_ULONG(X),C,R,B,W))
#define float_dpp(ID,X,C,R,B,W) AS_FLOAT(uint_dpp(AS_UINT(ID),AS_UINT(X),C,R,B,W))
#define double_dpp(ID,X,C,R,B,W) AS_DOUBLE(ulong_dpp(AS_ULONG(ID),AS_ULONG(X),C,R,B,W))
#define half_dpp(ID,X,C,R,B,W) AS_HALF((ushort)uint_dpp((uint)AS_USHORT(ID),(uint)AS_USHORT(X),C,R,B,W))

// DPP8
#define uint_dpp8(X,S) __builtin_amdgcn_mov_dpp8(X,S)
#define ulong_dpp8(X,S) ({ \
    uint2 __x = AS_UINT2(X); \
    uint2 __r; \
    __r.lo = uint_dpp8(__x.lo, S); \
    __r.hi = uint_dpp8(__x.hi, S); \
    AS_ULONG(__r); \
})
#define int_dpp8(X,S) AS_INT(uint_dpp8(AS_UINT(X),S))
#define long_dpp8(X,S) AS_LONG(ulong_dpp8(AS_ULONG(X),S))
#define float_dpp8(X,S) AS_FLOAT(uint_dpp8(AS_UINT(X),S))
#define double_dpp8(X,S) AS_DOUBLE(ulong_dpp8(AS_ULONG(X),S))
#define half_dpp8(X,S) AS_HALF((ushort)uint_dpp8((uint)AS_USHORT(X),S))

// permlane16
#define uint_permlane16(ID,X,S0,S1,W) __builtin_amdgcn_permlane16(ID,X,S0,S1,false,W)
#define ulong_permlane16(ID,X,S0,S1,W) ({ \
    uint2 __x = AS_UINT2(X); \
    uint2 __r; \
    __r.lo = uint_permlane16((uint)ID,__x.lo,S0,S1,W); \
    __r.hi = uint_permlane16((uint)(ID>>32),__x.hi,S0,S1,W); \
    AS_ULONG(__r); \
})
#define int_permlane16(ID,X,S0,S1,W) AS_INT(uint_permlane16(AS_UINT(ID),AS_UINT(X),S0,S1,W))
#define long_permlane16(ID,X,S0,S1,W) AS_LONG(ulong_permlane16(AS_ULONG(ID),AS_ULONG(X),S0,S1,W))
#define float_permlane16(ID, X,S0,S1,W) AS_FLOAT(uint_permlane16(AS_UINT(ID),AS_UINT(X),S0,S1,W))
#define double_permlane16(ID, X,S0,S1,W) AS_DOUBLE(ulong_permlane16(AS_ULONG(ID),AS_ULONG(X),S0,S1,W))
#define half_permlane16(ID,X,S0,S1,W) AS_HALF((ushort)uint_permlane16((uint)AS_USHORT(ID),(uint)AS_USHORT(X),S0,S1,W))

// permlanex16
#define uint_permlanex16(ID,X,S0,S1,W) __builtin_amdgcn_permlanex16(ID,X,S0,S1,false,W)
#define ulong_permlanex16(ID,X,S0,S1,W) ({ \
    uint2 __x = AS_UINT2(X); \
    uint2 __r; \
    __r.lo = uint_permlanex16((uint)ID,__x.lo,S0,S1,W); \
    __r.hi = uint_permlanex16((uint)(ID>>32),__x.hi,S0,S1,W); \
    AS_ULONG(__r); \
})
#define int_permlanex16(ID,X,S0,S1,W) AS_INT(uint_permlanex16(AS_UINT(ID),AS_UINT(X),S0,S1,W))
#define long_permlanex16(ID,X,S0,S1,W) AS_LONG(ulong_permlanex16(AS_ULONG(ID),AS_ULONG(X),S0,S1,W))
#define float_permlanex16(ID, X,S0,S1,W) AS_FLOAT(uint_permlanex16(AS_UINT(ID),AS_UINT(X),S0,S1,W))
#define double_permlanex16(ID, X,S0,S1,W) AS_DOUBLE(ulong_permlanex16(AS_ULONG(ID),AS_ULONG(X),S0,S1,W))
#define half_permlanex16(ID,X,S0,S1,W) AS_HALF((ushort)uint_permlanex16((uint)AS_USHORT(ID),(uint)AS_USHORT(X),S0,S1,W))

// readlane
#define uint_readlane(X,L) __builtin_amdgcn_readlane(X,L)
#define ulong_readlane(X,L) ({ \
    uint2 __x = AS_UINT2(X); \
    uint2 __r; \
    __r.lo = uint_readlane(__x.lo, L); \
    __r.hi = uint_readlane(__x.hi, L); \
    AS_ULONG(__r); \
})
#define int_readlane(X,L) AS_INT(uint_readlane(AS_UINT(X),L))
#define long_readlane(X,L) AS_LONG(ulong_readlane(AS_ULONG(X),L))
#define float_readlane(X,L) AS_FLOAT(uint_readlane(AS_UINT(X),L))
#define double_readlane(X,L) AS_DOUBLE(ulong_readlane(AS_ULONG(X),L))
#define half_readlane(X,L) AS_HALF((ushort)uint_readlane((uint)AS_USHORT(X),L))

// Select
// Bitwise select: result bits come from B where the 32-bit mask C is set and
// from A where it is clear. The 64-bit form applies the same 32-bit mask to
// both halves.
#define uint_sel(C,B,A) ({ \
    uint __c = C; \
    (__c & B) | (~__c & A); \
})
#define ulong_sel(C,B,A) ({ \
    uint __c = C; \
    uint2 __b = AS_UINT2(B); \
    uint2 __a = AS_UINT2(A); \
    uint2 __r; \
    __r.lo = (__c & __b.lo) | (~__c & __a.lo); \
    __r.hi = (__c & __b.hi) | (~__c & __a.hi); \
    AS_ULONG(__r); \
})
#define int_sel(C,B,A) AS_INT(uint_sel(C, AS_UINT(B), AS_UINT(A)))
#define long_sel(C,B,A) AS_LONG(ulong_sel(C, AS_ULONG(B), AS_ULONG(A)))
#define float_sel(C,B,A) AS_FLOAT(uint_sel(C, AS_UINT(B), AS_UINT(A)))
#define double_sel(C,B,A) AS_DOUBLE(ulong_sel(C, AS_ULONG(B), AS_ULONG(A)))
#define half_sel(C,B,A) AS_HALF((ushort)uint_sel(C, (uint)AS_USHORT(B), (uint)AS_USHORT(A)))

// Type-name to exported-symbol-suffix mapping (used via T##_suf).
#define uint_suf _u32
#define int_suf _i32
#define ulong_suf _u64
#define long_suf _i64
#define float_suf _f32
#define double_suf _f64
#define half_suf _f16

#define CATTR __attribute__((const))
#define IATTR

// Binary operators, named T##_OP so the generators can paste them.
// Integer min/max are small static functions; floating-point min/max map to
// the fmin/fmax builtins.
#define GENMIN(T) CATTR static T T##_min(T a, T b) { return a < b ? a : b; }
GENMIN(int)
GENMIN(uint)
GENMIN(long)
GENMIN(ulong)
#define float_min(A,B) __builtin_fminf(A,B)
#define double_min(A,B) __builtin_fmin(A,B)
#define half_min(A,B) __builtin_fminf16(A,B)

#define GENMAX(T) CATTR static T T##_max(T a, T b) { return a < b ? b : a; }
GENMAX(int)
GENMAX(uint)
GENMAX(long)
GENMAX(ulong)
#define float_max(A,B) __builtin_fmaxf(A,B)
#define double_max(A,B) __builtin_fmax(A,B)
#define half_max(A,B) __builtin_fmaxf16(A,B)

#define ADD(X,Y) (X + Y)
#define uint_add(X,Y) ADD(X,Y)
#define int_add(X,Y) ADD(X,Y)
#define ulong_add(X,Y) ADD(X,Y)
#define long_add(X,Y) ADD(X,Y)
#define float_add(X,Y) ADD(X,Y)
#define double_add(X,Y) ADD(X,Y)
#define half_add(X,Y) ADD(X,Y)

#define OR(X,Y) (X | Y)
#define uint_or(X,Y) OR(X,Y)
#define int_or(X,Y) OR(X,Y)
#define ulong_or(X,Y) OR(X,Y)
#define long_or(X,Y) OR(X,Y)

#define AND(X,Y) (X & Y)
#define uint_and(X,Y) AND(X,Y)
#define int_and(X,Y) AND(X,Y)
#define ulong_and(X,Y) AND(X,Y)
#define long_and(X,Y) AND(X,Y)

#define XOR(X,Y) (X ^ Y)
#define uint_xor(X,Y) XOR(X,Y)
#define int_xor(X,Y) XOR(X,Y)
#define ulong_xor(X,Y) XOR(X,Y)
#define long_xor(X,Y) XOR(X,Y)

// All generator macros below take the same four arguments:
//   T   - element type name (int, uint, long, ulong, float, double, half)
//   OP  - operator name (add, min, max, and, or, xor)
//   ID  - identity value of OP for T
//   IDZ - 1 in every instantiation whose ID is a zero value, 0 otherwise;
//         it is forwarded as the last argument of the DPP/permlane wrappers
//         and selects the cheaper branch in GENRED7_PART.
//         NOTE(review): presumably "identity is all-zero bits" - confirm.

// Reduction for ISA version < 8000 when every lane is active: five swizzle
// exchange steps, then the values read from lanes 0 and 32 are combined.
#define GENRED7_FULL(T,OP,ID,IDZ) \
static T \
red7_full_##T##_##OP(T x) \
{ \
    T v, r; \
 \
    v = T##_swizzle(x, SWIZZLE_QUAD_PERM(0x1,0x0,0x3,0x2)); \
    r = T##_##OP(x, v); \
 \
    v = T##_swizzle(r, SWIZZLE_QUAD_PERM(0x2,0x3,0x0,0x1)); \
    r = T##_##OP(r, v); \
 \
    v = T##_swizzle(r, SWIZZLE_32_LIMITED(0x1f,0x00,0x04)); \
    r = T##_##OP(r, v); \
 \
    v = T##_swizzle(r, SWIZZLE_32_LIMITED(0x1f,0x00,0x08)); \
    r = T##_##OP(r, v); \
 \
    v = T##_swizzle(r, SWIZZLE_32_LIMITED(0x1f,0x00,0x10)); \
    r = T##_##OP(r, v); \
 \
    r = T##_##OP(T##_readlane(r,0), T##_readlane(r,32)); \
 \
    return r; \
}

// Reduction for ISA version < 8000 when some lanes may be inactive. With IDZ
// set the swizzled values are used directly; otherwise a swizzled all-ones
// word (e) is used as a mask to substitute ID for the swizzled value. In both
// branches the value from lane 32 is replaced by ID when bit 0 of the high
// EXEC word is clear.
#define GENRED7_PART(T,OP,ID,IDZ) \
static T \
red7_part_##T##_##OP(T x) \
{ \
    T r; \
    if (IDZ) { \
        T v; \
 \
        v = T##_swizzle(x, SWIZZLE_QUAD_PERM(0x1,0x0,0x3,0x2)); \
        r = T##_##OP(x, v); \
 \
        v = T##_swizzle(r, SWIZZLE_QUAD_PERM(0x2,0x3,0x0,0x1)); \
        r = T##_##OP(r, v); \
 \
        v = T##_swizzle(r, SWIZZLE_32_LIMITED(0x1f,0x00,0x04)); \
        r = T##_##OP(r, v); \
 \
        v = T##_swizzle(r, SWIZZLE_32_LIMITED(0x1f,0x00,0x08)); \
        r = T##_##OP(r, v); \
 \
        v = T##_swizzle(r, SWIZZLE_32_LIMITED(0x1f,0x00,0x10)); \
        r = T##_##OP(r, v); \
 \
        v = T##_readlane(r, 32); \
        v = (__builtin_amdgcn_read_exec_hi() & 1) ? v : ID; \
        r = T##_##OP(T##_readlane(r, 0), v); \
    } else { \
        uint e; \
        T v, t; \
 \
        t = T##_swizzle(x, SWIZZLE_QUAD_PERM(0x1,0x0,0x3,0x2)); \
        e = uint_swizzle(~0u, SWIZZLE_QUAD_PERM(0x1,0x0,0x3,0x2)); \
        v = T##_sel(e, t, ID); \
        r = T##_##OP(x, v); \
 \
        t = T##_swizzle(r, SWIZZLE_QUAD_PERM(0x2,0x3,0x0,0x1)); \
        e = uint_swizzle(~0u, SWIZZLE_QUAD_PERM(0x2,0x3,0x0,0x1)); \
        v = T##_sel(e, t, ID); \
        r = T##_##OP(r, v); \
 \
        t = T##_swizzle(r, SWIZZLE_32_LIMITED(0x1f,0x00,0x04)); \
        e = uint_swizzle(~0u, SWIZZLE_32_LIMITED(0x1f,0x00,0x04)); \
        v = T##_sel(e, t, ID); \
        r = T##_##OP(r, v); \
 \
        t = T##_swizzle(r, SWIZZLE_32_LIMITED(0x1f,0x00,0x08)); \
        e = uint_swizzle(~0u, SWIZZLE_32_LIMITED(0x1f,0x00,0x08)); \
        v = T##_sel(e, t, ID); \
        r = T##_##OP(r, v); \
 \
        t = T##_swizzle(r, SWIZZLE_32_LIMITED(0x1f,0x00,0x10)); \
        e = uint_swizzle(~0u, SWIZZLE_32_LIMITED(0x1f,0x00,0x10)); \
        v = T##_sel(e, t, ID); \
        r = T##_##OP(r, v); \
 \
        t = T##_readlane(r, 32); \
        v = (__builtin_amdgcn_read_exec_hi() & 1) ? t : ID; \
        r = T##_##OP(T##_readlane(r, 0), v); \
    } \
 \
    return r; \
}

#define GENRED7(T,OP,ID,IDZ) \
    GENRED7_FULL(T,OP,ID,IDZ) \
    GENRED7_PART(T,OP,ID,IDZ)

// Reduction used for 8000 <= ISA version < 10000: DPP row shifts, a
// wavefront shift plus row mirror, then lanes 0 and 32 are combined (lane 32
// is replaced by ID when bit 0 of the high EXEC word is clear).
#define GENRED89(T,OP,ID,IDZ) \
__attribute__((target("dpp"))) static T \
red89_##T##_##OP(T x) \
{ \
    T r, v; \
 \
    v = T##_dpp(ID, x, DPP_ROW_SL(1), 0xf, 0xf, IDZ); \
    r = T##_##OP(x, v); \
 \
    v = T##_dpp(ID, r, DPP_ROW_SL(2), 0xf, 0xf, IDZ); \
    r = T##_##OP(r, v); \
 \
    v = T##_dpp(ID, r, DPP_ROW_SL(4), 0xf, 0xf, IDZ); \
    r = T##_##OP(r, v); \
 \
    v = T##_dpp(ID, r, DPP_ROW_SL(8), 0xf, 0xf, IDZ); \
    r = T##_##OP(r, v); \
 \
    v = T##_dpp(ID, r, DPP_WF_SL1, 0xf, 0xf, IDZ); \
    v = T##_dpp(ID, v, DPP_ROW_MIRROR, 0xf, 0xf, IDZ); \
    r = T##_##OP(r, v); \
 \
    v = T##_readlane(r, 32); \
    v = (__builtin_amdgcn_read_exec_hi() & 1) ? v : ID; \
    r = T##_##OP(T##_readlane(r, 0), v); \
 \
    return r; \
}

// Reduction used for ISA version >= 10000: DPP row shifts, row share,
// permlanex16 across row pairs, and (wave64 only) the lane 0 / lane 32
// combine.
#define GENRED10(T,OP,ID,IDZ) \
__attribute__((target("dpp,gfx10-insts"))) static T \
red10_##T##_##OP(T x) \
{ \
    T r, v; \
 \
    v = T##_dpp(ID, x, DPP_ROW_SL(1), 0xf, 0xf, IDZ); \
    r = T##_##OP(x, v); \
 \
    v = T##_dpp(ID, r, DPP_ROW_SL(2), 0xf, 0xf, IDZ); \
    r = T##_##OP(r, v); \
 \
    v = T##_dpp(ID, r, DPP_ROW_SL(4), 0xf, 0xf, IDZ); \
    r = T##_##OP(r, v); \
 \
    v = T##_dpp(ID, r, DPP_ROW_SL(8), 0xf, 0xf, IDZ); \
    r = T##_##OP(r, v); \
 \
    r = T##_dpp(ID, r, DPP_ROW_SHARE(0), 0xf, 0xf, IDZ); \
 \
    v = T##_permlanex16(ID, r, 0, 0, IDZ); \
    r = T##_##OP(r, v); \
 \
    if (__oclc_wavefrontsize64) { \
        T v = T##_readlane(r, 32); \
        v = (__builtin_amdgcn_read_exec_hi() & 1) ? v : ID; \
        r = T##_##OP(T##_readlane(r, 0), v); \
    } \
 \
    return r; \
}

// Inclusive scan for ISA version < 8000. `l` is the caller's lane index; at
// each step the swizzled partial result is combined only in lanes whose
// index has the corresponding bit set (ID is used elsewhere), and lanes
// above 31 finally fold in the value from lane 31.
#define GENISCAN7(T,OP,ID,IDZ) \
static T \
iscan7_##T##_##OP(T x, uint l) \
{ \
    T s, v; \
 \
    v = T##_swizzle(x, SWIZZLE_32_LIMITED(0x1e,0x00,0x00)); \
    v = (l & 1) ? v : ID; \
    s = T##_##OP(x, v); \
 \
    v = T##_swizzle(s, SWIZZLE_32_LIMITED(0x1c,0x01,0x00)); \
    v = (l & 2) ? v : ID; \
    s = T##_##OP(s, v); \
 \
    v = T##_swizzle(s, SWIZZLE_32_LIMITED(0x18,0x03,0x00)); \
    v = (l & 4) ? v : ID; \
    s = T##_##OP(s, v); \
 \
    v = T##_swizzle(s, SWIZZLE_32_LIMITED(0x10,0x07,0x00)); \
    v = (l & 8) ? v : ID; \
    s = T##_##OP(s, v); \
 \
    v = T##_swizzle(s, SWIZZLE_32_LIMITED(0x00,0x0f,0x00)); \
    v = (l & 16) ? v : ID; \
    s = T##_##OP(s, v); \
 \
    v = T##_readlane(s, 31); \
    v = l > 31 ? v : ID; \
    s = T##_##OP(s, v); \
 \
    return s; \
}

// Inclusive scan used for 8000 <= ISA version < 10000: DPP row shift-right
// steps followed by the two row-broadcast steps. The lane index parameter is
// not used by this variant.
#define GENISCAN89(T,OP,ID,IDZ) \
__attribute__((target("dpp"))) static T \
iscan89_##T##_##OP(T x, uint l) \
{ \
    T s, v; \
 \
    v = T##_dpp(ID, x, DPP_ROW_SR(1), 0xf, 0xf, IDZ); \
    s = T##_##OP(x, v); \
 \
    v = T##_dpp(ID, s, DPP_ROW_SR(2), 0xf, 0xf, IDZ); \
    s = T##_##OP(s, v); \
 \
    v = T##_dpp(ID, s, DPP_ROW_SR(4), 0xf, 0xf, IDZ); \
    s = T##_##OP(s, v); \
 \
    v = T##_dpp(ID, s, DPP_ROW_SR(8), 0xf, 0xf, IDZ); \
    s = T##_##OP(s, v); \
 \
    v = T##_dpp(ID, s, DPP_ROW_BCAST15, 0xa, 0xf, false); \
    s = T##_##OP(s, v); \
 \
    v = T##_dpp(ID, s, DPP_ROW_BCAST31, 0xc, 0xf, false); \
    s = T##_##OP(s, v); \
 \
    return s; \
}

// Inclusive scan used for ISA version >= 10000: DPP row shift-right steps,
// a permlanex16 step applied in lanes with bit 4 of the index set, and
// (wave64 only) a final fold of lane 31 into lanes above 31.
#define GENISCAN10(T,OP,ID,IDZ) \
__attribute__((target("dpp,gfx10-insts"))) static T \
iscan10_##T##_##OP(T x, uint l) \
{ \
    T s, v; \
 \
    v = T##_dpp(ID, x, DPP_ROW_SR(1), 0xf, 0xf, IDZ); \
    s = T##_##OP(x, v); \
 \
    v = T##_dpp(ID, s, DPP_ROW_SR(2), 0xf, 0xf, IDZ); \
    s = T##_##OP(s, v); \
 \
    v = T##_dpp(ID, s, DPP_ROW_SR(4), 0xf, 0xf, IDZ); \
    s = T##_##OP(s, v); \
 \
    v = T##_dpp(ID, s, DPP_ROW_SR(8), 0xf, 0xf, IDZ); \
    s = T##_##OP(s, v); \
 \
    v = T##_permlanex16(ID, s, 0xffffffff, 0xffffffff, IDZ); \
    v = (l & 0x10) ? v : ID; \
    s = T##_##OP(s, v); \
 \
    if (__oclc_wavefrontsize64) { \
        v = T##_readlane(s, 31); \
        v = l > 31 ? v : ID; \
        s = T##_##OP(s, v); \
    } \
 \
    return s; \
}

// "Shift right by one lane" helpers (sr1_*), applied to the inclusive scan
// result when an exclusive scan is requested. In the ISA < 8000 variant lane
// 0 receives ID explicitly.
#define GENSR1_7(T,OP,ID,IDZ) \
static T \
sr1_7_##T##_##OP(T s, uint l) \
{ \
    T v; \
    T t = s; \
 \
    s = T##_swizzle(t, SWIZZLE_QUAD_PERM(0x0,0x0,0x1,0x2)); \
 \
    v = T##_swizzle(t, SWIZZLE_32_LIMITED(0x18, 0x03, 0x00)); \
    s = (l & 0x7) == 0x4 ? v : s; \
 \
    v = T##_swizzle(t, SWIZZLE_32_LIMITED(0x10, 0x07, 0x00)); \
    s = (l & 0xf) == 0x8 ? v : s; \
 \
    v = T##_swizzle(t, SWIZZLE_32_LIMITED(0x00, 0x0f, 0x00)); \
    s = (l & 0x1f) == 0x10 ? v : s; \
 \
    v = T##_readlane(t, 31); \
    s = l == 32 ? v : s; \
 \
    s = l == 0 ? ID : s; \
 \
    return s; \
}

#define GENSR1_89(T,OP,ID,IDZ) \
__attribute__((target("dpp"))) static T \
sr1_89_##T##_##OP(T s, uint l) \
{ \
    return T##_dpp(ID, s, DPP_WF_SR1, 0xf, 0xf, IDZ); \
}

#define GENSR1_10(T,OP,ID,IDZ) \
__attribute((target("dpp,gfx10-insts"))) static T \
sr1_10_##T##_##OP(T s, uint l) \
{ \
    T t = T##_dpp(ID, s, DPP_ROW_SR(1), 0xf, 0xf, IDZ); \
    T v = T##_permlanex16(ID, s, 0xffffffff, 0xffffffff, IDZ); \
    if (__oclc_wavefrontsize64) { \
        T w = T##_readlane(s, 31); \
        v = l == 32 ? w : v; \
        s = ((l == 32) | ((l & 0x1f) == 0x10)) ? v : t; \
    } else {\
        s = l == 16 ? v : t; \
    } \
 \
    return s; \
}

// True when every lane of the wavefront is active: the EXEC mask has 64 bits
// set in wave64 mode, or its low word has 32 bits set otherwise.
IATTR static bool
fullwave(void)
{
    if (__oclc_wavefrontsize64) {
        return __builtin_popcountl(__builtin_amdgcn_read_exec()) == 64;
    } else {
        return __builtin_popcount(__builtin_amdgcn_read_exec_lo()) == 32;
    }
}

// Emits the three reduction implementations plus the exported entry point
// __ockl_wfred_<OP><suffix>, which dispatches on __oclc_ISA_version:
// below 8000 -> red7 (full or partial wave), below 10000 -> red89,
// otherwise red10.
#define GENRED(T,OP,ID,IDZ) \
GENRED7(T,OP,ID,IDZ) \
GENRED89(T,OP,ID,IDZ) \
GENRED10(T,OP,ID,IDZ) \
IATTR T \
C(__ockl_wfred_,C(OP,T##_suf))(T x) \
{ \
    T r; \
    if (__oclc_ISA_version < 8000) { \
        if (fullwave()) { \
            r = red7_full_##T##_##OP(x); \
        } else { \
            r = red7_part_##T##_##OP(x); \
        } \
    } else if (__oclc_ISA_version < 10000) { \
        r = red89_##T##_##OP(x); \
    } else { \
        r = red10_##T##_##OP(x); \
    } \
    return r; \
}

// Emits the scan implementations plus the exported entry point
// __ockl_wfscan_<OP><suffix>. The inclusive scan is always computed first;
// when `inclusive` is false the result is then shifted by one lane with the
// matching sr1 helper. Dispatch uses the same ISA version thresholds as
// GENRED.
#define GENSCAN(T,OP,ID,IDZ) \
GENISCAN7(T,OP,ID,IDZ) \
GENISCAN89(T,OP,ID,IDZ) \
GENISCAN10(T,OP,ID,IDZ) \
GENSR1_7(T,OP,ID,IDZ) \
GENSR1_89(T,OP,ID,IDZ) \
GENSR1_10(T,OP,ID,IDZ) \
IATTR T \
C(__ockl_wfscan_,C(OP,T##_suf))(T x, bool inclusive) \
{ \
    T s; \
    uint l = __ockl_lane_u32(); \
 \
    if (__oclc_ISA_version < 8000) { \
        s = iscan7_##T##_##OP(x, l); \
    } else if (__oclc_ISA_version < 10000) { \
        s = iscan89_##T##_##OP(x, l); \
    } else { \
        s = iscan10_##T##_##OP(x, l); \
    } \
 \
    if (!inclusive) { \
        if (__oclc_ISA_version < 8000) { \
            s = sr1_7_##T##_##OP(s, l); \
        } else if (__oclc_ISA_version < 10000) { \
            s = sr1_89_##T##_##OP(s, l); \
        } else { \
            s = sr1_10_##T##_##OP(s, l); \
        } \
    } \
 \
    return s; \
}

#define GEN(T,OP,ID,IDZ) \
    GENRED(T,OP,ID,IDZ) \
    GENSCAN(T,OP,ID,IDZ)

// Instantiations: (type, operator, identity value, identity-is-zero flag).
GEN(int,add,0,1)
GEN(uint,add,0u,1)
GEN(long,add,0L,1)
GEN(ulong,add,0UL,1)
GEN(float,add,0.0f,1)
GEN(double,add,0.0,1)
GEN(half,add,0.0h,1)

GEN(int,min,INT_MAX,0)
GEN(uint,min,UINT_MAX,0)
GEN(long,min,LONG_MAX,0)
GEN(ulong,min,ULONG_MAX,0)
GEN(float,min,INFINITY,0)
GEN(double,min,(double)INFINITY,0)
GEN(half,min,(half)INFINITY,0)

GEN(int,max,INT_MIN,0)
GEN(uint,max,0u,1)
GEN(long,max,LONG_MIN,0)
GEN(ulong,max,0UL,1)
GEN(float,max,-INFINITY,0)
GEN(double,max,-(double)INFINITY,0)
GEN(half,max,-(half)INFINITY,0)

GEN(int,and,~0,0)
GEN(uint,and,~0u,0)
GEN(long,and,~0L,0)
GEN(ulong,and,~0UL,0)

GEN(int,or,0,1)
GEN(uint,or,0u,1)
GEN(long,or,0L,1)
GEN(ulong,or,0UL,1)

GEN(int,xor,0,1)
GEN(uint,xor,0u,1)
GEN(long,xor,0L,1)
GEN(ulong,xor,0UL,1)
ROCm-Device-Libs-rocm-5.0.0/ockl/src/wgred.cl000066400000000000000000000041441415221260100205020ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "irif.h"
#include "oclc.h"
#include "ockl.h"
#include "wgscratch.h"

#define _C(X,Y) X##Y
#define C(X,Y) _C(X,Y)

// Operator name -> atomic read-modify-write builtin used to merge the
// per-wavefront partial results.
#define reduce_add __opencl_atomic_fetch_add
#define reduce_and __opencl_atomic_fetch_and
#define reduce_or __opencl_atomic_fetch_or

#define int_suf _i32

// Number of wavefronts in the current work-group: the product of the three
// local sizes, rounded up to a multiple of the wavefront size (64 or 32).
static uint
my_num_sub_groups(void)
{
    uint wgs = __ockl_mul24_i32((uint)__ockl_get_local_size(2),
                   __ockl_mul24_i32((uint)__ockl_get_local_size(1),
                                    (uint)__ockl_get_local_size(0)));
    if (__oclc_wavefrontsize64)
        return (wgs + 63U) >> 6U;
    else
        return (wgs + 31U) >> 5U;
}

// Index of the calling work-item's wavefront within the work-group: the
// local linear id divided by the wavefront size.
static uint
my_sub_group_id(void)
{
    if (__oclc_wavefrontsize64)
        return (uint)__ockl_get_local_linear_id() >> 6U;
    else
        return (uint)__ockl_get_local_linear_id() >> 5U;
}

// Work-group barrier bracketed by a release fence before and an acquire
// fence after, both at work-group scope.
static void
my_barrier(void)
{
    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
    __builtin_amdgcn_s_barrier();
    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
}

// Generates __ockl_wgred_<OP><suffix>: a work-group wide reduction of `a`.
//
//  1. Each wavefront reduces its own values with __ockl_wfred_<OP><suffix>.
//  2. With a single wavefront that result is returned directly.
//  3. Otherwise lane 0 of wavefront 0 stores its partial result to the LDS
//     scratch slot, lane 0 of every other wavefront merges its partial
//     result in with the atomic reduce_<OP>, and every work-item then loads
//     the final value. Barriers separate the store, the merges, the load
//     and the return, so the slot is not overwritten while still being read.
//
// The parameter is declared as T, matching the return type, the atomic type
// and the wavefront reduction; for the existing int instantiations this is
// identical to the previous hard-coded int.
#define AGEN(T,OP) \
__attribute__((convergent)) T \
C(__ockl_wgred_,C(OP,T##_suf))(T a) \
{ \
    uint n = my_num_sub_groups(); \
    a = C(__ockl_wfred_##OP,T##_suf)(a); \
    if (n == 1) \
        return a; \
 \
    __local atomic_##T *p = (__local atomic_##T *)__get_scratch_lds(); \
    uint l = __ockl_lane_u32(); \
    uint i = my_sub_group_id(); \
 \
    if ((i == 0) & (l == 0)) \
        __opencl_atomic_store(p, a, memory_order_relaxed, memory_scope_work_group); \
 \
    my_barrier(); \
    if ((i != 0) & (l == 0)) \
        reduce_##OP(p, a, memory_order_relaxed, memory_scope_work_group); \
    my_barrier(); \
    a = __opencl_atomic_load(p, memory_order_relaxed, memory_scope_work_group); \
    my_barrier(); \
    return a; \
}

AGEN(int,add)
AGEN(int,and)
AGEN(int,or)
ROCm-Device-Libs-rocm-5.0.0/ockl/src/wgscratch.ll000066400000000000000000000011531415221260100213650ustar00rootroot00000000000000target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"

; 1024 work-items means up to 32 wavefronts (sub-groups) per work-group
; when the wavefront size is 32, hence 32 scratch slots.
@__scratch_lds = linkonce_odr hidden addrspace(3) global [32 x i64] undef, align 8

; Returns a pointer to the first element of the LDS scratch array.
define protected i64 addrspace(3)* @__get_scratch_lds() #0 {
  ret i64 addrspace(3)* getelementptr inbounds ([32 x i64], [32 x i64] addrspace(3)* @__scratch_lds, i64 0, i64 0)
}

attributes #0 = { alwaysinline norecurse nounwind readnone speculatable }
ROCm-Device-Libs-rocm-5.0.0/ockl/src/workitem.cl000066400000000000000000000146151415221260100212370ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "irif.h"
#include "device_amd_hsa.h"

#define ATTR __attribute__((const))

// Global offset for dimension `dim`, read from the first three size_t slots
// of the implicit kernel argument block. Returns 0 for dim > 2.
ATTR size_t
__ockl_get_global_offset(uint dim)
{
    // TODO find out if implicit arg pointer is aligned properly
    switch(dim) {
    case 0:
        return *(__constant size_t *)__builtin_amdgcn_implicitarg_ptr();
    case 1:
        return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[1];
    case 2:
        return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[2];
    default:
        return 0;
    }
}

// Global id for dimension `dim`:
// group id * work-group size + local id + global offset.
// For dim > 2 only the global offset (which is 0) contributes.
ATTR size_t
__ockl_get_global_id(uint dim)
{
    uint l, g, s;

    switch(dim) {
    case 0:
        l = __builtin_amdgcn_workitem_id_x();
        g = __builtin_amdgcn_workgroup_id_x();
        s = __builtin_amdgcn_workgroup_size_x();
        break;
    case 1:
        l = __builtin_amdgcn_workitem_id_y();
        g = __builtin_amdgcn_workgroup_id_y();
        s = __builtin_amdgcn_workgroup_size_y();
        break;
    case 2:
        l = __builtin_amdgcn_workitem_id_z();
        g = __builtin_amdgcn_workgroup_id_z();
        s = __builtin_amdgcn_workgroup_size_z();
        break;
    default:
        l = 0;
        g = 0;
        s = 1;
        break;
    }

    return (g*s + l) + __ockl_get_global_offset(dim);
}

// Work-item id within its work-group for dimension `dim`; 0 for dim > 2.
ATTR size_t
__ockl_get_local_id(uint dim)
{
    switch(dim) {
    case 0:
        return __builtin_amdgcn_workitem_id_x();
    case 1:
        return __builtin_amdgcn_workitem_id_y();
    case 2:
        return __builtin_amdgcn_workitem_id_z();
    default:
        return 0;
    }
}

// Work-group id for dimension `dim`; 0 for dim > 2.
ATTR size_t
__ockl_get_group_id(uint dim)
{
    switch(dim) {
    case 0:
        return __builtin_amdgcn_workgroup_id_x();
    case 1:
        return __builtin_amdgcn_workgroup_id_y();
    case 2:
        return __builtin_amdgcn_workgroup_id_z();
    default:
        return 0;
    }
}

// Grid size for dimension `dim`, read from the dispatch packet; 1 for
// dim > 2.
ATTR size_t
__ockl_get_global_size(uint dim)
{
    __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();

    switch(dim) {
    case 0:
        return p->grid_size_x;
    case 1:
        return p->grid_size_y;
    case 2:
        return p->grid_size_z;
    default:
        return 1;
    }
}

// Actual size of the calling work-group in dimension `dim`. This is the
// enqueued work-group size, except for a trailing partial work-group, where
// it is the number of work-items remaining in the grid.
ATTR size_t
__ockl_get_local_size(uint dim)
{
    __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
    uint group_id, grid_size, group_size;

    switch(dim) {
    case 0:
        group_id = __builtin_amdgcn_workgroup_id_x();
        group_size = __builtin_amdgcn_workgroup_size_x();
        grid_size = p->grid_size_x;
        break;
    case 1:
        group_id = __builtin_amdgcn_workgroup_id_y();
        group_size = __builtin_amdgcn_workgroup_size_y();
        grid_size = p->grid_size_y;
        break;
    case 2:
        group_id = __builtin_amdgcn_workgroup_id_z();
        group_size = __builtin_amdgcn_workgroup_size_z();
        grid_size = p->grid_size_z;
        break;
    default:
        group_id = 0;
        grid_size = 0;
        group_size = 1;
        break;
    }
    // Work-items left in the grid from the start of this work-group.
    uint r = grid_size - group_id * group_size;
    return (r < group_size) ? r : group_size;
}

// Number of work-groups in dimension `dim`: grid size divided by the
// work-group size, rounded up.
ATTR size_t
__ockl_get_num_groups(uint dim)
{
    __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();

    uint n, d;
    switch(dim) {
    case 0:
        n = p->grid_size_x;
        d = __builtin_amdgcn_workgroup_size_x();
        break;
    case 1:
        n = p->grid_size_y;
        d = __builtin_amdgcn_workgroup_size_y();
        break;
    case 2:
        n = p->grid_size_z;
        d = __builtin_amdgcn_workgroup_size_z();
        break;
    default:
        n = 1;
        d = 1;
        break;
    }

    uint q = n / d;

    // Add one when the division left a remainder.
    return q + (n > q*d);
}

// Number of dimensions of the dispatch, taken directly from the packet's
// setup field.
ATTR uint
__ockl_get_work_dim(void)
{
    __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
    // XXX revisit this if setup field ever changes
    return p->setup;
}

// Work-group size as enqueued for dimension `dim`; 1 for dim > 2.
ATTR size_t
__ockl_get_enqueued_local_size(uint dim)
{
    switch(dim) {
    case 0:
        return __builtin_amdgcn_workgroup_size_x();
    case 1:
        return __builtin_amdgcn_workgroup_size_y();
    case 2:
        return __builtin_amdgcn_workgroup_size_z();
    default:
        return 1;
    }
}

// Linearized global id (global offset not included). The number of
// dimensions is taken from the packet's setup field; the per-dimension ids
// are computed in 32 bits and combined in size_t.
ATTR size_t
__ockl_get_global_linear_id(void)
{
    __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();

    // XXX revisit this if setup field ever changes
    switch (p->setup) {
    case 1:
        {
            uint l0 = __builtin_amdgcn_workitem_id_x();
            uint g0 = __builtin_amdgcn_workgroup_id_x();
            uint s0 = __builtin_amdgcn_workgroup_size_x();
            return g0*s0 + l0;
        }
    case 2:
        {
            uint l0 = __builtin_amdgcn_workitem_id_x();
            uint l1 = __builtin_amdgcn_workitem_id_y();
            uint g0 = __builtin_amdgcn_workgroup_id_x();
            uint g1 = __builtin_amdgcn_workgroup_id_y();
            uint s0 = __builtin_amdgcn_workgroup_size_x();
            uint s1 = __builtin_amdgcn_workgroup_size_y();
            uint n0 = p->grid_size_x;
            uint i0 = g0*s0 + l0;
            uint i1 = g1*s1 + l1;
            return (size_t)i1 * (size_t)n0 + i0;
        }
    case 3:
        {
            uint l0 = __builtin_amdgcn_workitem_id_x();
            uint l1 = __builtin_amdgcn_workitem_id_y();
            uint l2 = __builtin_amdgcn_workitem_id_z();
            uint g0 = __builtin_amdgcn_workgroup_id_x();
            uint g1 = __builtin_amdgcn_workgroup_id_y();
            uint g2 = __builtin_amdgcn_workgroup_id_z();
            uint s0 = __builtin_amdgcn_workgroup_size_x();
            uint s1 = __builtin_amdgcn_workgroup_size_y();
            uint s2 = __builtin_amdgcn_workgroup_size_z();
            uint n0 = p->grid_size_x;
            uint n1 = p->grid_size_y;
            uint i0 = g0*s0 + l0;
            uint i1 = g1*s1 + l1;
            uint i2 = g2*s2 + l2;
            return ((size_t)i2 * (size_t)n1 + (size_t)i1) * (size_t)n0 + i0;
        }
    default:
        return 0;
    }
}

// Linearized local id: (z * size_y + y) * size_x + x, using the work-group
// sizes reported by the workgroup_size builtins.
ATTR size_t
__ockl_get_local_linear_id(void)
{
    return (__builtin_amdgcn_workitem_id_z() * __builtin_amdgcn_workgroup_size_y() +
            __builtin_amdgcn_workitem_id_y()) * __builtin_amdgcn_workgroup_size_x() +
           __builtin_amdgcn_workitem_id_x();
}
ROCm-Device-Libs-rocm-5.0.0/oclc/000077500000000000000000000000001415221260100162505ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/oclc/CMakeLists.txt000066400000000000000000000012261415221260100210110ustar00rootroot00000000000000##===--------------------------------------------------------------------------
##                   ROCm Device Libraries
##
## This file is distributed under the University of Illinois Open Source
## License. See LICENSE.TXT for details.
##===--------------------------------------------------------------------------

# Every .cl file under src/ becomes its own bitcode library named
# oclc_<basename>. CONFIGURE_DEPENDS makes the build re-run the glob so that
# added or removed control files are noticed without a manual re-configure.
file(GLOB sources CONFIGURE_DEPENDS
  ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cl
)

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/inc)

foreach (file ${sources})
  get_filename_component(dir ${file} DIRECTORY)
  get_filename_component(name ${file} NAME_WE)
  get_filename_component(ext ${file} EXT)
  opencl_bc_lib(NAME oclc_${name} SOURCES ${file})
endforeach()
ROCm-Device-Libs-rocm-5.0.0/oclc/inc/000077500000000000000000000000001415221260100170215ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/oclc/inc/oclc.h000066400000000000000000000032651415221260100201200ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/ #ifndef OCLC_H #define OCLC_H // These constants are used to control behavior of the libraries which // check them. // // The current list of controls is as follows: // // __constant bool __oclc_finite_only_opt // - the application will only pass finite arguments and expects only finite results // // __constant bool __oclc_unsafe_math_opt // - the aopplication accepts optimizations that may lower the accuracy of the results // // __constant bool __oclc_daz_opt(void) // - the application allows subnormal inputs or outputs to be flushed to zero // // __constant bool __oclc_correctly_rounded_sqrt32(void) // - the application is expecting sqrt(float) to produce a correctly rounded result // // __constant int __oclc_ISA_version // - the ISA version of the target device // // it is expected that the implementation provides these as if declared from the following // C code: // // const bool int __oclc_... = 0; // Or 1 // // allowing them and any control flow associated with them to be optimized away extern const __constant bool __oclc_finite_only_opt; extern const __constant bool __oclc_unsafe_math_opt; extern const __constant bool __oclc_daz_opt; extern const __constant bool __oclc_correctly_rounded_sqrt32; extern const __constant bool __oclc_wavefrontsize64; extern const __constant int __oclc_ISA_version; #endif // OCLC_H ROCm-Device-Libs-rocm-5.0.0/oclc/src/000077500000000000000000000000001415221260100170375ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/oclc/src/correctly_rounded_sqrt_off.cl000066400000000000000000000006171415221260100250140ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "oclc.h" const __constant bool __oclc_correctly_rounded_sqrt32 = 0; ROCm-Device-Libs-rocm-5.0.0/oclc/src/correctly_rounded_sqrt_on.cl000066400000000000000000000006201415221260100246500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant bool __oclc_correctly_rounded_sqrt32 = 1; ROCm-Device-Libs-rocm-5.0.0/oclc/src/daz_opt_off.cl000066400000000000000000000005771415221260100216620ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant bool __oclc_daz_opt = 0; ROCm-Device-Libs-rocm-5.0.0/oclc/src/daz_opt_on.cl000066400000000000000000000005771415221260100215240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant bool __oclc_daz_opt = 1; ROCm-Device-Libs-rocm-5.0.0/oclc/src/finite_only_off.cl000066400000000000000000000006071415221260100225330ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant bool __oclc_finite_only_opt = 0; ROCm-Device-Libs-rocm-5.0.0/oclc/src/finite_only_on.cl000066400000000000000000000006071415221260100223750ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant bool __oclc_finite_only_opt = 1; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_1010.cl000066400000000000000000000006061415221260100223430ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 10100; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_1011.cl000066400000000000000000000006061415221260100223440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 10101; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_1012.cl000066400000000000000000000006061415221260100223450ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 10102; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_1013.cl000066400000000000000000000006061415221260100223460ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 10103; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_1030.cl000066400000000000000000000006061415221260100223450ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 10300; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_1031.cl000066400000000000000000000006061415221260100223460ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 10301; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_1032.cl000066400000000000000000000006061415221260100223470ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 10302; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_1033.cl000066400000000000000000000006061415221260100223500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 10303; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_1034.cl000066400000000000000000000006061415221260100223510ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 10304; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_1035.cl000066400000000000000000000006061415221260100223520ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 10305; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_600.cl000066400000000000000000000006051415221260100222660ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 6000; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_601.cl000066400000000000000000000006051415221260100222670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 6001; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_602.cl000066400000000000000000000006051415221260100222700ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 6002; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_700.cl000066400000000000000000000006051415221260100222670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 7000; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_701.cl000066400000000000000000000006051415221260100222700ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 7001; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_702.cl000066400000000000000000000006051415221260100222710ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 7002; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_703.cl000066400000000000000000000006051415221260100222720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 7003; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_704.cl000066400000000000000000000006051415221260100222730ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 7004; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_705.cl000066400000000000000000000006051415221260100222740ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 7005; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_801.cl000066400000000000000000000006051415221260100222710ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 8001; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_802.cl000066400000000000000000000006051415221260100222720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 8002; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_803.cl000066400000000000000000000006051415221260100222730ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 8003; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_805.cl000066400000000000000000000006051415221260100222750ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 8005; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_810.cl000066400000000000000000000006051415221260100222710ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 8100; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_900.cl000066400000000000000000000006051415221260100222710ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 9000; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_902.cl000066400000000000000000000006051415221260100222730ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 9002; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_904.cl000066400000000000000000000006051415221260100222750ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 9004; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_906.cl000066400000000000000000000006051415221260100222770ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 9006; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_908.cl000066400000000000000000000006041415221260100223000ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 9008; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_909.cl000066400000000000000000000006041415221260100223010ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 9009; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_90a.cl000066400000000000000000000006041415221260100223510ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 9010; ROCm-Device-Libs-rocm-5.0.0/oclc/src/isa_version_90c.cl000066400000000000000000000006041415221260100223530ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant int __oclc_ISA_version = 9012; ROCm-Device-Libs-rocm-5.0.0/oclc/src/unsafe_math_off.cl000066400000000000000000000006071415221260100225060ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant bool __oclc_unsafe_math_opt = 0; ROCm-Device-Libs-rocm-5.0.0/oclc/src/unsafe_math_on.cl000066400000000000000000000006071415221260100223500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant bool __oclc_unsafe_math_opt = 1; ROCm-Device-Libs-rocm-5.0.0/oclc/src/wavefrontsize64_off.cl000066400000000000000000000006071415221260100232740ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant bool __oclc_wavefrontsize64 = 0; ROCm-Device-Libs-rocm-5.0.0/oclc/src/wavefrontsize64_on.cl000066400000000000000000000006071415221260100231360ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" const __constant bool __oclc_wavefrontsize64 = 1; ROCm-Device-Libs-rocm-5.0.0/ocml/000077500000000000000000000000001415221260100162625ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/ocml/CMakeLists.txt000066400000000000000000000012161415221260100210220ustar00rootroot00000000000000##===-------------------------------------------------------------------------- ## ROCm Device Libraries ## ## This file is distributed under the University of Illinois Open Source ## License. See LICENSE.TXT for details. ##===-------------------------------------------------------------------------- file(GLOB sources ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cl ) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../irif/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../oclc/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) opencl_bc_lib(NAME ocml SOURCES ${sources}) ROCm-Device-Libs-rocm-5.0.0/ocml/inc/000077500000000000000000000000001415221260100170335ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/ocml/inc/ocml.h000066400000000000000000000675121415221260100201510ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #ifndef OCML_H #define OCML_H // This C header declares the functions provided by the OCML library // Aspects of this library's behavior can be controlled via the // oclc library. See the oclc header for further information // Define here the return values from fpclassify // These match most host definitions #define FP_NAN 0 #define FP_INFINITE 1 #define FP_ZERO 2 #define FP_SUBNORMAL 3 #define FP_NORMAL 4 #define _MANGLE3(P,N,S) P##_##N##_##S #define MANGLE3(P,N,S) _MANGLE3(P,N,S) #define OCML_MANGLE_F32(N) MANGLE3(__ocml, N, f32) #define OCML_MANGLE_2F32(N) MANGLE3(__ocml, N, 2f32) #define OCML_MANGLE_F64(N) MANGLE3(__ocml, N, f64) #define OCML_MANGLE_F16(N) MANGLE3(__ocml, N, f16) #define OCML_MANGLE_2F16(N) MANGLE3(__ocml, N, 2f16) #define DECL_OCML_UNARY_F32(N) extern float OCML_MANGLE_F32(N)(float); #define _DECL_X_OCML_UNARY_F32(A,N) extern __attribute__((A)) float OCML_MANGLE_F32(N)(float); #define DECL_PURE_OCML_UNARY_F32(N) _DECL_X_OCML_UNARY_F32(pure, N) #define DECL_CONST_OCML_UNARY_F32(N) _DECL_X_OCML_UNARY_F32(const, N) #define DECL_CONST_OCML_UNARYPRED_F32(N) extern __attribute__((const)) int OCML_MANGLE_F32(N)(float); #define DECL_OCML_BINARY_F32(N) extern float OCML_MANGLE_F32(N)(float, float); #define _DECL_X_OCML_BINARY_F32(A,N) extern __attribute__((A)) float OCML_MANGLE_F32(N)(float, float); #define DECL_PURE_OCML_BINARY_F32(N) _DECL_X_OCML_BINARY_F32(pure, N) #define DECL_CONST_OCML_BINARY_F32(N) _DECL_X_OCML_BINARY_F32(const, N) #define DECL_CONST_OCML_BINARYPRED_F32(N) extern __attribute__((const)) int OCML_MANGLE_F32(N)(float, float); #define _DECL_X_OCML_TERNARY_F32(A,N) extern __attribute__((A)) float OCML_MANGLE_F32(N)(float, float, float); #define DECL_PURE_OCML_TERNARY_F32(N) _DECL_X_OCML_TERNARY_F32(pure, N) #define DECL_CONST_OCML_TERNARY_F32(N) _DECL_X_OCML_TERNARY_F32(const, N) #define _DECL_X_OCML_TERNARY_2F32(A,N) extern 
__attribute__((A)) float2 OCML_MANGLE_2F32(N)(float2, float2, float2); #define DECL_PURE_OCML_TERNARY_2F32(N) _DECL_X_OCML_TERNARY_2F32(pure, N) #define DECL_CONST_OCML_TERNARY_2F32(N) _DECL_X_OCML_TERNARY_2F32(const, N) #define DECL_OCML_UNARY_F64(N) extern double OCML_MANGLE_F64(N)(double); #define _DECL_X_OCML_UNARY_F64(A,N) extern __attribute__((A)) double OCML_MANGLE_F64(N)(double); #define DECL_PURE_OCML_UNARY_F64(N) _DECL_X_OCML_UNARY_F64(pure, N) #define DECL_CONST_OCML_UNARY_F64(N) _DECL_X_OCML_UNARY_F64(const, N) #define DECL_CONST_OCML_UNARYPRED_F64(N) extern __attribute__((const)) int OCML_MANGLE_F64(N)(double); #define DECL_OCML_BINARY_F64(N) extern double OCML_MANGLE_F64(N)(double, double); #define _DECL_X_OCML_BINARY_F64(A,N) extern __attribute__((A)) double OCML_MANGLE_F64(N)(double, double); #define DECL_PURE_OCML_BINARY_F64(N) _DECL_X_OCML_BINARY_F64(pure, N) #define DECL_CONST_OCML_BINARY_F64(N) _DECL_X_OCML_BINARY_F64(const, N) #define DECL_CONST_OCML_BINARYPRED_F64(N) extern __attribute__((const)) int OCML_MANGLE_F64(N)(double, double); #define _DECL_X_OCML_TERNARY_F64(A,N) extern __attribute__((A)) double OCML_MANGLE_F64(N)(double, double, double); #define DECL_PURE_OCML_TERNARY_F64(N) _DECL_X_OCML_TERNARY_F64(pure, N) #define DECL_CONST_OCML_TERNARY_F64(N) _DECL_X_OCML_TERNARY_F64(const, N) #define DECL_OCML_UNARY_F16(N) extern half OCML_MANGLE_F16(N)(half); #define _DECL_X_OCML_UNARY_F16(A,N) extern __attribute__((A)) half OCML_MANGLE_F16(N)(half); #define DECL_PURE_OCML_UNARY_F16(N) _DECL_X_OCML_UNARY_F16(pure, N) #define DECL_CONST_OCML_UNARY_F16(N) _DECL_X_OCML_UNARY_F16(const, N) #define DECL_CONST_OCML_UNARYPRED_F16(N) extern __attribute__((const)) int OCML_MANGLE_F16(N)(half); #define DECL_OCML_BINARY_F16(N) extern half OCML_MANGLE_F16(N)(half, half); #define _DECL_X_OCML_BINARY_F16(A,N) extern __attribute__((A)) half OCML_MANGLE_F16(N)(half, half); #define DECL_PURE_OCML_BINARY_F16(N) _DECL_X_OCML_BINARY_F16(pure, N) #define 
DECL_CONST_OCML_BINARY_F16(N) _DECL_X_OCML_BINARY_F16(const, N) #define DECL_CONST_OCML_BINARYPRED_F16(N) extern __attribute__((const)) int OCML_MANGLE_F16(N)(half, half); #define _DECL_X_OCML_TERNARY_F16(A,N) extern __attribute__((A)) half OCML_MANGLE_F16(N)(half, half, half); #define DECL_PURE_OCML_TERNARY_F16(N) _DECL_X_OCML_TERNARY_F16(pure, N) #define DECL_CONST_OCML_TERNARY_F16(N) _DECL_X_OCML_TERNARY_F16(const, N) #define DECL_OCML_UNARY_2F16(N) extern half2 OCML_MANGLE_2F16(N)(half2); #define _DECL_X_OCML_UNARY_2F16(A,N) extern __attribute__((A)) half2 OCML_MANGLE_2F16(N)(half2); #define DECL_PURE_OCML_UNARY_2F16(N) _DECL_X_OCML_UNARY_2F16(pure, N) #define DECL_CONST_OCML_UNARY_2F16(N) _DECL_X_OCML_UNARY_2F16(const, N) #define DECL_CONST_OCML_UNARYPRED_2F16(N) extern __attribute__((const)) short2 OCML_MANGLE_2F16(N)(half2); #define DECL_OCML_BINARY_2F16(N) extern half2 OCML_MANGLE_2F16(N)(half2, half2); #define _DECL_X_OCML_BINARY_2F16(A,N) extern __attribute__((A)) half2 OCML_MANGLE_2F16(N)(half2, half2); #define DECL_PURE_OCML_BINARY_2F16(N) _DECL_X_OCML_BINARY_2F16(pure, N) #define DECL_CONST_OCML_BINARY_2F16(N) _DECL_X_OCML_BINARY_2F16(const, N) #define DECL_CONST_OCML_BINARYPRED_2F16(N) extern __attribute__((const)) short2 OCML_MANGLE_2F16(N)(half2, half2); #define _DECL_X_OCML_TERNARY_2F16(A,N) extern __attribute__((A)) half2 OCML_MANGLE_2F16(N)(half2, half2, half2); #define DECL_PURE_OCML_TERNARY_2F16(N) _DECL_X_OCML_TERNARY_2F16(pure, N) #define DECL_CONST_OCML_TERNARY_2F16(N) _DECL_X_OCML_TERNARY_2F16(const, N) DECL_CONST_OCML_UNARY_F32(acos) DECL_CONST_OCML_UNARY_F32(acospi) DECL_CONST_OCML_UNARY_F32(acosh) DECL_CONST_OCML_UNARY_F32(asin) DECL_CONST_OCML_UNARY_F32(asinpi) DECL_CONST_OCML_UNARY_F32(asinh) DECL_CONST_OCML_BINARY_F32(atan2) DECL_CONST_OCML_BINARY_F32(atan2pi) DECL_CONST_OCML_UNARY_F32(atan) DECL_CONST_OCML_UNARY_F32(atanh) DECL_CONST_OCML_UNARY_F32(atanpi) DECL_CONST_OCML_UNARY_F32(cbrt) DECL_CONST_OCML_UNARY_F32(ceil) 
DECL_OCML_UNARY_F32(cos) DECL_CONST_OCML_UNARY_F32(cosh) DECL_OCML_UNARY_F32(cospi) DECL_CONST_OCML_BINARY_F32(copysign) DECL_CONST_OCML_UNARY_F32(erf) DECL_CONST_OCML_UNARY_F32(erfc) DECL_CONST_OCML_UNARY_F32(erfinv) DECL_CONST_OCML_UNARY_F32(erfcinv) DECL_CONST_OCML_UNARY_F32(erfcx) DECL_CONST_OCML_UNARY_F32(exp) DECL_CONST_OCML_UNARY_F32(exp2) DECL_CONST_OCML_UNARY_F32(exp10) DECL_CONST_OCML_UNARY_F32(expm1) DECL_CONST_OCML_UNARY_F32(fabs) DECL_CONST_OCML_BINARY_F32(fdim) DECL_CONST_OCML_UNARY_F32(floor) DECL_CONST_OCML_TERNARY_F32(fma) DECL_CONST_OCML_TERNARY_2F32(fma) DECL_CONST_OCML_TERNARY_F32(fmuladd) DECL_CONST_OCML_TERNARY_2F32(fmuladd) DECL_CONST_OCML_BINARY_F32(fmax) DECL_CONST_OCML_BINARY_F32(fmin) DECL_CONST_OCML_BINARY_F32(fmod) DECL_CONST_OCML_UNARYPRED_F32(fpclassify) extern float OCML_MANGLE_F32(fract)(float, __private float *); extern float OCML_MANGLE_F32(frexp)(float, __private int *); DECL_CONST_OCML_BINARY_F32(hypot) DECL_CONST_OCML_UNARYPRED_F32(ilogb) DECL_CONST_OCML_UNARYPRED_F32(isfinite) DECL_CONST_OCML_UNARYPRED_F32(isinf) DECL_CONST_OCML_UNARYPRED_F32(isnan) DECL_CONST_OCML_UNARYPRED_F32(isnormal) DECL_CONST_OCML_UNARY_F32(i0) DECL_CONST_OCML_UNARY_F32(i1) DECL_CONST_OCML_UNARY_F32(j0) DECL_CONST_OCML_UNARY_F32(j1) extern __attribute__((const)) float OCML_MANGLE_F32(ldexp)(float, int); DECL_CONST_OCML_TERNARY_F32(len3) extern __attribute__((const)) float OCML_MANGLE_F32(len4)(float, float, float, float); DECL_CONST_OCML_UNARY_F32(lgamma) extern float OCML_MANGLE_F32(lgamma_r)(float, __private int *); DECL_CONST_OCML_UNARY_F32(log) DECL_CONST_OCML_UNARY_F32(log2) DECL_CONST_OCML_UNARY_F32(log10) DECL_CONST_OCML_UNARY_F32(log1p) DECL_CONST_OCML_UNARY_F32(logb) DECL_CONST_OCML_TERNARY_F32(mad) DECL_CONST_OCML_TERNARY_2F32(mad) DECL_CONST_OCML_BINARY_F32(max) DECL_CONST_OCML_BINARY_F32(min) DECL_CONST_OCML_BINARY_F32(maxmag) DECL_CONST_OCML_BINARY_F32(minmag) extern float OCML_MANGLE_F32(modf)(float, __private float *); extern 
__attribute__((const)) float OCML_MANGLE_F32(nan)(uint); DECL_CONST_OCML_UNARY_F32(ncdf) DECL_CONST_OCML_UNARY_F32(ncdfinv) DECL_CONST_OCML_UNARY_F32(nearbyint) DECL_CONST_OCML_BINARY_F32(nextafter) DECL_CONST_OCML_BINARY_F32(pow) DECL_CONST_OCML_BINARY_F32(powr) extern __attribute__((pure)) float OCML_MANGLE_F32(pown)(float, int); extern __attribute__((pure)) float OCML_MANGLE_F32(rootn)(float, int); DECL_CONST_OCML_BINARY_F32(remainder) extern float OCML_MANGLE_F32(remquo)(float, float, __private int *); DECL_CONST_OCML_BINARY_F32(rhypot) DECL_CONST_OCML_UNARY_F32(rint) DECL_CONST_OCML_TERNARY_F32(rlen3) extern __attribute__((const)) float OCML_MANGLE_F32(rlen4)(float, float, float, float); DECL_CONST_OCML_UNARY_F32(round) DECL_CONST_OCML_UNARY_F32(rcbrt) DECL_CONST_OCML_UNARY_F32(rsqrt) DECL_CONST_OCML_BINARY_F32(scalb) extern __attribute__((const)) float OCML_MANGLE_F32(scalbn)(float, int); DECL_CONST_OCML_UNARYPRED_F32(signbit) DECL_CONST_OCML_UNARY_F32(sin) DECL_CONST_OCML_UNARY_F32(sinh) DECL_CONST_OCML_UNARY_F32(sinpi) extern float OCML_MANGLE_F32(sincos)(float, __private float *); extern float OCML_MANGLE_F32(sincospi)(float, __private float *); DECL_CONST_OCML_UNARY_F32(sqrt) DECL_OCML_UNARY_F32(tan) DECL_CONST_OCML_UNARY_F32(tanpi) DECL_CONST_OCML_UNARY_F32(tanh) DECL_CONST_OCML_UNARY_F32(tgamma) DECL_CONST_OCML_UNARY_F32(trunc) DECL_CONST_OCML_UNARY_F32(y0) DECL_CONST_OCML_UNARY_F32(y1) DECL_CONST_OCML_BINARY_F32(add_rte) DECL_CONST_OCML_BINARY_F32(add_rtp) DECL_CONST_OCML_BINARY_F32(add_rtn) DECL_CONST_OCML_BINARY_F32(add_rtz) DECL_CONST_OCML_BINARY_F32(div_rte) DECL_CONST_OCML_BINARY_F32(div_rtp) DECL_CONST_OCML_BINARY_F32(div_rtn) DECL_CONST_OCML_BINARY_F32(div_rtz) DECL_CONST_OCML_TERNARY_F32(fma_rte) DECL_CONST_OCML_TERNARY_F32(fma_rtp) DECL_CONST_OCML_TERNARY_F32(fma_rtn) DECL_CONST_OCML_TERNARY_F32(fma_rtz) DECL_CONST_OCML_BINARY_F32(mul_rte) DECL_CONST_OCML_BINARY_F32(mul_rtp) DECL_CONST_OCML_BINARY_F32(mul_rtn) 
DECL_CONST_OCML_BINARY_F32(mul_rtz) DECL_CONST_OCML_UNARY_F32(sqrt_rte) DECL_CONST_OCML_UNARY_F32(sqrt_rtp) DECL_CONST_OCML_UNARY_F32(sqrt_rtn) DECL_CONST_OCML_UNARY_F32(sqrt_rtz) DECL_CONST_OCML_BINARY_F32(sub_rte) DECL_CONST_OCML_BINARY_F32(sub_rtp) DECL_CONST_OCML_BINARY_F32(sub_rtn) DECL_CONST_OCML_BINARY_F32(sub_rtz) DECL_CONST_OCML_UNARY_F64(acos) DECL_CONST_OCML_UNARY_F64(acosh) DECL_CONST_OCML_UNARY_F64(acospi) DECL_CONST_OCML_UNARY_F64(asin) DECL_CONST_OCML_UNARY_F64(asinh) DECL_CONST_OCML_UNARY_F64(asinpi) DECL_CONST_OCML_UNARY_F64(atan) DECL_CONST_OCML_UNARY_F64(atanh) DECL_CONST_OCML_UNARY_F64(atanpi) DECL_CONST_OCML_BINARY_F64(atan2) DECL_CONST_OCML_BINARY_F64(atan2pi) DECL_CONST_OCML_UNARY_F64(cbrt) DECL_CONST_OCML_UNARY_F64(ceil) DECL_CONST_OCML_BINARY_F64(copysign) DECL_CONST_OCML_UNARY_F64(cos) DECL_CONST_OCML_UNARY_F64(cosh) DECL_CONST_OCML_UNARY_F64(cospi) DECL_CONST_OCML_UNARY_F64(erf) DECL_CONST_OCML_UNARY_F64(erfc) DECL_CONST_OCML_UNARY_F64(erfinv) DECL_CONST_OCML_UNARY_F64(erfcinv) DECL_CONST_OCML_UNARY_F64(erfcx) DECL_CONST_OCML_UNARY_F64(exp) DECL_CONST_OCML_UNARY_F64(exp2) DECL_CONST_OCML_UNARY_F64(exp10) DECL_CONST_OCML_UNARY_F64(expm1) DECL_CONST_OCML_UNARY_F64(fabs) DECL_CONST_OCML_BINARY_F64(fdim) DECL_CONST_OCML_UNARY_F64(floor) DECL_CONST_OCML_TERNARY_F64(fma) DECL_CONST_OCML_TERNARY_F64(fmuladd) DECL_CONST_OCML_BINARY_F64(fmax) DECL_CONST_OCML_BINARY_F64(fmin) DECL_CONST_OCML_BINARY_F64(fmod) DECL_CONST_OCML_UNARYPRED_F64(fpclassify) extern double OCML_MANGLE_F64(fract)(double, __private double *); extern double OCML_MANGLE_F64(frexp)(double, __private int *); DECL_CONST_OCML_BINARY_F64(hypot) DECL_CONST_OCML_UNARYPRED_F64(ilogb) DECL_CONST_OCML_UNARYPRED_F64(isfinite) DECL_CONST_OCML_UNARYPRED_F64(isinf) DECL_CONST_OCML_UNARYPRED_F64(isnan) DECL_CONST_OCML_UNARYPRED_F64(isnormal) DECL_CONST_OCML_UNARY_F64(i0) DECL_CONST_OCML_UNARY_F64(i1) DECL_CONST_OCML_UNARY_F64(j0) DECL_CONST_OCML_UNARY_F64(j1) extern __attribute__((const)) 
double OCML_MANGLE_F64(ldexp)(double, int); DECL_CONST_OCML_TERNARY_F64(len3) extern __attribute__((const)) double OCML_MANGLE_F64(len4)(double, double, double, double); DECL_CONST_OCML_UNARY_F64(lgamma) extern double OCML_MANGLE_F64(lgamma_r)(double, __private int *); DECL_CONST_OCML_UNARY_F64(log) DECL_CONST_OCML_UNARY_F64(log2) DECL_CONST_OCML_UNARY_F64(log10) DECL_CONST_OCML_UNARY_F64(log1p) DECL_CONST_OCML_UNARY_F64(logb) DECL_CONST_OCML_TERNARY_F64(mad) DECL_CONST_OCML_BINARY_F64(max) DECL_CONST_OCML_BINARY_F64(min) DECL_CONST_OCML_BINARY_F64(maxmag) DECL_CONST_OCML_BINARY_F64(minmag) extern double OCML_MANGLE_F64(modf)(double, __private double *); extern __attribute__((const)) double OCML_MANGLE_F64(nan)(ulong); DECL_CONST_OCML_UNARY_F64(ncdf) DECL_CONST_OCML_UNARY_F64(ncdfinv) DECL_CONST_OCML_UNARY_F64(nearbyint) DECL_CONST_OCML_BINARY_F64(nextafter) DECL_CONST_OCML_BINARY_F64(pow) DECL_CONST_OCML_BINARY_F64(powr) extern __attribute__((pure)) double OCML_MANGLE_F64(pown)(double, int); extern __attribute__((pure)) double OCML_MANGLE_F64(rootn)(double, int); DECL_CONST_OCML_BINARY_F64(remainder) extern double OCML_MANGLE_F64(remquo)(double, double, __private int *); DECL_CONST_OCML_BINARY_F64(rhypot) DECL_CONST_OCML_UNARY_F64(rint) DECL_CONST_OCML_TERNARY_F64(rlen3) extern __attribute__((const)) double OCML_MANGLE_F64(rlen4)(double, double, double, double); DECL_CONST_OCML_UNARY_F64(round) DECL_CONST_OCML_UNARY_F64(rcbrt) DECL_CONST_OCML_UNARY_F64(rsqrt) DECL_CONST_OCML_BINARY_F64(scalb) extern __attribute__((const)) double OCML_MANGLE_F64(scalbn)(double, int); DECL_CONST_OCML_UNARYPRED_F64(signbit) DECL_CONST_OCML_UNARY_F64(sin) extern double OCML_MANGLE_F64(sincos)(double, __private double *); extern double OCML_MANGLE_F64(sincospi)(double, __private double *); DECL_CONST_OCML_UNARY_F64(sinh) DECL_CONST_OCML_UNARY_F64(sinpi) DECL_CONST_OCML_UNARY_F64(sqrt) DECL_CONST_OCML_UNARY_F64(tan) DECL_CONST_OCML_UNARY_F64(tanh) DECL_CONST_OCML_UNARY_F64(tanpi) 
DECL_CONST_OCML_UNARY_F64(tgamma) DECL_CONST_OCML_UNARY_F64(trunc) DECL_CONST_OCML_UNARY_F64(y0) DECL_CONST_OCML_UNARY_F64(y1) DECL_CONST_OCML_BINARY_F64(add_rte) DECL_CONST_OCML_BINARY_F64(add_rtp) DECL_CONST_OCML_BINARY_F64(add_rtn) DECL_CONST_OCML_BINARY_F64(add_rtz) DECL_CONST_OCML_BINARY_F64(div_rte) DECL_CONST_OCML_BINARY_F64(div_rtp) DECL_CONST_OCML_BINARY_F64(div_rtn) DECL_CONST_OCML_BINARY_F64(div_rtz) DECL_CONST_OCML_TERNARY_F64(fma_rte) DECL_CONST_OCML_TERNARY_F64(fma_rtp) DECL_CONST_OCML_TERNARY_F64(fma_rtn) DECL_CONST_OCML_TERNARY_F64(fma_rtz) DECL_CONST_OCML_BINARY_F64(mul_rte) DECL_CONST_OCML_BINARY_F64(mul_rtp) DECL_CONST_OCML_BINARY_F64(mul_rtn) DECL_CONST_OCML_BINARY_F64(mul_rtz) DECL_CONST_OCML_UNARY_F64(sqrt_rte) DECL_CONST_OCML_UNARY_F64(sqrt_rtp) DECL_CONST_OCML_UNARY_F64(sqrt_rtn) DECL_CONST_OCML_UNARY_F64(sqrt_rtz) DECL_CONST_OCML_BINARY_F64(sub_rte) DECL_CONST_OCML_BINARY_F64(sub_rtp) DECL_CONST_OCML_BINARY_F64(sub_rtn) DECL_CONST_OCML_BINARY_F64(sub_rtz) DECL_CONST_OCML_UNARY_F32(native_recip) DECL_CONST_OCML_UNARY_F64(native_recip) DECL_CONST_OCML_UNARY_F32(native_sqrt) DECL_CONST_OCML_UNARY_F64(native_sqrt) DECL_CONST_OCML_UNARY_F32(native_rsqrt) DECL_CONST_OCML_UNARY_F64(native_rsqrt) DECL_CONST_OCML_UNARY_F32(native_sin) DECL_CONST_OCML_UNARY_F64(native_sin) DECL_CONST_OCML_UNARY_F32(native_cos) DECL_CONST_OCML_UNARY_F64(native_cos) DECL_CONST_OCML_UNARY_F32(native_exp) DECL_CONST_OCML_UNARY_F64(native_exp) DECL_CONST_OCML_UNARY_F32(native_exp2) DECL_CONST_OCML_UNARY_F64(native_exp2) DECL_CONST_OCML_UNARY_F32(native_exp10) DECL_CONST_OCML_UNARY_F32(native_log) DECL_CONST_OCML_UNARY_F64(native_log) DECL_CONST_OCML_UNARY_F32(native_log2) DECL_CONST_OCML_UNARY_F64(native_log2) DECL_CONST_OCML_UNARY_F32(native_log10) DECL_CONST_OCML_UNARY_F64(native_log10) #pragma OPENCL EXTENSION cl_khr_fp16 : enable DECL_CONST_OCML_UNARY_F16(acos) DECL_CONST_OCML_UNARY_F16(acosh) DECL_CONST_OCML_UNARY_F16(acospi) DECL_CONST_OCML_UNARY_F16(asin) 
DECL_CONST_OCML_UNARY_F16(asinh) DECL_CONST_OCML_UNARY_F16(asinpi) DECL_CONST_OCML_UNARY_F16(atan) DECL_CONST_OCML_UNARY_F16(atanh) DECL_CONST_OCML_UNARY_F16(atanpi) DECL_CONST_OCML_BINARY_F16(atan2) DECL_CONST_OCML_BINARY_F16(atan2pi) DECL_CONST_OCML_UNARY_F16(cbrt) DECL_CONST_OCML_UNARY_F16(ceil) DECL_CONST_OCML_BINARY_F16(copysign) DECL_CONST_OCML_UNARY_F16(cos) DECL_CONST_OCML_UNARY_F16(cosh) DECL_CONST_OCML_UNARY_F16(cospi) DECL_CONST_OCML_UNARY_F16(erf) DECL_CONST_OCML_UNARY_F16(erfc) DECL_CONST_OCML_UNARY_F16(erfinv) DECL_CONST_OCML_UNARY_F16(erfcinv) DECL_CONST_OCML_UNARY_F16(erfcx) DECL_CONST_OCML_UNARY_F16(exp) DECL_CONST_OCML_UNARY_F16(exp2) DECL_CONST_OCML_UNARY_F16(exp10) DECL_CONST_OCML_UNARY_F16(expm1) DECL_CONST_OCML_UNARY_F16(fabs) DECL_CONST_OCML_BINARY_F16(fdim) DECL_CONST_OCML_UNARY_F16(floor) DECL_CONST_OCML_TERNARY_F16(fma) DECL_CONST_OCML_TERNARY_F16(fmuladd) DECL_CONST_OCML_TERNARY_F16(fma_rte) DECL_CONST_OCML_TERNARY_F16(fma_rtp) DECL_CONST_OCML_TERNARY_F16(fma_rtn) DECL_CONST_OCML_TERNARY_F16(fma_rtz) DECL_CONST_OCML_BINARY_F16(fmax) DECL_CONST_OCML_BINARY_F16(fmin) DECL_CONST_OCML_BINARY_F16(fmod) DECL_CONST_OCML_UNARYPRED_F16(fpclassify) extern half OCML_MANGLE_F16(fract)(half, __private half *); extern half OCML_MANGLE_F16(frexp)(half, __private int *); DECL_CONST_OCML_BINARY_F16(hypot) DECL_CONST_OCML_UNARYPRED_F16(ilogb) DECL_CONST_OCML_UNARYPRED_F16(isfinite) DECL_CONST_OCML_UNARYPRED_F16(isinf) DECL_CONST_OCML_UNARYPRED_F16(isnan) DECL_CONST_OCML_UNARYPRED_F16(isnormal) DECL_CONST_OCML_UNARY_F16(i0) DECL_CONST_OCML_UNARY_F16(i1) DECL_CONST_OCML_UNARY_F16(j0) DECL_CONST_OCML_UNARY_F16(j1) extern __attribute__((const)) half OCML_MANGLE_F16(ldexp)(half, int); DECL_CONST_OCML_TERNARY_F16(len3) extern __attribute__((const)) half OCML_MANGLE_F16(len4)(half, half, half, half); DECL_CONST_OCML_UNARY_F16(lgamma) extern half OCML_MANGLE_F16(lgamma_r)(half, __private int *); DECL_CONST_OCML_UNARY_F16(log) DECL_CONST_OCML_UNARY_F16(logb) 
DECL_CONST_OCML_UNARY_F16(log2) DECL_CONST_OCML_UNARY_F16(log10) DECL_CONST_OCML_UNARY_F16(log1p) DECL_CONST_OCML_TERNARY_F16(mad) DECL_CONST_OCML_BINARY_F16(max) DECL_CONST_OCML_BINARY_F16(min) DECL_CONST_OCML_BINARY_F16(maxmag) DECL_CONST_OCML_BINARY_F16(minmag) extern half OCML_MANGLE_F16(modf)(half, __private half *); extern __attribute__((const)) half OCML_MANGLE_F16(nan)(ushort); DECL_CONST_OCML_UNARY_F16(ncdf) DECL_CONST_OCML_UNARY_F16(ncdfinv) DECL_CONST_OCML_UNARY_F16(nearbyint) DECL_CONST_OCML_BINARY_F16(nextafter) DECL_CONST_OCML_BINARY_F16(pow) DECL_CONST_OCML_BINARY_F16(powr) extern __attribute__((pure)) half OCML_MANGLE_F16(pown)(half, int); extern __attribute__((pure)) half OCML_MANGLE_F16(rootn)(half, int); DECL_CONST_OCML_UNARY_F16(rcbrt) DECL_CONST_OCML_BINARY_F16(remainder) extern half OCML_MANGLE_F16(remquo)(half, half, __private int *); DECL_CONST_OCML_BINARY_F16(rhypot) DECL_CONST_OCML_UNARY_F16(rint) DECL_CONST_OCML_TERNARY_F16(rlen3) extern __attribute__((const)) half OCML_MANGLE_F16(rlen4)(half, half, half, half); DECL_CONST_OCML_UNARY_F16(round) DECL_CONST_OCML_UNARY_F16(rsqrt) DECL_CONST_OCML_BINARY_F16(scalb) extern __attribute__((const)) half OCML_MANGLE_F16(scalbn)(half, int); DECL_CONST_OCML_UNARYPRED_F16(signbit) DECL_CONST_OCML_UNARY_F16(sin) DECL_CONST_OCML_UNARY_F16(sinh) DECL_CONST_OCML_UNARY_F16(sinpi) extern half OCML_MANGLE_F16(sincos)(half, __private half *); extern half OCML_MANGLE_F16(sincospi)(half, __private half *); DECL_CONST_OCML_UNARY_F16(sqrt) DECL_CONST_OCML_UNARY_F16(sqrt_rte) DECL_CONST_OCML_UNARY_F16(sqrt_rtp) DECL_CONST_OCML_UNARY_F16(sqrt_rtn) DECL_CONST_OCML_UNARY_F16(sqrt_rtz) DECL_CONST_OCML_UNARY_F16(tan) DECL_CONST_OCML_UNARY_F16(tanpi) DECL_CONST_OCML_UNARY_F16(tanh) DECL_CONST_OCML_UNARY_F16(tgamma) DECL_CONST_OCML_UNARY_F16(trunc) DECL_CONST_OCML_UNARY_F16(y0) DECL_CONST_OCML_UNARY_F16(y1) DECL_CONST_OCML_BINARY_F16(add_rte) DECL_CONST_OCML_BINARY_F16(add_rtp) DECL_CONST_OCML_BINARY_F16(add_rtn) 
DECL_CONST_OCML_BINARY_F16(add_rtz) DECL_CONST_OCML_BINARY_F16(div_rte) DECL_CONST_OCML_BINARY_F16(div_rtp) DECL_CONST_OCML_BINARY_F16(div_rtn) DECL_CONST_OCML_BINARY_F16(div_rtz) DECL_CONST_OCML_TERNARY_F16(fma_rte) DECL_CONST_OCML_TERNARY_F16(fma_rtp) DECL_CONST_OCML_TERNARY_F16(fma_rtn) DECL_CONST_OCML_TERNARY_F16(fma_rtz) DECL_CONST_OCML_BINARY_F16(mul_rte) DECL_CONST_OCML_BINARY_F16(mul_rtp) DECL_CONST_OCML_BINARY_F16(mul_rtn) DECL_CONST_OCML_BINARY_F16(mul_rtz) DECL_CONST_OCML_UNARY_F16(sqrt_rte) DECL_CONST_OCML_UNARY_F16(sqrt_rtp) DECL_CONST_OCML_UNARY_F16(sqrt_rtn) DECL_CONST_OCML_UNARY_F16(sqrt_rtz) DECL_CONST_OCML_BINARY_F16(sub_rte) DECL_CONST_OCML_BINARY_F16(sub_rtp) DECL_CONST_OCML_BINARY_F16(sub_rtn) DECL_CONST_OCML_BINARY_F16(sub_rtz) // 2-vector functions DECL_CONST_OCML_UNARY_2F16(acos) DECL_CONST_OCML_UNARY_2F16(acosh) DECL_CONST_OCML_UNARY_2F16(acospi) DECL_CONST_OCML_UNARY_2F16(asin) DECL_CONST_OCML_UNARY_2F16(asinh) DECL_CONST_OCML_UNARY_2F16(asinpi) DECL_CONST_OCML_UNARY_2F16(atan) DECL_CONST_OCML_UNARY_2F16(atanh) DECL_CONST_OCML_UNARY_2F16(atanpi) DECL_CONST_OCML_BINARY_2F16(atan2) DECL_CONST_OCML_BINARY_2F16(atan2pi) DECL_CONST_OCML_UNARY_2F16(cbrt) DECL_CONST_OCML_UNARY_2F16(ceil) DECL_CONST_OCML_BINARY_2F16(copysign) DECL_CONST_OCML_UNARY_2F16(cos) DECL_CONST_OCML_UNARY_2F16(cosh) DECL_CONST_OCML_UNARY_2F16(cospi) DECL_CONST_OCML_UNARY_2F16(erf) DECL_CONST_OCML_UNARY_2F16(erfc) DECL_CONST_OCML_UNARY_2F16(erfinv) DECL_CONST_OCML_UNARY_2F16(erfcinv) DECL_CONST_OCML_UNARY_2F16(erfcx) DECL_CONST_OCML_UNARY_2F16(exp) DECL_CONST_OCML_UNARY_2F16(exp2) DECL_CONST_OCML_UNARY_2F16(exp10) DECL_CONST_OCML_UNARY_2F16(expm1) DECL_CONST_OCML_UNARY_2F16(fabs) DECL_CONST_OCML_BINARY_2F16(fdim) DECL_CONST_OCML_UNARY_2F16(floor) DECL_CONST_OCML_TERNARY_2F16(fma) DECL_CONST_OCML_TERNARY_2F16(fmuladd) DECL_CONST_OCML_TERNARY_2F16(fma_rte) DECL_CONST_OCML_TERNARY_2F16(fma_rtp) DECL_CONST_OCML_TERNARY_2F16(fma_rtn) DECL_CONST_OCML_TERNARY_2F16(fma_rtz) 
DECL_CONST_OCML_BINARY_2F16(fmax) DECL_CONST_OCML_BINARY_2F16(fmin) DECL_CONST_OCML_BINARY_2F16(fmod) DECL_CONST_OCML_UNARYPRED_2F16(fpclassify) extern half2 OCML_MANGLE_2F16(fract)(half2, __private half2 *); extern half2 OCML_MANGLE_2F16(frexp)(half2, __private int2 *); DECL_CONST_OCML_BINARY_2F16(hypot) extern __attribute__((const)) int2 OCML_MANGLE_2F16(ilogb)(half2); DECL_CONST_OCML_UNARYPRED_2F16(isfinite) DECL_CONST_OCML_UNARYPRED_2F16(isinf) DECL_CONST_OCML_UNARYPRED_2F16(isnan) DECL_CONST_OCML_UNARYPRED_2F16(isnormal) DECL_CONST_OCML_UNARY_2F16(i0) DECL_CONST_OCML_UNARY_2F16(i1) DECL_CONST_OCML_UNARY_2F16(j0) DECL_CONST_OCML_UNARY_2F16(j1) extern __attribute__((const)) half2 OCML_MANGLE_2F16(ldexp)(half2, int2); DECL_CONST_OCML_UNARY_2F16(lgamma) extern half2 OCML_MANGLE_2F16(lgamma_r)(half2, __private int2 *); DECL_CONST_OCML_UNARY_2F16(log) DECL_CONST_OCML_UNARY_2F16(logb) DECL_CONST_OCML_UNARY_2F16(log2) DECL_CONST_OCML_UNARY_2F16(log10) DECL_CONST_OCML_UNARY_2F16(log1p) DECL_CONST_OCML_TERNARY_2F16(mad) DECL_CONST_OCML_BINARY_2F16(max) DECL_CONST_OCML_BINARY_2F16(min) DECL_CONST_OCML_BINARY_2F16(maxmag) DECL_CONST_OCML_BINARY_2F16(minmag) extern half2 OCML_MANGLE_2F16(modf)(half2, __private half2 *); extern __attribute__((const)) half2 OCML_MANGLE_2F16(nan)(ushort2); DECL_CONST_OCML_UNARY_2F16(ncdf) DECL_CONST_OCML_UNARY_2F16(ncdfinv) DECL_CONST_OCML_UNARY_2F16(nearbyint) DECL_CONST_OCML_BINARY_2F16(nextafter) DECL_CONST_OCML_BINARY_2F16(pow) DECL_CONST_OCML_BINARY_2F16(powr) extern __attribute__((pure)) half2 OCML_MANGLE_2F16(pown)(half2, int2); extern __attribute__((pure)) half2 OCML_MANGLE_2F16(rootn)(half2, int2); DECL_CONST_OCML_UNARY_2F16(rcbrt) DECL_CONST_OCML_BINARY_2F16(remainder) extern half2 OCML_MANGLE_2F16(remquo)(half2, half2, __private int2 *); DECL_CONST_OCML_UNARY_2F16(rint) DECL_CONST_OCML_UNARY_2F16(round) DECL_CONST_OCML_UNARY_2F16(rsqrt) DECL_CONST_OCML_BINARY_2F16(scalb) extern __attribute__((const)) half2 
OCML_MANGLE_2F16(scalbn)(half2, int2); DECL_CONST_OCML_UNARYPRED_2F16(signbit) DECL_CONST_OCML_UNARY_2F16(sin) DECL_CONST_OCML_UNARY_2F16(sinh) DECL_CONST_OCML_UNARY_2F16(sinpi) extern half2 OCML_MANGLE_2F16(sincos)(half2, __private half2 *); extern half2 OCML_MANGLE_2F16(sincospi)(half2, __private half2 *); DECL_CONST_OCML_UNARY_2F16(sqrt) DECL_CONST_OCML_UNARY_2F16(sqrt_rte) DECL_CONST_OCML_UNARY_2F16(sqrt_rtp) DECL_CONST_OCML_UNARY_2F16(sqrt_rtn) DECL_CONST_OCML_UNARY_2F16(sqrt_rtz) DECL_CONST_OCML_UNARY_2F16(tan) DECL_CONST_OCML_UNARY_2F16(tanpi) DECL_CONST_OCML_UNARY_2F16(tanh) DECL_CONST_OCML_UNARY_2F16(tgamma) DECL_CONST_OCML_UNARY_2F16(trunc) DECL_CONST_OCML_UNARY_2F16(y0) DECL_CONST_OCML_UNARY_2F16(y1) DECL_CONST_OCML_BINARY_2F16(add_rte) DECL_CONST_OCML_BINARY_2F16(add_rtp) DECL_CONST_OCML_BINARY_2F16(add_rtn) DECL_CONST_OCML_BINARY_2F16(add_rtz) DECL_CONST_OCML_BINARY_2F16(div_rte) DECL_CONST_OCML_BINARY_2F16(div_rtp) DECL_CONST_OCML_BINARY_2F16(div_rtn) DECL_CONST_OCML_BINARY_2F16(div_rtz) DECL_CONST_OCML_TERNARY_2F16(fma_rte) DECL_CONST_OCML_TERNARY_2F16(fma_rtp) DECL_CONST_OCML_TERNARY_2F16(fma_rtn) DECL_CONST_OCML_TERNARY_2F16(fma_rtz) DECL_CONST_OCML_BINARY_2F16(mul_rte) DECL_CONST_OCML_BINARY_2F16(mul_rtp) DECL_CONST_OCML_BINARY_2F16(mul_rtn) DECL_CONST_OCML_BINARY_2F16(mul_rtz) DECL_CONST_OCML_UNARY_2F16(sqrt_rte) DECL_CONST_OCML_UNARY_2F16(sqrt_rtp) DECL_CONST_OCML_UNARY_2F16(sqrt_rtn) DECL_CONST_OCML_UNARY_2F16(sqrt_rtz) DECL_CONST_OCML_BINARY_2F16(sub_rte) DECL_CONST_OCML_BINARY_2F16(sub_rtp) DECL_CONST_OCML_BINARY_2F16(sub_rtn) DECL_CONST_OCML_BINARY_2F16(sub_rtz) DECL_CONST_OCML_UNARY_F16(native_recip) DECL_CONST_OCML_UNARY_F16(native_sqrt) DECL_CONST_OCML_UNARY_F16(native_rsqrt) DECL_CONST_OCML_UNARY_F16(native_sin) DECL_CONST_OCML_UNARY_F16(native_cos) DECL_CONST_OCML_UNARY_F16(native_exp2) DECL_CONST_OCML_UNARY_F16(native_log2) extern __attribute__((const)) float OCML_MANGLE_F32(cabs)(float2); extern __attribute__((const)) double 
OCML_MANGLE_F64(cabs)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(cacos)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(cacos)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(cacosh)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(cacosh)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(casin)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(casin)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(casinh)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(casinh)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(catan)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(catan)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(catanh)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(catanh)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(cexp)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(cexp)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(clog)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(clog)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(ccos)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(ccos)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(ccosh)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(ccosh)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(csin)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(csin)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(csinh)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(csinh)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(ctan)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(ctan)(double2); extern __attribute__((const)) float2 OCML_MANGLE_F32(ctanh)(float2); extern __attribute__((const)) double2 OCML_MANGLE_F64(ctanh)(double2); extern 
__attribute__((const)) float2 OCML_MANGLE_F32(csqrt)(float2);
extern __attribute__((const)) double2 OCML_MANGLE_F64(csqrt)(double2);

extern __attribute__((const)) float2 OCML_MANGLE_F32(cdiv)(float2, float2);
extern __attribute__((const)) double2 OCML_MANGLE_F64(cdiv)(double2, double2);

#pragma OPENCL EXTENSION cl_khr_fp16 : disable

#endif // OCML_H
ROCm-Device-Libs-rocm-5.0.0/ocml/src/000077500000000000000000000000001415221260100170515ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/ocml/src/acosD.cl000066400000000000000000000042501415221260100204230ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

CONSTATTR double
MATH_MANGLE(acos)(double x)
{
    // Computes arccos(x) in double precision.
    //
    // arccos(x) is invalid for abs(x) > 1.
    // Remaining argument ranges are handled as follows, with y = abs(x).
    //
    // For y < 0.5 use
    //     arccos(x) = pi/2 - arcsin(x)
    //               = pi/2 - (x + x^3*R(x^2))
    // where R(x^2) is a polynomial approximation to
    // (arcsin(x) - x)/x^3, evaluated by the nested MATH_MAD chain below.
    //
    // For y >= 0.5 exploit the half-angle identities
    //     arccos(x) = 2*arcsin(sqrt((1-x)/2))          for x > 0
    //     arccos(x) = pi - 2*arcsin(sqrt((1+x)/2))     for x < 0
    // together with the above polynomial approximation, and
    // reconstruct the terms carefully.

    double y = BUILTIN_ABS_F64(x);
    bool transform = y >= 0.5;

    // r is the polynomial argument: (1 - y)/2 on the transformed path,
    // y^2 otherwise.
    double rt = MATH_MAD(y, -0.5, 0.5);
    double y2 = y * y;
    double r = transform ? rt : y2;

    // u = r * R(r); the last coefficient (0x1.5555555555380p-3) is ~1/6,
    // the leading term of (arcsin(t) - t)/t^3.
    double u = r * MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                   MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                   MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                       0x1.059859fea6a70p-5, -0x1.0a5a378a05eafp-6), 0x1.4052137024d6ap-6), 0x1.ab3a098a70509p-8),
                       0x1.8ed60a300c8d2p-7), 0x1.c6fa84b77012bp-7), 0x1.1c6c111dccb70p-6), 0x1.6e89f0a0adacfp-6),
                       0x1.f1c72c668963fp-6), 0x1.6db6db41ce4bdp-5), 0x1.333333336fd5bp-4), 0x1.5555555555380p-3);

    // Small-argument result: pi/2 - (x + x*u).  The two leading constants
    // multiply to approximately pi/2, so pi/2 is formed inside the MAD
    // rather than loaded as a single rounded constant.
    double z = MATH_MAD(0x1.dd9ad336a0500p-1, 0x1.af154eeb562d6p+0, -MATH_MAD(x, u, x));

    if (transform) {
        // root2 (from ep.h) yields a double2 with .hi/.lo parts.
        // NOTE(review): presumably an extended-precision square root -- confirm.
        double2 s = root2(r);
        // x < 0: pi - 2*(s + s*u); the constants multiply to approximately pi.
        double zm = MATH_MAD(0x1.dd9ad336a0500p+0, 0x1.af154eeb562d6p+0, -2.0*MATH_MAD(s.hi, u, s.hi));
        // x > 0: 2*(s + s*u), folding in the low part of the square root.
        double zp = 2.0 * (s.hi + MATH_MAD(s.hi, u, s.lo));
        z = x < 0.0 ? zm : zp;
        // Exact end points: arccos(-1) = pi, arccos(1) = 0.
        z = x == -1.0 ? 0x1.921fb54442d18p+1 : z;
        z = x == 1.0 ? 0.0 : z;
    }

    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/acosF.cl000066400000000000000000000032761415221260100204340ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

CONSTATTR float
MATH_MANGLE(acos)(float x)
{
    // Computes arccos(x) in single precision.
    //
    // arccos(x) is invalid for abs(x) > 1, and
    // arccos(-x) = pi - arccos(x).
    // Remaining argument ranges are handled as follows.
    // For abs(x) <= 0.5 use
    //     arccos(x) = pi/2 - arcsin(x)
    //               = pi/2 - (x + x^3*R(x^2))
    // where R(x^2) is a polynomial minimax approximation to
    // (arcsin(x) - x)/x^3.
    // For abs(x) > 0.5 exploit the half-angle identities
    //     arccos(x) = 2*arcsin(sqrt((1-x)/2))          for x > 0
    //     arccos(x) = pi - 2*arcsin(sqrt((1+x)/2))     for x < 0
    // together with the above polynomial approximation, and
    // reconstruct the terms carefully.
float ax = BUILTIN_ABS_F32(x);

    // r is the polynomial argument: (1 - ax)/2 for ax > 0.5, ax^2 otherwise.
    float rt = MATH_MAD(-0.5f, ax, 0.5f);
    float x2 = ax * ax;
    float r = ax > 0.5f ? rt : x2;

    // u = r * R(r), with R approximating (arcsin(t) - t)/t^3.
    float u = r * MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                      0x1.38434ep-5f, 0x1.bf8bb4p-7f), 0x1.069878p-5f), 0x1.6c8362p-5f),
                      0x1.33379p-4f), 0x1.555558p-3f);

    // Transformed path: ztp = 2*arcsin(sqrt(r)) for x > 0, and
    // ztn = pi - ztp for x < 0 (the two constants multiply to about pi).
    float s = MATH_FAST_SQRT(r);
    float ztp = 2.0f * MATH_MAD(s, u, s);
    float ztn = MATH_MAD(0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -ztp);
    float zt = x < 0.0f ? ztn : ztp;

    // Direct path: pi/2 - (x + x*u).
    float z = MATH_MAD(0x1.ddcb02p-1f, 0x1.aee9d6p+0f, -MATH_MAD(x, u, x));
    z = ax > 0.5f ? zt : z;
    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/acosH.cl000066400000000000000000000030041415221260100204230ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

// NOTE(review): UGEN is defined outside this file; presumably it emits the
// half2 variant declared in ocml.h -- confirm.
CONSTATTR UGEN(acos)

CONSTATTR half
MATH_MANGLE(acos)(half x)
{
    // Computes arccos(x) in half precision.
    //
    // arccos(x) is invalid for abs(x) > 1, and
    // arccos(-x) = pi - arccos(x).
    // Remaining argument ranges are handled as follows.
    // For abs(x) <= 0.5 use
    //     arccos(x) = pi/2 - arcsin(x)
    //               = pi/2 - (x + x^3*R(x^2))
    // where R(x^2) is a polynomial approximation to
    // (arcsin(x) - x)/x^3.
    // For abs(x) > 0.5 exploit the half-angle identities
    //     arccos(x) = 2*arcsin(sqrt((1-x)/2))          for x > 0
    //     arccos(x) = pi - 2*arcsin(sqrt((1+x)/2))     for x < 0
    // together with the above polynomial approximation, and
    // reconstruct the terms carefully.

    half ax = BUILTIN_ABS_F16(x);

    half rt = MATH_MAD(-0.5h, ax, 0.5h);
    half x2 = ax * ax;
    half r = ax > 0.5h ? rt : x2;

    // Two-coefficient polynomial: u = r * (c1*r + c0).
    half u = r * MATH_MAD(r, 0x1.828p-4h, 0x1.52p-3h);

    half s = MATH_FAST_SQRT(r);
    half ztp = 2.0h * MATH_MAD(s, u, s);
    // pi - ztp; the two constants multiply to about pi.
    half ztn = MATH_MAD(0x1.ea8p+0h, 0x1.a3cp+0h, -ztp);
    half zt = x < 0.0h ? ztn : ztp;
    // pi/2 - (x + x*u).
    half z = MATH_MAD(0x1.ea8p-1h, 0x1.a3cp+0h, -MATH_MAD(x, u, x));
    z = ax > 0.5h ? zt : z;
    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/acoshD.cl000066400000000000000000000015301415221260100205710ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

// Logarithm helper defined in another translation unit.
// NOTE(review): presumably returns ln(a) + ea*ln(2) for the hi/lo pair a -- confirm.
extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea);

// Computes acosh(x) = ln(x + sqrt(x^2 - 1)) in double precision.
CONSTATTR double
MATH_MANGLE(acosh)(double x)
{
    // For x >= 2^512 the argument is pre-scaled by 2^-512 so that squaring
    // it cannot overflow; the exponent 512 is handed to lnep to compensate.
    bool b = x >= 0x1.0p+512;
    double s = b ? 0x1.0p-512 : 1.0;
    double sx = x * s;
    // a = sx + sqrt(sx^2 - s^2) = s * (x + sqrt(x^2 - 1)), built with the
    // ep.h pair helpers.
    double2 a = add(sx, root2(sub(sqr(sx), s*s)));
    double z = MATH_PRIVATE(lnep)(a, b ? 512 : 0);

    // Special cases, skipped when FINITE_ONLY_OPT() is true.
    if (!FINITE_ONLY_OPT()) {
        // +inf maps to itself.
        z = BUILTIN_CLASS_F64(x, CLASS_PINF) ? x : z;
        // Arguments below 1 are outside the domain: return a quiet NaN.
        z = x < 1.0 ? AS_DOUBLE(QNANBITPATT_DP64) : z;
    }

    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/acoshF.cl000066400000000000000000000015171415221260100206000ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

#define FLOAT_SPECIALIZATION
#include "ep.h"

// Logarithm helper defined in another translation unit.
// NOTE(review): presumably returns ln(a) + ea*ln(2) for the hi/lo pair a -- confirm.
extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea);

// Computes acosh(x) = ln(x + sqrt(x^2 - 1)) in single precision.
CONSTATTR float
MATH_MANGLE(acosh)(float x)
{
    // For x >= 2^64 the argument is pre-scaled by 2^-64 so that squaring
    // it cannot overflow; the exponent 64 is handed to lnep to compensate.
    bool b = x >= 0x1.0p+64f;
    float s = b ? 0x1.0p-64f : 1.0f;
    float sx = x * s;
    // a = sx + sqrt(sx^2 - s^2) = s * (x + sqrt(x^2 - 1)).
    float2 a = add(sx, root2(sub(sqr(sx), s*s)));
    float z = MATH_PRIVATE(lnep)(a, b ? 64 : 0);

    // Special cases, skipped when FINITE_ONLY_OPT() is true.
    if (!FINITE_ONLY_OPT()) {
        // +inf maps to itself.
        z = BUILTIN_CLASS_F32(x, CLASS_PINF) ? x : z;
        // Arguments below 1 are outside the domain: return a quiet NaN.
        z = x < 1.0f ? AS_FLOAT(QNANBITPATT_SP32) : z;
    }

    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/acoshH.cl000066400000000000000000000012661415221260100206030ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

// NOTE(review): UGEN is defined outside this file; presumably it emits the
// half2 variant declared in ocml.h -- confirm.
CONSTATTR UGEN(acosh)

// Computes acosh(hx) for a half argument by evaluating
// ln(x + sqrt(x^2 - 1)) in single precision and rounding the result.
CONSTATTR half
MATH_MANGLE(acosh)(half hx)
{
    half ret;
    float x = (float)hx;
    float t = x + BUILTIN_SQRT_F32(BUILTIN_MAD_F32(x, x, -1.0f));
    // ln(t) = log2(t) * ln(2); 0x1.62e430p-1f is ln(2).
    ret = (half)(BUILTIN_LOG2_F32(t) * 0x1.62e430p-1f);

    // Special case, skipped when FINITE_ONLY_OPT() is true.
    if (!FINITE_ONLY_OPT()) {
        // Arguments below 1 are outside the domain: return a quiet NaN.
        ret = hx < 1.0h ? AS_HALF((short)QNANBITPATT_HP16) : ret;
    }

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/acospiD.cl000066400000000000000000000041631415221260100207570ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

CONSTATTR double
MATH_MANGLE(acospi)(double x)
{
    // Computes arccos(x)/pi in double precision.
    //
    // arccos(x) is invalid for abs(x) > 1.
    // Remaining argument ranges are handled as follows, with y = abs(x).
    //
    // For y < 0.5 use
    //     arccos(x)/pi = 1/2 - arcsin(x)/pi
    //                  = 1/2 - (x/pi + x*u)
    // where u = x^2*Q(x^2) and Q is a polynomial approximation to
    // (arcsin(x) - x)/(pi*x^3), i.e. the 1/pi factor is folded into
    // the coefficients.
    //
    // For y >= 0.5 exploit the half-angle identities
    //     arccos(x) = 2*arcsin(sqrt((1-x)/2))          for x > 0
    //     arccos(x) = pi - 2*arcsin(sqrt((1+x)/2))     for x < 0
    // together with the above polynomial approximation, and
    // reconstruct the terms carefully.
double y = BUILTIN_ABS_F64(x);
    bool transform = y >= 0.5;

    // r is the polynomial argument: (1 - y)/2 on the transformed path,
    // y^2 otherwise.
    double rt = MATH_MAD(y, -0.5, 0.5);
    double y2 = y * y;
    double r = transform ? rt : y2;

    // u = r * Q(r); the last coefficient (0x1.b2995e7b7af0fp-5) is about
    // 1/(6*pi), so the result is already divided by pi.
    double u = r * MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                   MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                   MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                       0x1.547a51d41fb0bp-7, -0x1.6a3fb0718a8f7p-8), 0x1.a7b91f7177ee8p-8), 0x1.035d3435b8ad8p-9),
                       0x1.ff0549b4e0449p-9), 0x1.21604ae288f96p-8), 0x1.6a2b36f9aec49p-8), 0x1.d2b076c914f04p-8),
                       0x1.3ce53861f8f1fp-7), 0x1.d1a4529a30a69p-7), 0x1.8723a1d61d2e9p-6), 0x1.b2995e7b7af0fp-5);

    // 1/pi.
    const double piinv = 0x1.45f306dc9c883p-2;
    // Direct path: 1/2 - (x/pi + x*u).
    double z = 0.5 - MATH_MAD(x, u, piinv*x);

    if (transform) {
        // NOTE(review): ldx(.., 1) presumably scales the hi/lo square root
        // by 2^1, giving s = 2*sqrt(r) -- confirm against ep.h.
        double2 s = ldx(root2(r), 1);
        // x < 0: 1 - (s/pi + s*u).
        double zm = 1.0 - MATH_MAD(s.hi, u, piinv*s.hi);
        // x > 0: s/pi + s*u, summed with the ep.h pair helpers.
        double2 zp = fadd(mul(piinv, s), mul(s, u));
        z = x < 0.0 ? zm : zp.hi;
        // Exact end points: acospi(-1) = 1, acospi(1) = 0.
        z = x == -1.0 ? 1.0 : z;
        z = x == 1.0 ? 0.0 : z;
    }

    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/acospiF.cl000066400000000000000000000020331415221260100207530ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// Computes arccos(x)/pi in single precision, using the same range split
// as the other acos/acospi kernels: a direct polynomial for abs(x) <= 0.5
// and the half-angle form on sqrt((1 - abs(x))/2) otherwise.
CONSTATTR float
MATH_MANGLE(acospi)(float x)
{
    // 1/pi.
    const float piinv = 0x1.45f306p-2f;

    float ax = BUILTIN_ABS_F32(x);

    float rt = MATH_MAD(-0.5f, ax, 0.5f);
    float x2 = ax * ax;
    float r = ax > 0.5f ? rt : x2;

    // u = r * Q(r); the coefficients already carry the 1/pi factor
    // (the last one, 0x1.b29c5ap-5f, is about 1/(6*pi)).
    float u = r * MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                      -0x1.3f1c6cp-8f, 0x1.2ac560p-6f), 0x1.80aab4p-8f), 0x1.e53378p-7f),
                      0x1.86680ap-6f), 0x1.b29c5ap-5f);

    float s = MATH_FAST_SQRT(r);
    // x > 0: 2*(s/pi + s*u); x < 0: 1 minus that.
    float ztp = 2.0f * MATH_MAD(s, u, piinv*s);
    float ztn = 1.0f - ztp;
    float zt = x < 0.0f ? ztn : ztp;
    // Direct path: 1/2 - (x/pi + x*u).
    float z = 0.5f - MATH_MAD(x, u, piinv*x);
    z = ax > 0.5f ? zt : z;
    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/acospiH.cl000066400000000000000000000027751415221260100207720ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

// NOTE(review): UGEN is defined outside this file; presumably it emits the
// half2 variant declared in ocml.h -- confirm.
CONSTATTR UGEN(acospi)

CONSTATTR half
MATH_MANGLE(acospi)(half x)
{
    // Computes arccos(x)/pi in half precision.
    //
    // arccos(x) is invalid for abs(x) > 1, and
    // arccos(-x) = pi - arccos(x).
    // Remaining argument ranges are handled as follows.
    // For abs(x) <= 0.5 use
    //     arccos(x)/pi = 1/2 - arcsin(x)/pi
    //                  = 1/2 - (x/pi + x^3*Q(x^2))
    // where Q(x^2) is a polynomial approximation to
    // (arcsin(x) - x)/(pi*x^3).
    // For abs(x) > 0.5 exploit the half-angle identities
    //     arccos(x) = 2*arcsin(sqrt((1-x)/2))          for x > 0
    //     arccos(x) = pi - 2*arcsin(sqrt((1+x)/2))     for x < 0
    // together with the above polynomial approximation, and
    // reconstruct the terms carefully.

    // 1/pi rounded to half precision.
    const half piinv = 0x1.46p-2h;

    half ax = BUILTIN_ABS_F16(x);

    half rt = MATH_MAD(-0.5h, ax, 0.5h);
    half x2 = ax * ax;
    half r = ax > 0.5h ? rt : x2;

    // Two-coefficient polynomial: u = r * (c1*r + c0).
    half u = r * MATH_MAD(r, 0x1.0b8p-5h, 0x1.a7cp-5h);

    half s = MATH_FAST_SQRT(r);
    half ztp = 2.0h * MATH_MAD(s, u, piinv*s);
    half ztn = 1.0h - ztp;
    half zt = x < 0.0h ? ztn : ztp;
    half z = 0.5h - MATH_MAD(x, u, piinv*x);
    z = ax > 0.5h ? zt : z;
    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/addD.cl000066400000000000000000000010621415221260100202240ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathD.h"

// GEN(LN,UN) defines the double-precision function MATH_MANGLE(LN)(x, y) as
// a thin wrapper around BUILTIN_<UN>_F64(x, y).
#define GEN(LN,UN) \
CONSTATTR double \
MATH_MANGLE(LN)(double x, double y) \
{ \
    return BUILTIN_##UN##_F64(x, y); \
}

// Every instantiation below is commented out, so this file currently
// defines no functions.
// NOTE(review): ocml.h still declares add_rte/add_rtn/add_rtp/add_rtz for
// F64 -- confirm whether these are meant to stay disabled.
// GEN(add_rte,ADD_RTE)
// GEN(add_rtn,ADD_RTN)
// GEN(add_rtp,ADD_RTP)
// GEN(add_rtz,ADD_RTZ)

ROCm-Device-Libs-rocm-5.0.0/ocml/src/addF.cl000066400000000000000000000010571415221260100202320ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// GEN(LN,UN) defines the single-precision function MATH_MANGLE(LN)(x, y) as
// a thin wrapper around BUILTIN_<UN>_F32(x, y).
#define GEN(LN,UN) \
CONSTATTR float \
MATH_MANGLE(LN)(float x, float y) \
{ \
    return BUILTIN_##UN##_F32(x, y); \
}

// Every instantiation below is commented out, so this file currently
// defines no functions.
// NOTE(review): ocml.h still declares add_rte/add_rtn/add_rtp/add_rtz for
// F32 -- confirm whether these are meant to stay disabled.
// GEN(add_rte,ADD_RTE)
// GEN(add_rtn,ADD_RTN)
// GEN(add_rtp,ADD_RTP)
// GEN(add_rtz,ADD_RTZ)

ROCm-Device-Libs-rocm-5.0.0/ocml/src/addH.cl000066400000000000000000000010541415221260100202310ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

// GEN(LN,UN) defines the half-precision function MATH_MANGLE(LN)(x, y) as
// a thin wrapper around BUILTIN_<UN>_F16(x, y).
#define GEN(LN,UN) \
CONSTATTR half \
MATH_MANGLE(LN)(half x, half y) \
{ \
    return BUILTIN_##UN##_F16(x, y); \
}

// Every instantiation below is commented out, so this file currently
// defines no functions.
// NOTE(review): ocml.h still declares add_rte/add_rtn/add_rtp/add_rtz for
// F16 -- confirm whether these are meant to stay disabled.
// GEN(add_rte,ADD_RTE)
// GEN(add_rtn,ADD_RTN)
// GEN(add_rtp,ADD_RTP)
// GEN(add_rtz,ADD_RTZ)

ROCm-Device-Libs-rocm-5.0.0/ocml/src/asinD.cl000066400000000000000000000040331415221260100204270ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

CONSTATTR double
MATH_MANGLE(asin)(double x)
{
    // Computes arcsin(x) in double precision.
    //
    // arcsin(x) is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x),
    // so the work is done on y = abs(x) and the sign of x is restored
    // at the end.
    // For y < 0.5 use
    //     arcsin(y) = y + y^3*R(y^2)
    // where R(y^2) is a polynomial approximation to
    // (arcsin(y) - y)/y^3, evaluated by the nested MATH_MAD chain below.
    // For y >= 0.5 exploit the identity
    //     arcsin(y) = pi/2 - 2*arcsin(sqrt((1-y)/2))
    // together with the above polynomial approximation, and
    // reconstruct the terms carefully.

    double y = BUILTIN_ABS_F64(x);
    bool transform = y >= 0.5;

    // r is the polynomial argument: (1 - y)/2 on the transformed path,
    // y^2 otherwise.
    double rt = MATH_MAD(y, -0.5, 0.5);
    double y2 = y * y;
    double r = transform ? rt : y2;

    // u = r * R(r); the last coefficient (0x1.5555555555380p-3) is ~1/6.
    double u = r * MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                   MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                   MATH_MAD(r, MATH_MAD(r, MATH_MAD(r,
                       0x1.059859fea6a70p-5, -0x1.0a5a378a05eafp-6), 0x1.4052137024d6ap-6), 0x1.ab3a098a70509p-8),
                       0x1.8ed60a300c8d2p-7), 0x1.c6fa84b77012bp-7), 0x1.1c6c111dccb70p-6), 0x1.6e89f0a0adacfp-6),
                       0x1.f1c72c668963fp-6), 0x1.6db6db41ce4bdp-5), 0x1.333333336fd5bp-4), 0x1.5555555555380p-3);

    // Direct path: y + y*u.
    double v = MATH_MAD(y, u, y);

    if (transform) {
        // root2 (from ep.h) yields a double2 with .hi/.lo parts.
        // NOTE(review): presumably an extended-precision square root -- confirm.
        double2 s = root2(r);
        // ve = pi/4 - (s + s*u), with pi/4 supplied as a hi/lo constant
        // pair; doubling it gives pi/2 - 2*arcsin(sqrt(r)).
        double2 ve = fsub(con(0x1.921fb54442d18p-1, 0x1.1a62633145c07p-55),
                          fadd(s, mul(s, u)));
        v = ve.hi + ve.hi;
        // Exact end point: arcsin(1) = pi/2.
        v = y == 1.0 ? 0x1.921fb54442d18p+0 : v;
    }

    return BUILTIN_COPYSIGN_F64(v, x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/asinF.cl000066400000000000000000000031301415221260100204260ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(asin)(float x) { // Computes arcsin(x). // The argument is first reduced by noting that arcsin(x) // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). // For denormal and small arguments arcsin(x) = x to machine // accuracy. Remaining argument ranges are handled as follows. // For abs(x) <= 0.5 use // arcsin(x) = x + x^3*R(x^2) // where R(x^2) is a polynomial minimax approximation to // (arcsin(x) - x)/x^3. // For abs(x) > 0.5 exploit the identity: // arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2) // together with the above polynomial approximation, and // reconstruct the terms carefully. float ax = BUILTIN_ABS_F32(x); float tx = MATH_MAD(ax, -0.5f, 0.5f); float x2 = x*x; float r = ax >= 0.5f ? tx : x2; float u = r * MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, 0x1.38434ep-5f, 0x1.bf8bb4p-7f), 0x1.069878p-5f), 0x1.6c8362p-5f), 0x1.33379p-4f), 0x1.555558p-3f); float s = MATH_FAST_SQRT(r); float ret = MATH_MAD(0x1.ddcb02p-1f, 0x1.aee9d6p+0f, -2.0f*MATH_MAD(s, u, s)); float xux = MATH_MAD(ax, u, ax); ret = ax < 0.5f ? xux : ret; return BUILTIN_COPYSIGN_F32(ret, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/asinH.cl000066400000000000000000000027731415221260100204440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(asin) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(asin)(half x) { // Computes arcsin(x). // The argument is first reduced by noting that arcsin(x) // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). // For denormal and small arguments arcsin(x) = x to machine // accuracy. 
Remaining argument ranges are handled as follows. // For abs(x) <= 0.5 use // arcsin(x) = x + x^3*R(x^2) // where R(x^2) is a polynomial minimax approximation to // (arcsin(x) - x)/x^3. // For abs(x) > 0.5 exploit the identity: // arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2) // together with the above polynomial approximation, and // reconstruct the terms carefully. half ax = BUILTIN_ABS_F16(x); half r; if (ax <= 0.5h) { half s = x * x; half p = s * MATH_MAD(s, 0x1.828p-4h, 0x1.52p-3h); r = MATH_MAD(ax, p, ax); } else { float s = BUILTIN_MAD_F32((float)ax, -0.5f, 0.5f); float t = BUILTIN_SQRT_F32(s); float p = BUILTIN_MAD_F32(t, BUILTIN_MAD_F32(s, -0x1.82675ap-2f, -0x1.ff9f6p+0f), 0x1.921fb6p+0f); r = (half)p; } return BUILTIN_COPYSIGN_F16(r, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/asinhD.cl000066400000000000000000000015751415221260100206070ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea); CONSTATTR double MATH_MANGLE(asinh)(double x) { double y = BUILTIN_ABS_F64(x); bool b = y >= 0x1.0p+512; double s = b ? 0x1.0p-512 : 1.0; double sy = y * s; double2 a = add(sy, root2(add(sqr(sy), s*s))); double z = MATH_PRIVATE(lnep)(a, b ? 512 : 0); z = y < 0x1.0p-27 ? y : z; if (!FINITE_ONLY_OPT()) { z = BUILTIN_CLASS_F64(y, CLASS_PINF) ? 
y : z; } return BUILTIN_COPYSIGN_F64(z, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/asinhF.cl000066400000000000000000000015641415221260100206070ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #define FLOAT_SPECIALIZATION #include "ep.h" extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea); CONSTATTR float MATH_MANGLE(asinh)(float x) { float y = BUILTIN_ABS_F32(x); bool b = y >= 0x1.0p+64f; float s = b ? 0x1.0p-64f : 1.0f; float sy = y * s; float2 a = add(sy, root2(add(sqr(sy), s*s))); float z = MATH_PRIVATE(lnep)(a, b ? 64 : 0); z = y < 0x1.0p-12f ? y : z; if (!FINITE_ONLY_OPT()) { z = BUILTIN_CLASS_F32(y, CLASS_PINF) ? y : z; } return BUILTIN_COPYSIGN_F32(z, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/asinhH.cl000066400000000000000000000014171415221260100206060ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(asinh) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(asinh)(half hx) { half ret; float x = (float)BUILTIN_ABS_F16(hx); float t = x + BUILTIN_SQRT_F32(BUILTIN_MAD_F32(x, x, 1.0f)); ret = BUILTIN_COPYSIGN_F16((half)(BUILTIN_LOG2_F32(t) * 0x1.62e430p-1f), hx); if (!FINITE_ONLY_OPT()) { ret = BUILTIN_CLASS_F16(hx, CLASS_NINF|CLASS_PINF|CLASS_QNAN|CLASS_SNAN) ? 
hx : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/asinpiD.cl000066400000000000000000000040371415221260100207640ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" CONSTATTR double MATH_MANGLE(asinpi)(double x) { // Computes arcsin(x). // The argument is first reduced by noting that arcsin(x) // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). // For denormal and small arguments arcsin(x) = x to machine // accuracy. Remaining argument ranges are handled as follows. // For abs(x) <= 0.5 use // arcsin(x) = x + x^3*R(x^2) // where R(x^2) is a rational minimax approximation to // (arcsin(x) - x)/x^3. // For abs(x) > 0.5 exploit the identity: // arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2) // together with the above rational approximation, and // reconstruct the terms carefully. double y = BUILTIN_ABS_F64(x); bool transform = y >= 0.5; double rt = MATH_MAD(y, -0.5, 0.5); double y2 = y * y; double r = transform ? rt : y2; double u = r * MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, 0x1.547a51d41fb0bp-7, -0x1.6a3fb0718a8f7p-8), 0x1.a7b91f7177ee8p-8), 0x1.035d3435b8ad8p-9), 0x1.ff0549b4e0449p-9), 0x1.21604ae288f96p-8), 0x1.6a2b36f9aec49p-8), 0x1.d2b076c914f04p-8), 0x1.3ce53861f8f1fp-7), 0x1.d1a4529a30a69p-7), 0x1.8723a1d61d2e9p-6), 0x1.b2995e7b7af0fp-5); const double piinv = 0x1.45f306dc9c883p-2; double v = MATH_MAD(y, piinv, y*u); if (transform) { double2 s = ldx(root2(r), 1); double2 ve = fsub(0.5, fadd(mul(piinv, s), mul(s, u))); v = ve.hi; v = y == 1.0 ? 
0.5 : v; } return BUILTIN_COPYSIGN_F64(v, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/asinpiF.cl000066400000000000000000000031751415221260100207700ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(asinpi)(float x) { // Computes arcsin(x). // The argument is first reduced by noting that arcsin(x) // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). // For denormal and small arguments arcsin(x) = x to machine // accuracy. Remaining argument ranges are handled as follows. // For abs(x) <= 0.5 use // arcsin(x) = x + x^3*R(x^2) // where R(x^2) is a polynomial minimax approximation to // (arcsin(x) - x)/x^3. // For abs(x) > 0.5 exploit the identity: // arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2) // together with the above polynomial approximation, and // reconstruct the terms carefully. const float piinv = 0x1.45f306p-2f; float ax = BUILTIN_ABS_F32(x); float tx = MATH_MAD(ax, -0.5f, 0.5f); float x2 = ax * ax; float r = ax >= 0.5f ? tx : x2; float u = r * MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, -0x1.3f1c6cp-8f, 0x1.2ac560p-6f), 0x1.80aab4p-8f), 0x1.e53378p-7f), 0x1.86680ap-6f), 0x1.b29c5ap-5f); float s = MATH_FAST_SQRT(r); float ret = MATH_MAD(-2.0f, MATH_MAD(s, u, piinv*s), 0.5f); float xux = MATH_MAD(piinv, ax, ax*u); ret = ax >= 0.5f ? ret : xux; return BUILTIN_COPYSIGN_F32(ret, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/asinpiH.cl000066400000000000000000000030711415221260100207650ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(asinpi) CONSTATTR half MATH_MANGLE(asinpi)(half x) { // Computes arcsin(x). // The argument is first reduced by noting that arcsin(x) // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). // For denormal and small arguments arcsin(x) = x to machine // accuracy. Remaining argument ranges are handled as follows. // For abs(x) <= 0.5 use // arcsin(x) = x + x^3*R(x^2) // where R(x^2) is a polynomial minimax approximation to // (arcsin(x) - x)/x^3. // For abs(x) > 0.5 exploit the identity: // arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2) // together with the above polynomial approximation, and // reconstruct the terms carefully. const half piinv = 0x1.45f306p-2h; half ax = BUILTIN_ABS_F16(x); half r; if (ax <= 0.5h) { half s = x * x; r = ax * MATH_MAD(s, MATH_MAD(s, 0x1.0b8p-5h, 0x1.a7cp-5h), 0x1.46p-2h); } else { float s = BUILTIN_MAD_F32((float)ax, -0.5f, 0.5f); float t = BUILTIN_SQRT_F32(s); float p = BUILTIN_MAD_F32(t, BUILTIN_MAD_F32(s, BUILTIN_MAD_F32(s, -0x1.f4b736p-5f, -0x1.ad0826p-4f), -0x1.45f5a8p-1f), 0.5f); r = (half)p; } return BUILTIN_COPYSIGN_F16(r, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atan2D.cl000066400000000000000000000025541415221260100205100ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" extern CONSTATTR double MATH_PRIVATE(atanred)(double); CONSTATTR double MATH_MANGLE(atan2)(double y, double x) { const double pi = 0x1.921fb54442d18p+1; const double piby2 = 0x1.921fb54442d18p+0; const double piby4 = 0x1.921fb54442d18p-1; const double threepiby4 = 0x1.2d97c7f3321d2p+1; double ay = BUILTIN_ABS_F64(y); double ax = BUILTIN_ABS_F64(x); double u = BUILTIN_MAX_F64(ax, ay); double v = BUILTIN_MIN_F64(ax, ay); double vbyu = MATH_DIV(v, u); double a = MATH_PRIVATE(atanred)(vbyu); bool xneg = AS_INT2(x).y < 0; double t = piby2 - a; a = ax < ay ? t : a; t = pi - a; a = xneg ? t : a; t = xneg ? pi : 0.0; a = y == 0.0 ? t : a; if (!FINITE_ONLY_OPT()) { t = xneg ? threepiby4 : piby4; t = BUILTIN_COPYSIGN_F64(t, y); a = (BUILTIN_ISINF_F64(x) & BUILTIN_ISINF_F64(y)) ? t : a; a = (BUILTIN_ISNAN_F64(x) | BUILTIN_ISNAN_F64(y)) ? AS_DOUBLE(QNANBITPATT_DP64) : a; } return BUILTIN_COPYSIGN_F64(a, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atan2F.cl000066400000000000000000000027271415221260100205140ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" extern CONSTATTR float MATH_PRIVATE(atanred)(float); CONSTATTR float MATH_MANGLE(atan2)(float y, float x) { const float pi = 0x1.921fb6p+1f; const float piby2 = 0x1.921fb6p+0f; const float piby4 = 0x1.921fb6p-1f; const float threepiby4 = 0x1.2d97c8p+1f; float ax = BUILTIN_ABS_F32(x); float ay = BUILTIN_ABS_F32(y); float v = BUILTIN_MIN_F32(ax, ay); float u = BUILTIN_MAX_F32(ax, ay); float vbyu; if (DAZ_OPT()) { float s = u > 0x1.0p+96f ? 
0x1.0p-32f : 1.0f; vbyu = s * MATH_FAST_DIV(v, s*u); } else { vbyu = MATH_DIV(v, u); } float a = MATH_PRIVATE(atanred)(vbyu); float t = piby2 - a; a = ay > ax ? t : a; t = pi - a; a = x < 0.0f ? t : a; t = AS_INT(x) < 0 ? pi : 0.0f; a = y == 0.0f ? t : a; if (!FINITE_ONLY_OPT()) { // x and y are +- Inf t = x < 0.0f ? threepiby4 : piby4; a = (BUILTIN_ISINF_F32(x) & BUILTIN_ISINF_F32(y)) ? t : a; // x or y is NaN a = (BUILTIN_ISNAN_F32(x) | BUILTIN_ISNAN_F32(y)) ? AS_FLOAT(QNANBITPATT_SP32) : a; } return BUILTIN_COPYSIGN_F32(a, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atan2H.cl000066400000000000000000000025461415221260100205150ustar00rootroot00000000000000 /*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" extern CONSTATTR half MATH_PRIVATE(atanred)(half); CONSTATTR BGEN(atan2) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(atan2)(half y, half x) { const half pi = 0x1.921fb6p+1h; const half piby2 = 0x1.921fb6p+0h; const half piby4 = 0x1.921fb6p-1h; const half threepiby4 = 0x1.2d97c8p+1h; half ax = BUILTIN_ABS_F16(x); half ay = BUILTIN_ABS_F16(y); half v = BUILTIN_MIN_F16(ax, ay); half u = BUILTIN_MAX_F16(ax, ay); half vbyu = MATH_DIV(v, u); half a = MATH_PRIVATE(atanred)(vbyu); half t = piby2 - a; a = ay > ax ? t : a; t = pi - a; a = x < 0.0h ? t : a; t = AS_SHORT(x) < 0 ? pi : 0.0h; a = y == 0.0h ? t : a; if (!FINITE_ONLY_OPT()) { // x and y are +- Inf t = x < 0.0h ? threepiby4 : piby4; a = (BUILTIN_ISINF_F16(x) & BUILTIN_ISINF_F16(y)) ? t : a; // x or y is NaN a = (BUILTIN_ISNAN_F16(x) | BUILTIN_ISNAN_F16(y)) ? 
AS_HALF((short)QNANBITPATT_HP16) : a; } return BUILTIN_COPYSIGN_F16(a, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atan2piD.cl000066400000000000000000000023311415221260100210320ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" extern CONSTATTR double MATH_PRIVATE(atanpired)(double); CONSTATTR double MATH_MANGLE(atan2pi)(double y, double x) { const double pi = 0x1.921fb54442d18p+1; double ay = BUILTIN_ABS_F64(y); double ax = BUILTIN_ABS_F64(x); double u = BUILTIN_MAX_F64(ax, ay); double v = BUILTIN_MIN_F64(ax, ay); double vbyu = MATH_DIV(v, u); double a = MATH_PRIVATE(atanpired)(vbyu); bool xneg = AS_INT2(x).y < 0; double t = 0.5 - a; a = ax < ay ? t : a; t = 1.0 - a; a = xneg ? t : a; t = xneg ? 1.0 : 0.0; a = y == 0.0 ? t : a; if (!FINITE_ONLY_OPT()) { t = xneg ? 0.75 : 0.25; t = BUILTIN_COPYSIGN_F64(t, y); a = (BUILTIN_ISINF_F64(x) & BUILTIN_ISINF_F64(y)) ? t : a; a = (BUILTIN_ISNAN_F64(x) | BUILTIN_ISNAN_F64(y)) ? AS_DOUBLE(QNANBITPATT_DP64) : a; } return BUILTIN_COPYSIGN_F64(a, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atan2piF.cl000066400000000000000000000025461415221260100210440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" extern CONSTATTR float MATH_PRIVATE(atanpired)(float); CONSTATTR float MATH_MANGLE(atan2pi)(float y, float x) { const float pi = 0x1.921fb6p+1f; float ax = BUILTIN_ABS_F32(x); float ay = BUILTIN_ABS_F32(y); float v = BUILTIN_MIN_F32(ax, ay); float u = BUILTIN_MAX_F32(ax, ay); float vbyu; if (DAZ_OPT()) { float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f; vbyu = s * MATH_FAST_DIV(v, s*u); } else { vbyu = MATH_DIV(v, u); } float a = MATH_PRIVATE(atanpired)(vbyu); float at = 0.5f - a; a = ay > ax ? at : a; at = 1.0f - a; a = x < 0.0f ? at : a; at = AS_INT(x) < 0 ? 1.0f : 0.0f; a = y == 0.0f ? at : a; if (!FINITE_ONLY_OPT()) { // x and y are +- Inf at = x < 0.0f ? 0.75f : 0.25f; a = (BUILTIN_ISINF_F32(x) & BUILTIN_ISINF_F32(y)) ? at : a; // x or y is NaN a = (BUILTIN_ISNAN_F32(x) | BUILTIN_ISNAN_F32(y)) ? AS_FLOAT(QNANBITPATT_SP32) : a; } return BUILTIN_COPYSIGN_F32(a, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atan2piH.cl000066400000000000000000000023421415221260100210400ustar00rootroot00000000000000 /*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" extern CONSTATTR half MATH_PRIVATE(atanpired)(half); CONSTATTR BGEN(atan2pi) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(atan2pi)(half y, half x) { half ax = BUILTIN_ABS_F16(x); half ay = BUILTIN_ABS_F16(y); half v = BUILTIN_MIN_F16(ax, ay); half u = BUILTIN_MAX_F16(ax, ay); half vbyu = MATH_DIV(v, u); half a = MATH_PRIVATE(atanpired)(vbyu); half at = 0.5h - a; a = ay > ax ? at : a; at = 1.0h - a; a = x < 0.0h ? at : a; at = AS_SHORT(x) < 0 ? 1.0h : 0.0h; a = y == 0.0h ? at : a; if (!FINITE_ONLY_OPT()) { // x and y are +- Inf at = x < 0.0h ? 
0.75h : 0.25h; a = (BUILTIN_ISINF_F16(x) & BUILTIN_ISINF_F16(y)) ? at : a; // x or y is NaN a = (BUILTIN_ISNAN_F16(x) | BUILTIN_ISNAN_F16(y)) ? AS_HALF((short)QNANBITPATT_HP16) : a; } return BUILTIN_COPYSIGN_F16(a, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanD.cl000066400000000000000000000013331415221260100204200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" extern CONSTATTR double MATH_PRIVATE(atanred)(double); CONSTATTR double MATH_MANGLE(atan)(double x) { double v = BUILTIN_ABS_F64(x); bool g = v > 1.0; if (g) { v = MATH_RCP(v); } double a = MATH_PRIVATE(atanred)(v); double y = BUILTIN_FMA_F64(0x1.dd9ad336a0500p-1, 0x1.af154eeb562d6p+0, -a); a = g ? y : a; return BUILTIN_COPYSIGN_F64(a, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanF.cl000066400000000000000000000013131415221260100204200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" extern CONSTATTR float MATH_PRIVATE(atanred)(float); CONSTATTR float MATH_MANGLE(atan)(float x) { float v = BUILTIN_ABS_F32(x); bool g = v > 1.0f; float vi = MATH_FAST_RCP(v); v = g ? vi : v; float a = MATH_PRIVATE(atanred)(v); float y = MATH_MAD(0x1.ddcb02p-1f, 0x1.aee9d6p+0f, -a); a = g ? 
y : a; return BUILTIN_COPYSIGN_F32(a, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanH.cl000066400000000000000000000013501415221260100204230ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" extern CONSTATTR half MATH_PRIVATE(atanred)(half); CONSTATTR UGEN(atan) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(atan)(half x) { half v = BUILTIN_ABS_F16(x); bool g = v > 1.0h; half vi = MATH_FAST_RCP(v); v = g ? vi : v; half a = MATH_PRIVATE(atanred)(v); half y = MATH_MAD(0x1.ea8p-1h, 0x1.a3cp+0h, -a); a = g ? y : a; return BUILTIN_COPYSIGN_F16(a, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanhD.cl000066400000000000000000000015261415221260100205740ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea); CONSTATTR double MATH_MANGLE(atanh)(double x) { double y = BUILTIN_ABS_F64(x); double2 a = fdiv(fadd(1.0, y), fsub(1.0, y)); double z = 0.5 * MATH_PRIVATE(lnep)(a, 0); z = y < 0x1.0p-27 ? y : z; if (!FINITE_ONLY_OPT()) { z = y > 1.0 ? AS_DOUBLE(QNANBITPATT_DP64) : z; z = y == 1.0 ? 
AS_DOUBLE(PINFBITPATT_DP64) : z; } return BUILTIN_COPYSIGN_F64(z, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanhF.cl000066400000000000000000000015221415221260100205720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #define FLOAT_SPECIALIZATION #include "ep.h" extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea); CONSTATTR float MATH_MANGLE(atanh)(float x) { float y = BUILTIN_ABS_F32(x); float2 a = fdiv(fadd(1.0f, y), fsub(1.0f, y)); float z = 0.5f * MATH_PRIVATE(lnep)(a, 0); z = y < 0x1.0p-12f ? y : z; if (!FINITE_ONLY_OPT()) { z = y > 1.0f ? AS_FLOAT(QNANBITPATT_SP32) : z; z = y == 1.0f ? AS_FLOAT(PINFBITPATT_SP32) : z; } return BUILTIN_COPYSIGN_F32(z, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanhH.cl000066400000000000000000000015211415221260100205730ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(atanh) CONSTATTR half MATH_MANGLE(atanh)(half hx) { half ret; float x = (float)BUILTIN_ABS_F16(hx); float t = (1.0f + x) * BUILTIN_RCP_F32(1.0f - x); ret = (half)(BUILTIN_LOG2_F32(t) * 0x1.62e430p-2f); ret = x < 0x1.0p-7f ? x : ret; if (!FINITE_ONLY_OPT()) { ret = x == 1.0f ? AS_HALF((short)PINFBITPATT_HP16) : ret; ret = (x > 1.0f) | BUILTIN_ISNAN_F32(x) ? 
AS_HALF((short)QNANBITPATT_HP16) : ret; } return BUILTIN_COPYSIGN_F16(ret, hx); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanpiD.cl000066400000000000000000000012511415221260100207500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" extern CONSTATTR double MATH_PRIVATE(atanpired)(double); CONSTATTR double MATH_MANGLE(atanpi)(double x) { double v = BUILTIN_ABS_F64(x); bool g = v > 1.0; if (g) { v = MATH_RCP(v); } double a = MATH_PRIVATE(atanpired)(v); double y = 0.5 - a; a = g ? y : a; return BUILTIN_COPYSIGN_F64(a, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanpiF.cl000066400000000000000000000012561415221260100207570ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" extern CONSTATTR float MATH_PRIVATE(atanpired)(float); CONSTATTR float MATH_MANGLE(atanpi)(float x) { float v = BUILTIN_ABS_F32(x); bool g = v > 1.0f; float vi = MATH_FAST_RCP(v); v = g ? vi : v; float a = MATH_PRIVATE(atanpired)(v); float y = 0.5f - a; a = g ? y : a; return BUILTIN_COPYSIGN_F32(a, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanpiH.cl000066400000000000000000000013241415221260100207550ustar00rootroot00000000000000 /*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" extern CONSTATTR half MATH_PRIVATE(atanpired)(half); CONSTATTR UGEN(atanpi) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(atanpi)(half x) { half v = BUILTIN_ABS_F16(x); bool g = v > 1.0h; half vi = MATH_FAST_RCP(v); v = g ? vi : v; half a = MATH_PRIVATE(atanpired)(v); half y = 0.5h - a; a = g ? y : a; return BUILTIN_COPYSIGN_F16(a, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanpiredD.cl000066400000000000000000000025271415221260100214520ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_PRIVATE(atanpired)(double v) { double t = v * v; double z = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.39e58b43320d2p-18, -0x1.be9e52f5df14fp-15), 0x1.2d7a6cad8e9dbp-12), -0x1.024ebcc10f8a6p-10), 0x1.3df92946a87d8p-9), -0x1.2f04271b6cd94p-8), 0x1.d91b9a6908690p-8), -0x1.3e1c18f5ea692p-7), 0x1.8253e53662be6p-7), -0x1.ba3db7e462112p-7), 0x1.ed7188505388cp-7), -0x1.121f707a5851bp-6), 0x1.32b737d7f904ap-6), -0x1.5bac13378ea68p-6), 0x1.912af944c4411p-6), -0x1.da1babd44fccfp-6), 0x1.21bb945aacd29p-5), -0x1.7483758f7040fp-5), 0x1.04c26be3b5934p-4), -0x1.b2995e7b7b74dp-4), 0x1.45f306dc9c883p-2); return v * z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanpiredF.cl000066400000000000000000000014031415221260100214440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the 
University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_PRIVATE(atanpired)(float v) { float t = v * v; float z = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.ccf836p-11f, -0x1.4761e4p-8f), 0x1.b6662ep-7f), -0x1.8423b4p-6f), 0x1.149cb4p-5f), -0x1.721cccp-5f), 0x1.04a466p-4f), -0x1.b2981cp-4f), 0x1.45f306p-2f); return v * z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanpiredH.cl000066400000000000000000000010511415221260100214450ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half MATH_PRIVATE(atanpired)(half v) { half t = v * v; half z = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.ef4p-7h, 0x1.a44p-5h), -0x1.ac8p-4h), 0x1.46p-2h); return v * z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanredD.cl000066400000000000000000000024671415221260100211240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_PRIVATE(atanred)(double v) { double t = v * v; double z = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.ba404b5e68a13p-17, -0x1.3e260bd3237f4p-13), 0x1.b2bb069efb384p-11), -0x1.7952daf56de9bp-9), 0x1.d6d43a595c56fp-8), -0x1.c6ea4a57d9582p-7), 0x1.67e295f08b19fp-6), -0x1.e9ae6fc27006ap-6), 0x1.2c15b5711927ap-5), -0x1.59976e82d3ff0p-5), 0x1.82d5d6ef28734p-5), -0x1.ae5ce6a214619p-5), 0x1.e1bb48427b883p-5), -0x1.110e48b207f05p-4), 0x1.3b13657b87036p-4), -0x1.745d119378e4fp-4), 0x1.c71c717e1913cp-4), -0x1.2492492376b7dp-3), 0x1.99999999952ccp-3), -0x1.5555555555523p-2); z = MATH_MAD(v, t*z, v); return z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanredF.cl000066400000000000000000000013511415221260100211150ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_PRIVATE(atanred)(float v) { float t = v * v; float z = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.5a54bp-9f, -0x1.f4b218p-7f), 0x1.53f67ep-5f), -0x1.2fa9aep-4f), 0x1.b26364p-4f), -0x1.22c1ccp-3f), 0x1.99717ep-3f), -0x1.5554c4p-2f); z = MATH_MAD(v, t*z, v); return z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/atanredH.cl000066400000000000000000000010261415221260100211160ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half MATH_PRIVATE(atanred)(half v) { half t = v * v; half z = MATH_MAD(t, MATH_MAD(t, -0x1.788p-5h, 0x1.44cp-3h), -0x1.4f4p-2h); z = MATH_MAD(v, t*z, v); return z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ba0D.cl000066400000000000000000000013751415221260100201450ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_PRIVATE(ba0)(double t) { return MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.44395cd7ac32cp+20, -0x1.25bf3abbee803p+16), 0x1.55a4a78625b0fp+11), -0x1.a826c7ea56321p+6), 0x1.763253bbf53b6p+2), -0x1.15efaff948953p-1), 0x1.a7ffff967a1d4p-4), -0x1.fffffffff2868p-5), 0x1.0000000000000p+0); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ba0F.cl000066400000000000000000000007601415221260100201440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_PRIVATE(ba0)(float t) { return MATH_MAD(t, MATH_MAD(t, 0x1.92aeccp-4f, -0x1.ffe472p-5f), 0x1.000000p+0f); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ba1D.cl000066400000000000000000000013751415221260100201460ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_PRIVATE(ba1)(double t) { return MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.7940a06621145p+20, 0x1.591fb68428bafp+16), -0x1.996552a8bafb0p+11), 0x1.0795578cd8c93p+7), -0x1.ef38364596b5ap+2), 0x1.9c4fa465744c7p-1), -0x1.8bffffc3937c1p-3), 0x1.7ffffffffc240p-3), 0x1.0000000000000p+0); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ba1F.cl000066400000000000000000000007601415221260100201450ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_PRIVATE(ba1)(float t) { return MATH_MAD(t, MATH_MAD(t, -0x1.7c0d46p-3f, 0x1.7ff5aap-3f), 0x1.000000p+0f); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/besselD_table.h000066400000000000000000000517371415221260100217670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ DECLARE_TABLE(double, M64_J0, 8*15) 1.0, -0.14269328868608038e-15, -0.24999999999999378, -0.10717704790389966e-12, 0.015625000000966751, -0.52511567891715885e-11, -0.00043402775917084975, -0.45154263377571991e-10, 0.6781761279002329e-5, -0.94524619593582299e-10, -0.67734011417068302e-7, -0.51276965587306847e-10, 0.49259222901902222e-9, -0.57479109221671054e-11, -0.16331521876245402e-11, 0.0, -0.51914749728946679, 0.10793870175492009, 0.056601774437946192, -0.0086576695933049068, -0.0021942003590150295, 0.00026437703675251415, 0.43729192716923728e-4, -0.43388262868833412e-5, -0.53049137594784273e-6, 0.44700551042149104e-7, 0.43264003773432392e-8, -0.31664470012675611e-9, -0.25122835305798086e-10, 0.16215931083463106e-11, -0.40275939570255297, -0.52181326018778115e-18, 0.20137969785127645, -0.017518715285659044, -0.013352611033180267, 0.0010359438491269923, 0.00037218755651442075, -0.24952041524263142e-4, -0.57760876091040014e-5, 0.33742922699801002e-6, 0.57277913211048927e-7, -0.29528827354673038e-8, -0.39441693779923091e-9, 0.18022594969949103e-10, 0.18857204715831148e-11, 0.0, 0.34026480655836815, -0.030820651425593648, -0.052988552867604362, 0.0046310421459076305, 0.0022574402290271133, -0.00017518572899406692, -0.46521090692503814e-4, 0.31997869075739445e-5, 0.57164888846826257e-6, -0.35115366797673734e-7, -0.46830399346222682e-8, 0.25923658333924528e-9, 0.27115172723816524e-10, -0.13884165974276054e-11, 0.30011575252613256, 0.2057050400962928e-17, -0.15005787626306626, 0.0071297376031137401, 0.011742619737434781, -0.00062605834520753437, -0.00035093119053508375, 0.17929701348313658e-4, 0.56239343808321796e-5, -0.26684224520542096e-6, -0.56652615547124157e-7, 0.24792586052774415e-8, 0.39325985931918323e-9, -0.15724313427150255e-10, -0.19341803571391105e-11, 0.0, -0.27145229992838192, 0.015684124960953883, 0.044033774963411685, -0.0025093022272106884, -0.0020603351551222082, 
0.00011243486789352708, 0.44823035412848692e-4, -0.22883910078014302e-5, -0.56793781722802321e-6, 0.26941566442661998e-7, 0.47365215013159892e-8, -0.20866089859212072e-9, -0.27761981412381772e-10, 0.11411583417182674e-11, -0.2497048770578432, -0.21909546936929062e-17, 0.12485243852892159, -0.0040907858517003804, -0.010102792347697843, 0.00038536375944999447, 0.0003185971148934128, -0.12373899203877618e-4, -0.53013953324799306e-5, 0.20010876457654013e-6, 0.54715979534900829e-7, -0.19711317018282613e-8, -0.38584018939012558e-9, 0.13028557538648307e-10, 0.19387251405422158e-11, 0.0, 0.23245983136472478, -0.0098570645138257917, -0.03818600911162297, 0.0016073972920896773, 0.0018420433388659426, -0.75813584809846931e-4, -0.41592845395702554e-4, 0.16506463478622605e-5, 0.54254505636478441e-6, -0.20558027910130633e-7, -0.46196044646920421e-8, 0.16630784845680672e-9, 0.27483865275708142e-10, -0.93846646239935553e-12, END_TABLE() DECLARE_TABLE(double, M64_J1, 8*15) 0.0, 0.5, -0.12970309732986903e-17, -0.062499999999999923, -0.17942214325033243e-14, 0.0026041666666885299, -0.15964519165155314e-12, -0.54253471466663886e-4, -0.242857790709361e-11, 0.67817384698301118e-6, -0.86070068625189802e-11, -0.56418387778447458e-8, -0.73192849689297935e-11, 0.37319822951004815e-10, -0.11001445955275011e-11, 0.58186522428159638, -0.56159765491837453e-17, -0.20511071214777315, 0.006058948324603733, 0.013801769807954829, -0.00037231709715965684, -0.00039495907353545311, 0.92029498173768214e-5, 0.62672896236849497e-5, -0.1267857801249798e-6, -0.63255257619028979e-7, 0.11251771403253868e-8, 0.44176005585408683e-9, -0.69798300547918846e-11, -0.21578026548615529e-11, 0.0, -0.402759395702553, 0.052556145856977239, 0.053410444132727687, -0.0051797192456383855, -0.0022331253392001435, 0.00017466429070665996, 0.46208701653337802e-4, -0.30368632238776932e-5, -0.57278166634453134e-6, 0.32482189325657561e-7, 0.47369084764612076e-8, -0.23499460493506461e-9, -0.28705938354850318e-10, 
0.44693128781201312e-12, -0.34612620185379152, -0.17631593012980777e-17, 0.16697453550109302, -0.0096782685428780814, -0.012099225779141488, 0.00066540090064072656, 0.00035413890079260022, -0.17427203124603725e-4, -0.56552935762375831e-5, 0.24842942396474063e-6, 0.57098949030140281e-7, -0.22536110266152491e-8, -0.39802896432910825e-9, 0.14090328151677641e-10, 0.19636717850506288e-11, 0.0, 0.30011575252613256, -0.021389212809341581, -0.04697047894974129, 0.0031302917260480798, 0.0021055871432437381, -0.00012550790955127199, -0.44991475264757161e-4, 0.24015807952585114e-5, 0.56652684843934755e-6, -0.27273424894801725e-7, -0.47201704013422051e-8, 0.20653028510455782e-9, 0.27690106438474044e-10, -0.11154568938183541e-11, 0.27329994163319985, 0.2232142433641675e-17, -0.13477468037992365, 0.0051163403464879163, 0.010631861751984214, -0.00044874368373337155, -0.00032680001851823873, 0.13382555960237626e-4, 0.53631771344886529e-5, -0.20647195244065982e-6, -0.54999812559703342e-7, 0.19736935833650958e-8, 0.38691574660208312e-9, -0.12790599536440081e-10, -0.19364854538966976e-11, 0.0, -0.24970487705784317, 0.012272357555101521, 0.040411169390789711, -0.001926818797260396, -0.0019115826893325857, 0.86617294531543399e-4, 0.42411162505820529e-4, -0.18009793753942718e-5, -0.5471594365997978e-6, 0.21683657796392875e-7, 0.46297313740491134e-8, -0.17085932625435942e-9, -0.27035506268991826e-10, 0.73146488801751189e-12, -0.23330441717143407, -0.22662118296062933e-17, 0.11580092244607786, -0.0032489977328225844, -0.0093725272060512657, 0.00030361382116634888, 0.00029804555532176523, -0.98138185687649243e-5, -0.50242299853933591e-5, 0.16136260748150418e-6, 0.5251960653430569e-7, -0.16180019977389104e-8, -0.37446742393781688e-9, 0.10863405480283854e-10, 0.19078934776878301e-11, END_TABLE() DECLARE_TABLE(double, M64_Y0, 18*15) -0.073804295108687225, 0.17760601686906714, -0.016073968025938426, 0.00053860266686165496, -0.94950052052215465e-5, 0.10358476033628097e-6, 
-0.76930799009029319e-9, 0.41435657365127098e-11, -0.1693271517935695e-13, 0.54310606578547998e-16, -0.14038708139145726e-18, 0.29871591749670351e-21, -0.53238579320936109e-24, 0.80636887083404931e-27, -0.10479788308161506e-29, -0.77912935353834307, 2.2110954318911016, -3.1481880142409648, 6.7631541766023146, -16.558846016561116, 42.556164402735613, -113.65090971911888, 311.92221820936423, -872.50902177512439, 2461.0565691666882, -6829.049205644454, 17617.540310147784, -38115.181270412403, 58513.491703205172, -45741.69055512617, -0.54179079742759428, 1.64879305137253, -1.6134395171403224, 2.3901721546248332, -4.2770404998133958, 7.8857581113382368, -15.060011460820601, 29.549657999172217, -59.136402510594911, 119.95202976931475, -243.64086705143111, 478.7020767792245, -836.74741023460869, 1104.0427235801185, -779.71306204835432, -0.35708307020027898, 1.3315403043553127, -1.0050498465490202, 1.0750491956121098, -1.5469100036757135, 2.235635072477068, -3.324194198035296, 5.0776635871010325, -7.9096546309462989, 12.50166753906456, -19.905699415239301, 31.245221424718389, -45.309925774701995, 52.094004174782553, -33.533831674941474, -0.2045648213118789, 1.120816812372814, -0.71285708925156112, 0.55404402904516822, -0.68086349391521071, 0.81641946964915076, -0.99376659920171963, 1.2431212752135579, -1.5855777667632761, 2.0522491911004844, -2.6819002952055626, 3.4877724825589845, -4.2917811335732653, 4.3478499271457812, -2.5645514824451464, 0.0, 0.87942080249719477, -0.49207893426297755, 0.22055282848170949, -0.22612171354423224, 0.21894842697129336, -0.20487719776562028, 0.19733568623230481, -0.1939501765143562, 0.19337292001268456, -0.19504328259403041, 0.1989415973717781, -0.20633673974538298, 0.20488487879343473, -0.12698771588648888, 0.088256964215676958, 0.7812128213002887, -0.43473489275797808, 0.14491163091871858, -0.1375568838608908, 0.12453666860389533, -0.10402567514600134, 0.089474169159502648, -0.078647603970442897, 0.070036305115760506, 
-0.062684214895833727, 0.054972325513095258, -0.043964628503220077, 0.027371209537030947, -0.0093703929219555162, 0.25821685159454078, 0.58436403661500803, -0.36285404044324346, 0.061699235252148297, -0.045739306782895844, 0.040702353485939169, -0.027255526573770462, 0.018591111730641299, -0.013104420664549169, 0.0093397328068473626, -0.0066469721051120698, 0.0045586325249059059, -0.0027647918918092109, 0.0012569316613639002, -0.00030394891460079893, 0.42891756089319696, 0.33169442327191864, -0.31651860299180319, 0.030579837257061538, -0.0047471912131737328, 0.01054712074005649, -0.0058778174555227628, 0.0029188053177132331, -0.0015824799060393402, 0.00087461459619324866, -0.00048386068841997002, 0.00026310045468230596, -0.00013160965333042817, 0.51894745655900052e-4, -0.11391844004684635e-4, 0.52078641240226751, -0.20584037223089673e-17, -0.2603932062011338, 0.039504848583033348, 0.0082143493513316977, 0.00095956233382919533, -0.001237092222826762, 0.00037074882687906914, -0.00013335661481505372, 0.56621847806301764e-4, -0.23586337096205168e-4, 0.98050240371430491e-5, -0.4128688513318286e-5, 0.16930914560772783e-5, -0.49720344100766544e-6, 0.49329724488711617, -0.1595121262755564, -0.21514005429036172, 0.050767278479624522, 0.0081376092965840492, -0.00086057023571742532, -0.00065647861248115662, 0.00016624499281830832, -0.39672451667644922e-4, 0.1521990078761635e-4, -0.56848551522514058e-5, 0.20098385792952417e-5, -0.67252825610378239e-6, 0.1852827673508686e-6, -0.29634836035302199e-7, 0.37685001001279038, -0.32467442479179998, -0.13431260087442852, 0.063023537103350963, 0.0044555664857033608, -0.0021007845703210802, -0.00026522913415021587, 0.90436772580354379e-4, -0.91363588694971671e-5, 0.26783638970524461e-5, -0.10352374020714479e-5, 0.3132681441256256e-6, -0.88816500198197074e-7, 0.2157981376131948e-7, -0.31353375574613877e-8, 0.0, -0.40254267177502424, 0.050855909592158235, 0.058523822105172299, -0.0068525666771120393, -0.002183518874131455, 
0.00019526940252310014, 0.50922915003220723e-4, -0.48933708281804964e-5, -0.29349580100499912e-6, -0.21840554837306539e-7, 0.18947787013197809e-7, -0.37046653083214055e-8, 0.76430136737808284e-9, -0.12422824562419604e-9, -0.34031804552344056, 0.94101386107437916e-17, 0.17015902276172035, -0.010446225814696104, -0.012736984935856988, 0.00083202318688738824, 0.0003609997918678326, -0.20945841912907079e-4, -0.58073349754263144e-5, 0.31820723275099966e-6, 0.54644418381581921e-7, -0.2319265892331721e-8, -0.46670788412863405e-9, 0.30342197107751323e-10, -0.15335078035720073e-12, 0.0, 0.30009761491047518, -0.021175236556769531, -0.048024070076259688, 0.0033183482688956215, 0.0021759840164388624, -0.00014060259774065803, -0.45951406671209629e-4, 0.27013637918060207e-5, 0.57493481425343566e-6, -0.30984700082815646e-7, -0.47169293824539992e-8, 0.23029054509089804e-9, 0.27973463750937909e-10, -0.13064221620824322e-11, 0.27145987731153354, 0.25221283178979203e-17, -0.13572993865576675, 0.0052632947880988247, 0.010851606676849659, -0.00048359134656347859, -0.00033524866905954335, 0.14885926419217314e-4, 0.54759245688276116e-5, -0.23132509119378262e-6, -0.55865240503001576e-7, 0.22197827167333758e-8, 0.39026801352550049e-9, -0.14329181797023679e-10, -0.19438316968801125e-11, 0.0, -0.24970123751468478, 0.012213500740397518, 0.040820349832455694, -0.0019771436063412679, -0.001946025604344518, 0.9143803534139555e-4, 0.43271963415458645e-4, -0.19373031522149208e-5, -0.55677520594475748e-6, 0.235112582604214e-7, 0.46932869756461156e-8, -0.18637017854067415e-9, -0.27698695184429241e-10, 0.10369143470533369e-11, -0.23246176601703874, -0.20096023187886984e-17, 0.11623088300851936, -0.0032975672060945613, -0.00947540876323849, 0.00031542390044000931, 0.00030283033368618402, -0.10400844347883093e-4, -0.51124999467324777e-5, 0.17326393448661488e-6, 0.53369289930627684e-7, -0.1748658677916985e-8, -0.37952700634084811e-9, 0.11780616758320276e-10, 0.19200057712000834e-11, END_TABLE() 
DECLARE_TABLE(double, M64_Y1, 18*15) -0.19605709064623895, 0.054348688160510244, -0.0029553053360798337, 0.71642687499739621e-4, -0.99267406194248216e-6, 0.89318796212201327e-8, -0.56480245515956582e-10, 0.26494815070087778e-12, -0.95914865863351391e-15, 0.2761635978378275e-17, -0.64764905786424363e-20, 0.12611877823331126e-22, -0.20721023543487956e-25, 0.29110987879568911e-28, -0.35303800868251434e-31, -1.4714723926702431, 2.4984260518337782, -4.7056346408383019, 9.975846534619563, -20.184163337621461, 40.496950477031913, -81.152327528374615, 162.49087766015681, -325.15079903464149, 649.45520042742928, -1285.2103823941194, 2448.4804541756212, -4158.2943098614827, 5366.6187995050527, -3734.8653515324813, -1.2171501026500124, 1.6698931974778848, -2.2852916380492847, 4.027297809371497, -6.58721416369891, 10.581942141908384, -16.980016700063269, 27.217091032511358, -43.594174240672638, 69.758612215607575, -111.12421285866862, 173.1086092367898, -248.34507089127534, 282.90379126506623, -181.11662875814501, -1.0375945507692854, 1.2462866316399409, -1.2343667463922096, 1.8992610235521382, -2.6371985712336499, 3.5310230382807777, -4.7256334014727215, 6.3171220523241033, -8.43472396630236, 11.238328821759806, -14.832285088567444, 18.842520279278443, -21.335046358108435, 18.354793359003515, -8.5142522678468439, -0.83739733543088325, 0.93091920108100523, -0.55417761257185901, 0.73371086127587253, -0.8605660052576892, 0.92065952159238525, -0.98595650054219686, 1.0559318894794136, -1.1285411140365644, 1.2010298650373751, -1.2569339904113142, 1.2431629401764116, -1.0626487102726304, 0.66622019625478456, -0.21854889181260231, -0.60722895611445335, 0.73783834150938075, -0.20349423373260017, 0.21007628524484786, -0.23108815947056327, 0.19023828049773805, -0.15557188762716865, 0.12853382930576615, -0.10591075611629479, 0.086962780125352593, -0.070629828108562505, 0.055054410547947963, -0.038059769484626943, 0.019874794635230189, -0.0055679593657415689, -0.39186795572488388, 
0.65092742964440393, -0.10017743328805587, 0.042238681309637533, -0.072373258513592223, 0.049513700809545086, -0.031072379727666883, 0.020463565150300302, -0.013481748934993475, 0.0088356115908746828, -0.005755424546448715, 0.0036575069327209979, -0.00213090561761424, 0.00097979744072177105, -0.00025173477341455765, -0.19751370735770753, 0.5937698116451558, -0.091316608073566029, -0.013725290582052461, -0.02520163771055933, 0.017656792842510859, -0.0084263349025423682, 0.0045403485605132319, -0.0025115912162854004, 0.0013715944740165292, -0.00074611329874713034, 0.00040289752728649585, -0.00020940277765196283, 0.93632028450469852e-4, -0.25814036473647126e-4, 0.0, 0.52078641240226751, -0.11851454574909661, -0.03285739740528641, -0.0047978116701054375, 0.0074225533327078612, -0.0025952416882643165, 0.0010668529999046694, -0.00050960130430697147, 0.00023587001107416522, -0.00010776044792753716, 0.49241735014382706e-4, -0.22490135982788418e-4, 0.10381851066729738e-4, -0.47312084483604926e-5, 0.05844893809242382, 0.49210809848628195, -0.13016130840056476, -0.034157117371611476, -0.00098301670572829796, 0.0058853422453829204, -0.0018968019544171182, 0.00069225552522263757, -0.00031849356470937341, 0.00014108071977016201, -0.61019246332646756e-4, 0.259848814058572e-4, -0.10339422105751848e-4, 0.33382444533901786e-5, -0.61932264209037923e-6, 0.24036464316389888, 0.36455391898900915, -0.17076959201913428, -0.027607701726389703, 0.007662008241120601, 0.0027418045055298321, -0.00083742854982005548, 0.00016091822625852173, -0.64785030434387758e-4, 0.2631442900599476e-4, -0.96223335840663514e-5, 0.34748743059101633e-5, -0.1198065480145674e-5, 0.34472135494879576e-6, -0.58837374903150623e-7, 0.41672992810645138, 0.81128688460579782e-16, -0.19300409215719407, 0.01468742340953761, 0.01209580243213119, -0.00052499504751491293, -0.00042681013683971668, 0.34551267613418576e-4, 0.12100652590179381e-5, 0.10310843017597674e-5, -0.41067559222547041e-6, 0.98546821830054323e-7, 
-0.25955363104051318e-7, 0.78201506283918034e-8, -0.21638997586341882e-8, 0.36744453322260277, -0.18232210186321943, -0.15163377893315316, 0.03732287252728852, 0.0091785756539438159, -0.0016447980937961341, -0.00028461639559388612, 0.44484416858016556e-4, 0.26514408607837476e-5, -0.1738325789066566e-6, -0.12035030532030089e-6, 0.2373546497427958e-7, -0.4225969587933059e-8, 0.88456287372942355e-9, -0.1174963136343885e-9, 0.0, -0.34031804552344055, 0.031338677444086685, 0.050947939743419497, -0.0041601159343906282, -0.0021659987510719401, 0.00014662089289157448, 0.46458678895700102e-4, -0.28638625162956868e-5, -0.54644125942198329e-6, 0.25505034027877053e-7, 0.5596020795002169e-8, -0.38526321659827537e-9, 0.47571185910585838e-11, -0.48327078086606375e-11, -0.30317374013748944, -0.15684842920394412e-17, 0.14844089746983234, -0.0068260439972667603, -0.011386707499252168, 0.00055604651706746647, 0.0003404258903470296, -0.15413284814952045e-4, -0.55274263865177847e-5, 0.23191400254952198e-6, 0.55761686038137685e-7, -0.20980096215935158e-8, -0.39851955096283248e-9, 0.14594580744289001e-10, 0.18208102967600173e-11, 0.0, 0.27145987731153354, -0.015789884364296906, -0.043406426707400558, 0.0024179567328294551, 0.0020114920143860492, -0.0001042014850609257, -0.43807396734390487e-4, 0.20819264522088036e-5, 0.55865297153285871e-6, -0.24419231590119171e-7, -0.46840491648468389e-8, 0.18834793161094204e-9, 0.27682023845401218e-10, -0.10382770024573064e-11, 0.25091253627781262, 0.20958312999524093e-17, -0.12423210535891706, 0.0040099743760130122, 0.0099565661817092748, -0.00036590017033001253, -0.0003122461086376193, 0.11455332592119589e-4, 0.51972538301279162e-5, -0.18290468581196801e-6, -0.53824305862244231e-7, 0.1793715153149277e-8, 0.38104401282521395e-9, -0.11833239178630346e-10, -0.19174467220108448e-11, 0.0, -0.23246176601703874, 0.0098927016182840341, 0.037901635052955098, -0.001577119502209961, -0.0018169820021341525, 0.72805910540142751e-4, 0.40899999683340315e-4, 
-0.15593759383351302e-5, -0.53369324013028829e-6, 0.1923660656790709e-7, 0.45548312775946846e-8, -0.15488624419048933e-9, -0.27169020291555582e-10, 0.87150492645533502e-12, END_TABLE() ROCm-Device-Libs-rocm-5.0.0/ocml/src/besselF_table.h000066400000000000000000000226351415221260100217640ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ DECLARE_TABLE(float, M32_J0, 8*9) 1.0f, 0.44869526e-7f, -0.250000678f, 0.394978156e-5f, 0.0156135085f, 0.186404843e-4f, -0.000451465494f, 0.906744475e-5f, 0.462022483e-5f, 0.0f, -0.519147497f, 0.107938702f, 0.0566017522f, -0.00865766565f, -0.00219399941f, 0.000264347633f, 0.431469054e-4f, -0.427168323e-5f, -0.402759396f, -0.133988793e-8f, 0.201379688f, -0.0175186868f, -0.0133525141f, 0.00103577016f, 0.000371882642f, -0.245406847e-4f, -0.544857844e-5f, 0.0f, 0.340264805f, -0.0308206513f, -0.0529884948f, 0.00463103756f, 0.00225704943f, -0.00017515902f, -0.45676898e-4f, 0.314800819e-5f, 0.300115752f, 0.142140419e-8f, -0.150057871f, 0.00712970835f, 0.0117425671f, -0.00062589107f, -0.00035076219f, 0.175677152e-4f, 0.542756342e-5f, 0.0f, -0.271452299f, 0.0156841249f, 0.0440337286f, -0.00250929967f, -0.0020600007f, 0.000112417278f, 0.440465451e-4f, -0.224955545e-5f, -0.249704877f, -0.114020252e-8f, 0.124852435f, -0.00409076252f, -0.0101027605f, 0.000385232178f, 0.000318490142f, -0.120950437e-4f, -0.516322204e-5f, 0.0f, 0.232459831f, -0.00985706448f, -0.0381859695f, 0.00160739566f, 0.00184174666f, -0.758016756e-4f, -0.408780042e-4f, 0.162275495e-5f, END_TABLE() DECLARE_TABLE(float, M32_J1, 8*9) 0.0f, 0.5f, 0.462571126e-8f, -0.0625000886f, 0.646901306e-6f, 0.00260184106f, 0.455472757e-5f, -0.592206849e-4f, 0.284771796e-5f, 0.581865224f, -0.432727717e-10f, -0.205110698f, 
0.00605894703f, 0.0138016513f, -0.000372288399f, -0.000394630783f, 0.908655709e-5f, 0.594411649e-5f, 0.0f, -0.402759391f, 0.0525561452f, 0.0534102785f, -0.00517971268f, -0.00223227521f, 0.000174696729f, 0.448728749e-4f, -0.312619124e-5f, -0.346126202f, -0.135982554e-8f, 0.166974529f, -0.00967824094f, -0.0120991661f, 0.000665244429f, 0.000353951297f, -0.170900235e-4f, -0.544345571e-5f, 0.0f, 0.300115751f, -0.0213892127f, -0.0469704276f, 0.00313028838f, 0.00210522941f, -0.000125486758f, -0.441893462e-4f, 0.235877085e-5f, 0.273299942f, 0.123871464e-8f, -0.134774676f, 0.00511631544f, 0.0106318216f, -0.000448605206f, -0.000326670201f, 0.130923618e-4f, 0.520545213e-5f, 0.0f, -0.249704872f, 0.0122723573f, 0.04041102f, -0.00192680868f, -0.00191084766f, 0.865574383e-4f, 0.412630035e-4f, -0.171042992e-5f, -0.233304417f, -0.101355681e-8f, 0.11580092f, -0.00324897742f, -0.00937250256f, 0.000303501923f, 0.000297960941f, -0.958268173e-5f, -0.490863176e-5f, END_TABLE() DECLARE_TABLE(float, M32_Y0, 18*9) -0.0738042951f, 0.177606017f, -0.016073968f, 0.000538602667f, -0.949500521e-5f, 0.10358476e-6f, -0.769307974e-9f, 0.414351772e-11f, -0.168538199e-13f, -0.779129354f, 2.21109539f, -3.14817837f, 6.76234763f, -16.5245871f, 41.721874f, -101.297948f, 197.994167f, -213.204578f, -0.541790797f, 1.64879305f, -1.61343882f, 2.39011447f, -4.27463147f, 7.8283496f, -14.2356687f, 22.309494f, -20.7850723f, -0.35708307f, 1.3315403f, -1.00504975f, 1.07504147f, -1.54659225f, 2.2281907f, -3.21955386f, 4.18656836f, -3.43559538f, -0.204564821f, 1.12081681f, -0.712857069f, 0.554042423f, -0.680799155f, 0.814950073f, -0.973649903f, 1.07700623f, -0.787302821f, 0.0f, 0.879420802f, -0.492078934f, 0.220553062f, -0.226122006f, 0.218871042f, -0.204734177f, 0.205007038f, -0.209851389f, 0.0882569642f, 0.781212821f, -0.434734855f, 0.144909902f, -0.137517504f, 0.124034055f, -0.100221697f, 0.072159059f, -0.0322405804f, 0.258216852f, 0.584364035f, -0.362853954f, 0.0616967017f, -0.0457019916f, 0.0403914876f, 
-0.0257050488f, 0.0138811594f, -0.00448857991f, 0.428917561f, 0.331694423f, -0.316518592f, 0.0305795132f, -0.00474255594f, 0.0105095903f, -0.00569634195f, 0.0023888513f, -0.000671151428f, 0.520786412f, 0.316257491e-10f, -0.260393207f, 0.0395048433f, 0.00821442047f, 0.000959730625f, -0.00123958131f, 0.00037168397f, -0.000105767765f, 0.493297245f, -0.159512126f, -0.215140053f, 0.050767252f, 0.00813790411f, -0.000862432027f, -0.000649450987f, 0.000150259461f, -0.184581358e-4f, 0.37685001f, -0.324674425f, -0.134312601f, 0.0630235318f, 0.00445562302f, -0.00210112822f, -0.000263972937f, 0.876453474e-4f, -0.546484929e-5f, 0.0f, -0.40254267f, 0.0508559094f, 0.058523724f, -0.00685252463f, -0.002182572f, 0.000194599211f, 0.485251783e-4f, -0.269518635e-5f, -0.340318045f, -0.176035638e-8f, 0.170159015f, -0.0104461902f, -0.0127369142f, 0.000831821655f, 0.000360781298f, -0.205125477e-4f, -0.556989234e-5f, 0.0f, 0.300097614f, -0.0211752365f, -0.0480240177f, 0.00331834481f, 0.00217561974f, -0.000140580184f, -0.451359559e-4f, 0.265455576e-5f, 0.271459877f, 0.139172743e-8f, -0.135729934f, 0.00526326684f, 0.0108515634f, -0.000483436056f, -0.000335109802f, 0.145606732e-4f, 0.530954251e-5f, 0.0f, -0.249701237f, 0.0122135007f, 0.0408203043f, -0.00197714145f, -0.00194569725f, 0.914230753e-4f, 0.425102172e-4f, -0.190386676e-5f, -0.232461766f, -0.985286365e-9f, 0.11623088f, -0.00329754703f, -0.00947537951f, 0.000315310068f, 0.000302731128f, -0.101595111e-4f, -0.49823498e-5f, END_TABLE() DECLARE_TABLE(float, M32_Y1, 18*9) -0.196057091f, 0.0543486882f, -0.00295530534f, 0.716426875e-4f, -0.992674062e-6f, 0.893187962e-8f, -0.564802451e-10f, 0.264946691e-12f, -0.956040552e-15f, -1.47147239f, 2.49842603f, -4.705631f, 9.97554229f, -20.1713128f, 40.1878477f, -76.6812412f, 123.027773f, -115.903802f, -1.2171501f, 1.6698932f, -2.28529116f, 4.02725834f, -6.58555591f, 10.5423814f, -16.4151681f, 22.3415253f, -18.8343596f, -1.03759455f, 1.24628662f, -1.23436566f, 1.89920078f, -2.63550437f, 3.50388357f, 
-4.46415376f, 4.77962593f, -3.00258761f, -0.837397335f, 0.930919184f, -0.554175938f, 0.733648813f, -0.859400875f, 0.908155864f, -0.904818857f, 0.731425234f, -0.332223767f, -0.607228956f, 0.737838338f, -0.203493924f, 0.210066093f, -0.230917988f, 0.188616636f, -0.14625808f, 0.0958024404f, -0.0364996384f, -0.391867956f, 0.650927429f, -0.100177392f, 0.0422373453f, -0.07235139f, 0.0493095772f, -0.0299278561f, 0.0165691516f, -0.00564529733f, -0.197513707f, 0.593769812f, -0.0913166067f, -0.013725346f, -0.0252004653f, 0.0176426751f, -0.00832470911f, 0.00410178601f, -0.00142662074f, 0.0f, 0.520786412f, -0.118514546f, -0.0328573972f, -0.00479781174f, 0.00742247989f, -0.00259521656f, 0.00107430961f, -0.000512579875f, 0.0584489381f, 0.492108098f, -0.130161305f, -0.0341572041f, -0.000981824109f, 0.00587622283f, -0.00185575707f, 0.000582281731f, -0.000148343917f, 0.240364643f, 0.364553919f, -0.170769591f, -0.0276077249f, 0.00766230439f, 0.00273966927f, -0.000828295737f, 0.000137588785f, -0.305187593e-4f, 0.416729928f, -0.258296385e-9f, -0.193004092f, 0.0146874353f, 0.0120957914f, -0.000525144004f, -0.000426716299f, 0.352388331e-4f, 0.877397631e-6f, 0.367444533f, -0.182322102f, -0.151633779f, 0.0373228744f, 0.00917855673f, -0.00164468944f, -0.000284979934f, 0.451783718e-4f, 0.198340769e-5f, 0.0f, -0.340318045f, 0.0313386774f, 0.0509479111f, -0.00416011363f, -0.00216575933f, 0.000146604317f, 0.458142122e-4f, -0.282657316e-5f, -0.30317374f, -0.139307478e-8f, 0.148440893f, -0.00682601599f, -0.0113866644f, 0.000555890828f, 0.000340287228f, -0.150871178e-4f, -0.536125411e-5f, 0.0f, 0.271459876f, -0.0157898843f, -0.0434063812f, 0.00241795425f, 0.00201116367f, -0.000104184764f, -0.430443521e-4f, 0.204582146e-5f, 0.250912536f, 0.109595535e-8f, -0.124232101f, 0.00400995233f, 0.009956529f, -0.000365777587f, -0.000312125102f, 0.111983746e-4f, 0.504825687e-5f, 0.0f, -0.232461765f, 0.00989270158f, 0.037901591f, -0.0015771177f, -0.00181666561f, 0.72793478e-4f, 0.40167952e-4f, -0.153180005e-5f, 
END_TABLE() ROCm-Device-Libs-rocm-5.0.0/ocml/src/bp0D.cl000066400000000000000000000014531415221260100201610ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_PRIVATE(bp0)(double t) { return MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.91f780a4a989bp+28, 0x1.52a41923b70a7p+24), -0x1.40a5e31612a8dp+19), 0x1.0c9a0cbe3b3b8p+14), -0x1.0af76167fe583p+9), 0x1.778ea61b94139p+4), -0x1.a3581d1a82662p+0), 0x1.ad33330a1daf2p-3), -0x1.0aaaaaaaa7909p-4), 0x1.0000000000000p-3); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/bp0F.cl000066400000000000000000000010161415221260100201560ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_PRIVATE(bp0)(float t) { return MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.5ec5e6p+0f, 0x1.aafb08p-3f), -0x1.0aa926p-4f), 0x1.000000p-3f); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/bp1D.cl000066400000000000000000000014531415221260100201620ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_PRIVATE(bp1)(double t) { return MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.c22f653d3a76ep+28, -0x1.80a4d95ed3e8ep+24), 0x1.72f1d1f8cdd76p+19), -0x1.3ea4e96460ad7p+14), 0x1.488dd98d9ab3ap+9), -0x1.e9ed612fa3b38p+4), 0x1.2f484fcab9ddap+1), -0x1.7bccccad443c0p-2), 0x1.4ffffffffcbfap-3), -0x1.8000000000000p-2); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/bp1F.cl000066400000000000000000000010161415221260100201570ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_PRIVATE(bp1)(float t) { return MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.0214cep+1f, -0x1.7a54cap-2f), 0x1.4ffefep-3f), -0x1.800000p-2f); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/builtins.h000066400000000000000000000264661415221260100210710ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ // Bitcasting #define AS_SHORT(X) __builtin_astype(X, short) #define AS_SHORT2(X) __builtin_astype(X, short2) #define AS_USHORT(X) __builtin_astype(X, ushort) #define AS_USHORT2(X) __builtin_astype(X, ushort2) #define AS_INT(X) __builtin_astype(X, int) #define AS_INT2(X) __builtin_astype(X, int2) #define AS_UINT(X) __builtin_astype(X, uint) #define AS_UINT2(X) __builtin_astype(X, uint2) #define AS_LONG(X) __builtin_astype(X, long) #define AS_ULONG(X) __builtin_astype(X, ulong) #define AS_DOUBLE(X) __builtin_astype(X, double) #define AS_FLOAT(X) __builtin_astype(X, float) #define AS_HALF(X) __builtin_astype(X, half) #define AS_HALF2(X) __builtin_astype(X, half2) // Class mask bits #define CLASS_SNAN 0x001 #define CLASS_QNAN 0x002 #define CLASS_NINF 0x004 #define CLASS_NNOR 0x008 #define CLASS_NSUB 0x010 #define CLASS_NZER 0x020 #define CLASS_PZER 0x040 #define CLASS_PSUB 0x080 #define CLASS_PNOR 0x100 #define CLASS_PINF 0x200 #include "irif.h" #define BUILTIN_ABS_F32 __builtin_fabsf #define BUILTIN_ABS_F64 __builtin_fabs #define BUILTIN_ABS_F16 __builtin_fabsf16 #define BUILTIN_ABS_2F16 __llvm_fabs_2f16 #define BUILTIN_BITALIGN_B32 __builtin_amdgcn_alignbit #define BUILTIN_CEIL_F32 __builtin_ceilf #define BUILTIN_CEIL_F64 __builtin_ceil #define BUILTIN_CEIL_F16 __builtin_ceilf16 #define BUILTIN_CEIL_2F16 __llvm_ceil_2f16 #define BUILTIN_CLASS_F32 __builtin_amdgcn_classf #define BUILTIN_CLASS_F64 __builtin_amdgcn_class #define BUILTIN_CLASS_F16 __builtin_amdgcn_classh #define BUILTIN_ISNAN_F32(x) __builtin_isnan(x) #define BUILTIN_ISNAN_F64(x) __builtin_isnan(x) #define BUILTIN_ISNAN_F16(x) __builtin_isnan(x) #define BUILTIN_ISUNORDERED_F32(x, y) __builtin_isunordered(x, y) #define BUILTIN_ISUNORDERED_F64(x, y) __builtin_isunordered(x, y) #define BUILTIN_ISUNORDERED_F16(x, y) __builtin_isunordered(x, y) #define BUILTIN_ISINF_F32(x) __builtin_isinf(x) #define BUILTIN_ISINF_F64(x) 
__builtin_isinf(x) #define BUILTIN_ISINF_F16(x) __builtin_isinf(x) #define BUILTIN_ISFINITE_F32(x) __builtin_amdgcn_classf(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR) #define BUILTIN_ISFINITE_F64(x) __builtin_amdgcn_class(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR) #define BUILTIN_ISFINITE_F16(x) __builtin_amdgcn_classh(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR) #define BUILTIN_COPYSIGN_F32 __builtin_copysignf #define BUILTIN_COPYSIGN_F64 __builtin_copysign #define BUILTIN_COPYSIGN_F16 __builtin_copysignf16 #define BUILTIN_COPYSIGN_2F16 __llvm_copysign_2f16 #define BUILTIN_FIRSTBIT_U32(X) ((X) == 0 ? -1 : __builtin_clz(X)) #define BUILTIN_FLOOR_F32 __builtin_floorf #define BUILTIN_FLOOR_F64 __builtin_floor #define BUILTIN_FLOOR_F16 __builtin_floorf16 #define BUILTIN_FLOOR_2F16 __llvm_floor_2f16 #define BUILTIN_FRACTION_F32(X) ({ \ float _fract_x = X; \ float _fract_r = __builtin_amdgcn_fractf(_fract_x); \ _fract_r = __builtin_amdgcn_classf(_fract_x, CLASS_PINF|CLASS_NINF) ? 0.0f : _fract_r; \ _fract_r; \ }) #define BUILTIN_FRACTION_F64(X) ({ \ double _fract_x = X; \ double _fract_r = __builtin_amdgcn_fract(_fract_x); \ _fract_r = __builtin_amdgcn_class(_fract_x, CLASS_PINF|CLASS_NINF) ? 0.0 : _fract_r; \ _fract_r; \ }) #define BUILTIN_FRACTION_F16(X) ({ \ half _fract_x = X; \ half _fract_r = __builtin_amdgcn_fracth(_fract_x); \ _fract_r = __builtin_amdgcn_classh(_fract_x, CLASS_PINF|CLASS_NINF) ? 0.0h : _fract_r; \ _fract_r; \ }) #define BUILTIN_MAD_U32(A,B,C) ((A)*(B)+(C)) #define BUILTIN_MAX_F32 __builtin_fmaxf #define BUILTIN_MAX_F64 __builtin_fmax #define BUILTIN_MAX_F16 __builtin_fmaxf16 #define BUILTIN_MAX_2F16 __llvm_maxnum_2f16 #define BUILTIN_MAX_S32(A,B) ((A) < (B) ? (B) : (A)) #define BUILTIN_MAX_U32(A,B) ((A) < (B) ? 
(B) : (A)) #define BUILTIN_MIN_F32 __builtin_fminf #define BUILTIN_MIN_F64 __builtin_fmin #define BUILTIN_MIN_F16 __builtin_fminf16 #define BUILTIN_MIN_2F16 __llvm_minnum_2f16 #define BUILTIN_MIN_S32(A,B) ((A) < (B) ? (A) : (B)) #define BUILTIN_MIN_U32(A,B) ((A) < (B) ? (A) : (B)) #define BUILTIN_CANONICALIZE_F32(X) __builtin_canonicalizef(X) #define BUILTIN_CANONICALIZE_F64(X) __builtin_canonicalize(X) #define BUILTIN_CANONICALIZE_F16(X) __builtin_canonicalizef16(X) #define BUILTIN_CANONICALIZE_2F16(X) __llvm_canonicalize_2f16(X) #define BUILTIN_MULHI_U32(A,B) (((ulong)(A) * (ulong)(B)) >> 32) #define BUILTIN_COS_F32 __builtin_amdgcn_cosf #define BUILTIN_EXP2_F32 __builtin_exp2f #define BUILTIN_EXP2_F16 __builtin_exp2f16 #define BUILTIN_LOG2_F32 __builtin_log2f #define BUILTIN_LOG2_F16 __builtin_log2f16 #define BUILTIN_RCP_F32 __builtin_amdgcn_rcpf #define BUILTIN_RCP_F64 __builtin_amdgcn_rcp #define BUILTIN_RCP_F16 __builtin_amdgcn_rcph #define BUILTIN_RSQRT_F32 __builtin_amdgcn_rsqf #define BUILTIN_RSQRT_F64 __builtin_amdgcn_rsq #define BUILTIN_RSQRT_F16 __builtin_amdgcn_rsqh #define BUILTIN_SIN_F32 __builtin_amdgcn_sinf #define BUILTIN_RINT_F32 __builtin_rintf #define BUILTIN_RINT_F64 __builtin_rint #define BUILTIN_RINT_F16 __builtin_rintf16 #define BUILTIN_RINT_2F16 __llvm_rint_2f16 #define BUILTIN_SQRT_F32(X) __builtin_sqrtf(X) #define BUILTIN_SQRT_F64(X) __builtin_sqrt(X) #define BUILTIN_SQRT_F16(X) __builtin_sqrtf16(X) #define BUILTIN_TRUNC_F32 __builtin_truncf #define BUILTIN_TRUNC_F64 __builtin_trunc #define BUILTIN_TRUNC_F16 __builtin_truncf16 #define BUILTIN_TRUNC_2F16 __llvm_trunc_2f16 #define BUILTIN_ROUND_F32 __builtin_roundf #define BUILTIN_ROUND_F64 __builtin_round #define BUILTIN_ROUND_F16 __builtin_roundf16 #define BUILTIN_ROUND_2F16 __llvm_round_2f16 #define BUILTIN_DIV_F32(X,Y) ({ \ float _div_x = X; \ float _div_y = Y; \ float _div_ret = _div_x / _div_y; \ _div_ret; \ }) #define BUILTIN_DIV_F64(X,Y) ({ \ double _div_x = X; \ double _div_y = Y; 
\ double _div_ret = _div_x / _div_y; \ _div_ret; \ }) #define BUILTIN_DIV_F16(X,Y) ({ \ half _div_x = X; \ half _div_y = Y; \ half _div_ret = _div_x / _div_y; \ _div_ret; \ }) #define BUILTIN_FMA_F32 __builtin_fmaf #define BUILTIN_FMA_2F32 __llvm_fma_2f32 #define BUILTIN_FMA_F64 __builtin_fma #define BUILTIN_FMA_F16 __builtin_fmaf16 #define BUILTIN_FMA_2F16 __llvm_fma_2f16 #define BUILTIN_FLDEXP_F32 __builtin_amdgcn_ldexpf #define BUILTIN_FLDEXP_F64 __builtin_amdgcn_ldexp #define BUILTIN_FLDEXP_F16 __builtin_amdgcn_ldexph #define BUILTIN_FREXP_EXP_F32 __builtin_amdgcn_frexp_expf #define BUILTIN_FREXP_EXP_F64 __builtin_amdgcn_frexp_exp #define BUILTIN_FREXP_EXP_F16 __builtin_amdgcn_frexp_exph #define BUILTIN_FREXP_MANT_F32 __builtin_amdgcn_frexp_mantf #define BUILTIN_FREXP_MANT_F64 __builtin_amdgcn_frexp_mant #define BUILTIN_FREXP_MANT_F16 __builtin_amdgcn_frexp_manth #define BUILTIN_CMAX_F32 __builtin_fmaxf #define BUILTIN_CMAX_F64 __builtin_fmax #define BUILTIN_CMAX_F16 __builtin_fmaxf16 #define BUILTIN_CMAX_2F16 __llvm_maxnum_2f16 #define BUILTIN_CMIN_F32 __builtin_fminf #define BUILTIN_CMIN_F64 __builtin_fmin #define BUILTIN_CMIN_F16 __builtin_fminf16 #define BUILTIN_CMIN_2F16 __llvm_minnum_2f16 #define BUILTIN_TRIG_PREOP_F64 __builtin_amdgcn_trig_preop #define BUILTIN_MAD_F32 __ocml_fmuladd_f32 #define BUILTIN_MAD_2F32 __ocml_fmuladd_2f32 #define BUILTIN_MAD_F64 __ocml_fmuladd_f64 #define BUILTIN_MAD_F16 __ocml_fmuladd_f16 #define BUILTIN_MAD_2F16 __ocml_fmuladd_2f16 // HW has ISA for max3, median3, and min3, median3 can be used to clamp #define BUILTIN_CLAMP_S32(X,L,H) ({ \ int _clamp_x = X; \ int _clamp_l = L; \ int _clamp_h = H; \ int _clamp_r = _clamp_x > _clamp_l ? _clamp_x : _clamp_l; \ _clamp_r = _clamp_r < _clamp_h ? 
_clamp_r : _clamp_h; \ _clamp_r; \ }) #define BUILTIN_CLAMP_F32(X,L,H) __builtin_amdgcn_fmed3f(X,L,H) #define BUILTIN_CLAMP_F16(X,L,H) __llvm_amdgcn_fmed3_f16(X,L,H) #define BUILTIN_ADD_RTE_F32 __llvm_add_rte_f32 #define BUILTIN_ADD_RTE_F64 __llvm_add_rte_f64 #define BUILTIN_ADD_RTE_F16 __llvm_add_rte_f16 #define BUILTIN_ADD_RTN_F32 __llvm_add_rtn_f32 #define BUILTIN_ADD_RTN_F64 __llvm_add_rtn_f64 #define BUILTIN_ADD_RTN_F16 __llvm_add_rtn_f16 #define BUILTIN_ADD_RTP_F32 __llvm_add_rtp_f32 #define BUILTIN_ADD_RTP_F64 __llvm_add_rtp_f64 #define BUILTIN_ADD_RTP_F16 __llvm_add_rtp_f16 #define BUILTIN_ADD_RTZ_F32 __llvm_add_rtz_f32 #define BUILTIN_ADD_RTZ_F64 __llvm_add_rtz_f64 #define BUILTIN_ADD_RTZ_F16 __llvm_add_rtz_f16 #define BUILTIN_SUB_RTE_F32 __llvm_sub_rte_f32 #define BUILTIN_SUB_RTE_F64 __llvm_sub_rte_f64 #define BUILTIN_SUB_RTE_F16 __llvm_sub_rte_f16 #define BUILTIN_SUB_RTN_F32 __llvm_sub_rtn_f32 #define BUILTIN_SUB_RTN_F64 __llvm_sub_rtn_f64 #define BUILTIN_SUB_RTN_F16 __llvm_sub_rtn_f16 #define BUILTIN_SUB_RTP_F32 __llvm_sub_rtp_f32 #define BUILTIN_SUB_RTP_F64 __llvm_sub_rtp_f64 #define BUILTIN_SUB_RTP_F16 __llvm_sub_rtp_f16 #define BUILTIN_SUB_RTZ_F32 __llvm_sub_rtz_f32 #define BUILTIN_SUB_RTZ_F64 __llvm_sub_rtz_f64 #define BUILTIN_SUB_RTZ_F16 __llvm_sub_rtz_f16 #define BUILTIN_MUL_RTE_F32 __llvm_mul_rte_f32 #define BUILTIN_MUL_RTE_F64 __llvm_mul_rte_f64 #define BUILTIN_MUL_RTE_F16 __llvm_mul_rte_f16 #define BUILTIN_MUL_RTN_F32 __llvm_mul_rtn_f32 #define BUILTIN_MUL_RTN_F64 __llvm_mul_rtn_f64 #define BUILTIN_MUL_RTN_F16 __llvm_mul_rtn_f16 #define BUILTIN_MUL_RTP_F32 __llvm_mul_rtp_f32 #define BUILTIN_MUL_RTP_F64 __llvm_mul_rtp_f64 #define BUILTIN_MUL_RTP_F16 __llvm_mul_rtp_f16 #define BUILTIN_MUL_RTZ_F32 __llvm_mul_rtz_f32 #define BUILTIN_MUL_RTZ_F64 __llvm_mul_rtz_f64 #define BUILTIN_MUL_RTZ_F16 __llvm_mul_rtz_f16 #define BUILTIN_DIV_RTE_F32 __llvm_div_rte_f32 #define BUILTIN_DIV_RTE_F64 __llvm_div_rte_f64 #define BUILTIN_DIV_RTE_F16 __llvm_div_rte_f16 
#define BUILTIN_DIV_RTN_F32 __llvm_div_rtn_f32 #define BUILTIN_DIV_RTN_F64 __llvm_div_rtn_f64 #define BUILTIN_DIV_RTN_F16 __llvm_div_rtn_f16 #define BUILTIN_DIV_RTP_F32 __llvm_div_rtp_f32 #define BUILTIN_DIV_RTP_F64 __llvm_div_rtp_f64 #define BUILTIN_DIV_RTP_F16 __llvm_div_rtp_f16 #define BUILTIN_DIV_RTZ_F32 __llvm_div_rtz_f32 #define BUILTIN_DIV_RTZ_F64 __llvm_div_rtz_f64 #define BUILTIN_DIV_RTZ_F16 __llvm_div_rtz_f16 #define BUILTIN_SQRT_RTE_F32 __llvm_sqrt_rte_f32 #define BUILTIN_SQRT_RTE_F64 __llvm_sqrt_rte_f64 #define BUILTIN_SQRT_RTE_F16 __llvm_sqrt_rte_f16 #define BUILTIN_SQRT_RTN_F32 __llvm_sqrt_rtn_f32 #define BUILTIN_SQRT_RTN_F64 __llvm_sqrt_rtn_f64 #define BUILTIN_SQRT_RTN_F16 __llvm_sqrt_rtn_f16 #define BUILTIN_SQRT_RTP_F32 __llvm_sqrt_rtp_f32 #define BUILTIN_SQRT_RTP_F64 __llvm_sqrt_rtp_f64 #define BUILTIN_SQRT_RTP_F16 __llvm_sqrt_rtp_f16 #define BUILTIN_SQRT_RTZ_F32 __llvm_sqrt_rtz_f32 #define BUILTIN_SQRT_RTZ_F64 __llvm_sqrt_rtz_f64 #define BUILTIN_SQRT_RTZ_F16 __llvm_sqrt_rtz_f16 #define BUILTIN_FMA_RTE_F32 __llvm_fma_rte_f32 #define BUILTIN_FMA_RTE_F64 __llvm_fma_rte_f64 #define BUILTIN_FMA_RTE_F16 __llvm_fma_rte_f16 #define BUILTIN_FMA_RTN_F32 __llvm_fma_rtn_f32 #define BUILTIN_FMA_RTN_F64 __llvm_fma_rtn_f64 #define BUILTIN_FMA_RTN_F16 __llvm_fma_rtn_f16 #define BUILTIN_FMA_RTP_F32 __llvm_fma_rtp_f32 #define BUILTIN_FMA_RTP_F64 __llvm_fma_rtp_f64 #define BUILTIN_FMA_RTP_F16 __llvm_fma_rtp_f16 #define BUILTIN_FMA_RTZ_F32 __llvm_fma_rtz_f32 #define BUILTIN_FMA_RTZ_F64 __llvm_fma_rtz_f64 #define BUILTIN_FMA_RTZ_F16 __llvm_fma_rtz_f16 ROCm-Device-Libs-rocm-5.0.0/ocml/src/cabsD.cl000066400000000000000000000006631415221260100204120ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(cabs)(double2 x) { return MATH_MANGLE(hypot)(x.s0, x.s1); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/cabsF.cl000066400000000000000000000006611415221260100204120ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(cabs)(float2 x) { return MATH_MANGLE(hypot)(x.s0, x.s1); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/cacosD.cl000066400000000000000000000010121415221260100205570ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double2 MATH_MANGLE(cacos)(double2 z) { double2 a = MATH_MANGLE(cacosh)(z); bool b = AS_INT2(z.y).hi < 0; return (double2)(b ? -a.y : a.y, b ? a.x : -a.x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/cacosF.cl000066400000000000000000000010021415221260100205600ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float2 MATH_MANGLE(cacos)(float2 z) { float2 a = MATH_MANGLE(cacosh)(z); bool b = AS_INT(z.y) < 0; return (float2)(b ? -a.y : a.y, b ? 
a.x : -a.x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/cacoshD.cl000066400000000000000000000037141415221260100207420ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" extern CONSTATTR double4 MATH_PRIVATE(epcsqrtep)(double4 z); extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea); CONSTATTR double2 MATH_MANGLE(cacosh)(double2 z) { double x = BUILTIN_ABS_F64(z.x); double y = BUILTIN_ABS_F64(z.y); double2 l2, t; int e = 0; bool b = true; if (x < 0x1.0p+54 && y < 0x1.0p+54) { if (x >= 1.0 || y >= 0x1.0p-53 || y > (1.0 - x)*0x1.0p-26) { double4 z2p1 = (double4)(add(mul(add(y,x), sub(y,x)), 1.0), mul(y,x)*2.0); double4 rz2m1 = MATH_PRIVATE(epcsqrtep)(z2p1); rz2m1 = (double4)(csgn(rz2m1.hi, (double2)z.x), csgn(rz2m1.lo, (double2)z.y)); double4 s = (double4)(add(rz2m1.lo, z.x), add(rz2m1.hi, z.y)); l2 = add(sqr(s.lo), sqr(s.hi)); t = (double2)(s.s1, z.y == 0.0 ? z.y : s.s3); } else { b = false; double r = MATH_FAST_SQRT(BUILTIN_FMA_F64(-x, x, 1.0)); l2 = con(MATH_DIV(y, r), 0.0); t = (double2)(z.x, BUILTIN_COPYSIGN_F64(r, z.y)); } } else { e = BUILTIN_FREXP_EXP_F64(BUILTIN_MAX_F64(x,y)); x = BUILTIN_FLDEXP_F64(x, -e); y = BUILTIN_FLDEXP_F64(y, -e); l2 = add(sqr(x), sqr(y)); e = 2*e + 2; t = z; } double rr; if (b) { rr = 0.5 * MATH_PRIVATE(lnep)(l2, e); } else { rr = l2.hi; } double ri = MATH_MANGLE(atan2)(t.y, t.x); if (!FINITE_ONLY_OPT()) { rr = (BUILTIN_ISINF_F64(z.x) | BUILTIN_ISINF_F64(z.y)) ? 
AS_DOUBLE(PINFBITPATT_DP64) : rr; } return (double2)(rr, ri); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/cacoshF.cl000066400000000000000000000037271415221260100207500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #define FLOAT_SPECIALIZATION #include "ep.h" extern CONSTATTR float4 MATH_PRIVATE(epcsqrtep)(float4 z); extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea); CONSTATTR float2 MATH_MANGLE(cacosh)(float2 z) { float x = BUILTIN_ABS_F32(z.x); float y = BUILTIN_ABS_F32(z.y); float2 l2, t; int e = 0; bool b = true; if (x < 0x1.0p+25f && y < 0x1.0p+25f) { if (x >= 1.0f || y >= 0x1.0p-24f || y > (1.0f - x)*0x1.0p-12f) { float4 z2p1 = (float4)(add(mul(add(y,x), sub(y,x)), 1.0f), mul(y,x)*2.0f); float4 rz2m1 = MATH_PRIVATE(epcsqrtep)(z2p1); rz2m1 = (float4)(csgn(rz2m1.hi, (float2)z.x), csgn(rz2m1.lo, (float2)z.y)); float4 s = (float4)(add(rz2m1.lo, z.x), add(rz2m1.hi, z.y)); l2 = add(sqr(s.lo), sqr(s.hi)); t = (float2)(s.s1, z.y == 0.0f ? z.y : s.s3); } else { b = false; float r = MATH_SQRT(BUILTIN_FMA_F32(-x, x, 1.0f)); l2 = con(MATH_DIV(y, r), 0.0f); t = (float2)(z.x, BUILTIN_COPYSIGN_F32(r, z.y)); } } else { e = BUILTIN_FREXP_EXP_F32(AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(x), AS_UINT(y)))); x = BUILTIN_FLDEXP_F32(x, -e); y = BUILTIN_FLDEXP_F32(y, -e); l2 = add(sqr(x), sqr(y)); e = 2*e + 2; t = z; } float rr; if (b) { rr = 0.5f * MATH_PRIVATE(lnep)(l2, e); } else { rr = l2.hi; } float ri = MATH_MANGLE(atan2)(t.y, t.x); if (!FINITE_ONLY_OPT()) { rr = (BUILTIN_ISINF_F32(z.x) | BUILTIN_ISINF_F32(z.y)) ? 
AS_FLOAT(PINFBITPATT_SP32) : rr; } return (float2)(rr, ri); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/casinD.cl000066400000000000000000000007461415221260100206010ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double2 MATH_MANGLE(casin)(double2 z) { double2 a = MATH_MANGLE(casinh)((double2)(-z.y, z.x)); return (double2)(a.y, -a.x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/casinF.cl000066400000000000000000000007411415221260100205760ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float2 MATH_MANGLE(casin)(float2 z) { float2 a = MATH_MANGLE(casinh)((float2)(-z.y, z.x)); return (float2)(a.y, -a.x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/casinhD.cl000066400000000000000000000036741415221260100207540ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" extern CONSTATTR double4 MATH_PRIVATE(epcsqrtep)(double4 z); extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea); CONSTATTR double2 MATH_MANGLE(casinh)(double2 z) { double x = BUILTIN_ABS_F64(z.x); double y = BUILTIN_ABS_F64(z.y); double2 l2, t; int e = 0; bool b = true; if (x < 0x1.0p+54 && y < 0x1.0p+54) { if (y >= 1.0 || x >= 0x1.0p-53 || x > (1.0 - y)*0x1.0p-26f) { double4 z2p1 = (double4)(add(mul(add(x,y), sub(x,y)), 1.0), mul(y,x)*2.0); double4 rz2p1 = MATH_PRIVATE(epcsqrtep)(z2p1); double4 s = (double4)(add(rz2p1.lo, x), add(rz2p1.hi, y)); l2 = add(sqr(s.lo), sqr(s.hi)); t = (double2)(s.s1, s.s3); } else { b = false; double r = MATH_SQRT(BUILTIN_FMA_F64(-y, y, 1.0)); l2 = con(MATH_DIV(x, r), 0.0); t = (double2)(r, y); } } else { t = (double2)(x, y); e = BUILTIN_FREXP_EXP_F64(BUILTIN_MAX_F64(x, y)); x = BUILTIN_FLDEXP_F64(x, -e); y = BUILTIN_FLDEXP_F64(y, -e); l2 = add(sqr(x), sqr(y)); e = 2*e + 2; } double rr; if (b) { rr = 0.5 * MATH_PRIVATE(lnep)(l2, e); } else { rr = l2.hi; } rr = BUILTIN_COPYSIGN_F64(rr, z.x); double ri = BUILTIN_COPYSIGN_F64(MATH_MANGLE(atan2)(t.y, t.x), z.y); if (!FINITE_ONLY_OPT()) { double i = BUILTIN_COPYSIGN_F64(AS_DOUBLE(PINFBITPATT_DP64), z.x); rr = (BUILTIN_ISINF_F64(z.x) | BUILTIN_ISINF_F64(z.y)) ? i : rr; } return (double2)(rr, ri); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/casinhF.cl000066400000000000000000000037121415221260100207470ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #define FLOAT_SPECIALIZATION #include "ep.h" extern CONSTATTR float4 MATH_PRIVATE(epcsqrtep)(float4 z); extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea); CONSTATTR float2 MATH_MANGLE(casinh)(float2 z) { float x = BUILTIN_ABS_F32(z.x); float y = BUILTIN_ABS_F32(z.y); float2 l2, t; int e = 0; bool b = true; if (x < 0x1.0p+25f && y < 0x1.0p+25f) { if (y >= 1.0f || x >= 0x1.0p-24f || x > (1.0f - y)*0x1.0p-12f) { float4 z2p1 = (float4)(add(mul(add(x,y), sub(x,y)), 1.0f), mul(y,x)*2.0f); float4 rz2p1 = MATH_PRIVATE(epcsqrtep)(z2p1); float4 s = (float4)(add(rz2p1.lo, x), add(rz2p1.hi, y)); l2 = add(sqr(s.lo), sqr(s.hi)); t = (float2)(s.s1, s.s3); } else { b = false; float r = MATH_SQRT(BUILTIN_FMA_F32(-y, y, 1.0f)); l2 = con(MATH_DIV(x, r), 0.0f); t = (float2)(r, y); } } else { t = (float2)(x, y); e = BUILTIN_FREXP_EXP_F32(AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(x), AS_UINT(y)))); x = BUILTIN_FLDEXP_F32(x, -e); y = BUILTIN_FLDEXP_F32(y, -e); l2 = add(sqr(x), sqr(y)); e = 2*e + 2; } float rr; if (b) { rr = 0.5f * MATH_PRIVATE(lnep)(l2, e); } else { rr = l2.hi; } rr = BUILTIN_COPYSIGN_F32(rr, z.x); float ri = BUILTIN_COPYSIGN_F32(MATH_MANGLE(atan2)(t.y, t.x), z.y); if (!FINITE_ONLY_OPT()) { float i = BUILTIN_COPYSIGN_F32(AS_FLOAT(PINFBITPATT_SP32), z.x); rr = (BUILTIN_ISINF_F32(z.x) | BUILTIN_ISINF_F32(z.y)) ? i : rr; } return (float2)(rr, ri); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/catanD.cl000066400000000000000000000007461415221260100205720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double2 MATH_MANGLE(catan)(double2 z) { double2 a = MATH_MANGLE(catanh)((double2)(-z.y, z.x)); return (double2)(a.y, -a.x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/catanF.cl000066400000000000000000000007411415221260100205670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float2 MATH_MANGLE(catan)(float2 z) { float2 a = MATH_MANGLE(catanh)((float2)(-z.y, z.x)); return (float2)(a.y, -a.x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/catanhD.cl000066400000000000000000000035231415221260100207360ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea); CONSTATTR double2 MATH_MANGLE(catanh)(double2 z) { double x = BUILTIN_ABS_F64(z.x); double y = BUILTIN_ABS_F64(z.y); double rr, ri; if (x < 0x1.0p+54 && y < 0x1.0p+54) { double2 omx = sub(1.0, x); double2 opx = add(1.0, x); double2 y2 = sqr(y); double2 b = sub(mul(omx, opx), y2); ri = 0.5 * MATH_MANGLE(atan2)(2.0 * y, b.hi); double2 a; double2 d = add(sqr(opx), y2); if (x < 0x1.0p-3 * d.hi) { a = fsub(1.0, div(4.0*x, d)); } else { a = div(add(sqr(omx), y2), d); } rr = -0.25 * MATH_PRIVATE(lnep)(a, 0); } else { int e = BUILTIN_FREXP_EXP_F64(BUILTIN_MAX_F64(x, y)); x = BUILTIN_FLDEXP_F64(x, -e); y = BUILTIN_FLDEXP_F64(y, -e); rr = BUILTIN_FLDEXP_F64(MATH_DIV(x, MATH_MAD(x, x, y*y)), -e); ri = 0x1.921fb54442d18p+0; } if (!FINITE_ONLY_OPT()) { rr = ((x == 1.0) & (y == 0.0)) ? AS_DOUBLE(PINFBITPATT_DP64) : rr; rr = x == 0.0 ? 0.0 : rr; rr = BUILTIN_ISINF_F64(x) ? 0.0 : rr; rr = (BUILTIN_ISNAN_F64(x) & BUILTIN_ISINF_F64(y)) ? 0.0 : rr; ri = (BUILTIN_ISNAN_F64(x) & BUILTIN_ISFINITE_F64(y)) ? AS_DOUBLE(QNANBITPATT_DP64) : ri; ri = BUILTIN_ISNAN_F64(y) ? y : ri; } rr = BUILTIN_COPYSIGN_F64(rr, z.x); ri = BUILTIN_COPYSIGN_F64(ri, z.y); return (double2)(rr, ri); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/catanhF.cl000066400000000000000000000035501415221260100207400ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #define FLOAT_SPECIALIZATION #include "ep.h" extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea); CONSTATTR float2 MATH_MANGLE(catanh)(float2 z) { float x = BUILTIN_ABS_F32(z.x); float y = BUILTIN_ABS_F32(z.y); float rr, ri; if (x < 0x1.0p+25f && y < 0x1.0p+25f) { float2 omx = sub(1.0f, x); float2 opx = add(1.0f, x); float2 y2 = sqr(y); float2 b = sub(mul(omx, opx), y2); ri = 0.5f * MATH_MANGLE(atan2)(2.0f * y, b.hi); float2 a; float2 d = add(sqr(opx), y2); if (x < 0x1.0p-3f * d.hi) { a = fsub(1.0f, div(4.0f*x, d)); } else { a = div(add(sqr(omx), y2), d); } rr = -0.25f * MATH_PRIVATE(lnep)(a, 0); } else { int e = BUILTIN_FREXP_EXP_F32(AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(x), AS_UINT(y)))); x = BUILTIN_FLDEXP_F32(x, -e); y = BUILTIN_FLDEXP_F32(y, -e); rr = BUILTIN_FLDEXP_F32(MATH_DIV(x, MATH_MAD(x, x, y*y)), -e); ri = 0x1.921fb6p+0f; } if (!FINITE_ONLY_OPT()) { rr = ((x == 1.0f) & (y == 0.0f)) ? AS_FLOAT(PINFBITPATT_SP32) : rr; rr = x == 0.0f ? 0.0f : rr; rr = BUILTIN_ISINF_F32(x) ? 0.0f : rr; rr = (BUILTIN_ISNAN_F32(x) & BUILTIN_ISINF_F32(y)) ? 0.0f : rr; ri = (BUILTIN_ISNAN_F32(x) & BUILTIN_ISFINITE_F32(y)) ? AS_FLOAT(QNANBITPATT_SP32) : ri; ri = BUILTIN_ISNAN_F32(y) ? 
y : ri; } rr = BUILTIN_COPYSIGN_F32(rr, z.x); ri = BUILTIN_COPYSIGN_F32(ri, z.y); return (float2)(rr, ri); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/cbrtD.cl000066400000000000000000000012211415221260100204230ustar00rootroot00000000000000 #include "mathD.h" CONSTATTR double MATH_MANGLE(cbrt)(double x) { double a = BUILTIN_ABS_F64(x); int e3 = BUILTIN_FREXP_EXP_F64(a); int e = (int)BUILTIN_RINT_F32(0x1.555556p-2f * (float)e3); a = BUILTIN_FLDEXP_F64(a, -3*e); double c = (double)BUILTIN_EXP2_F32(0x1.555556p-2f * BUILTIN_LOG2_F32((float)a)); double c2 = c * c; c = MATH_MAD(c, MATH_FAST_DIV(MATH_MAD(-c, c2, a), MATH_MAD(c+c, c2, a)), c); c = BUILTIN_FLDEXP_F64(c, e); if (!FINITE_ONLY_OPT()) { c = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_PINF|CLASS_NINF|CLASS_PZER|CLASS_NZER) ? x : c; } return BUILTIN_COPYSIGN_F64(c, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/cbrtF.cl000066400000000000000000000020321415221260100204260ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(cbrt)(float x) { if (DAZ_OPT()) { x = BUILTIN_CANONICALIZE_F32(x); } float ax = BUILTIN_ABS_F32(x); if (!DAZ_OPT()) { ax = BUILTIN_CLASS_F32(x, CLASS_NSUB|CLASS_PSUB) ? BUILTIN_FLDEXP_F32(ax, 24) : ax; } float z = BUILTIN_EXP2_F32(0x1.555556p-2f * BUILTIN_LOG2_F32(ax)); z = MATH_MAD(MATH_MAD(MATH_FAST_RCP(z*z), -ax, z), -0x1.555556p-2f, z); if (!DAZ_OPT()) { z = BUILTIN_CLASS_F32(x, CLASS_NSUB|CLASS_PSUB) ? BUILTIN_FLDEXP_F32(z, -8) : z; } z = BUILTIN_CLASS_F32(x, CLASS_QNAN|CLASS_SNAN|CLASS_PINF|CLASS_NINF|CLASS_PZER|CLASS_NZER) ? 
x : z; return BUILTIN_COPYSIGN_F32(z, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/cbrtH.cl000066400000000000000000000012701415221260100204330ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(cbrt) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(cbrt)(half x) { half ret = (half)BUILTIN_EXP2_F32(0x1.555556p-2f * BUILTIN_LOG2_F32((float)BUILTIN_ABS_F16(x))); ret = BUILTIN_COPYSIGN_F16(ret, x); ret = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_PINF|CLASS_NINF|CLASS_PZER|CLASS_NZER) ? x : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ccosD.cl000066400000000000000000000006761415221260100204350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double2 MATH_MANGLE(ccos)(double2 z) { return MATH_MANGLE(ccosh)((double2)(-z.y, z.x)); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ccosF.cl000066400000000000000000000006731415221260100204340ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float2 MATH_MANGLE(ccos)(float2 z) { return MATH_MANGLE(ccosh)((float2)(-z.y, z.x)); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ccoshD.cl000066400000000000000000000030051415221260100205720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 z); CONSTATTR double2 MATH_MANGLE(ccosh)(double2 z) { double x = BUILTIN_ABS_F64(z.x); double2 e = MATH_PRIVATE(epexpep)(sub(x, con(0x1.62e42fefa39efp+0,0x1.abc9e3b39803fp-55))); double2 er = rcp(e); er = ldx(er, -4); double2 cx = fadd(e, er); double2 sx = fsub(e, er); double cy; double sy = MATH_MANGLE(sincos)(z.y, &cy); double cxhi, sxhi; if (FINITE_ONLY_OPT()) { cxhi = cx.hi; sxhi = sx.hi; } else { bool b = x >= 0x1.6395a2079b70cp+9; cxhi = b ? AS_DOUBLE(PINFBITPATT_DP64) : cx.hi; sxhi = b ? AS_DOUBLE(PINFBITPATT_DP64) : sx.hi; } double rr = BUILTIN_FLDEXP_F64(cxhi * cy, 1); bool s = x >= 0x1.0p-27; double ri = BUILTIN_FLDEXP_F64(BUILTIN_COPYSIGN_F64(s ? sxhi : x, z.x) * sy, s); if (!FINITE_ONLY_OPT()) { ri = ((x == 0.0) | (z.y == 0.0)) ? BUILTIN_COPYSIGN_F64(0.0, z.y) : ri; rr = (BUILTIN_ISINF_F64(x) & BUILTIN_CLASS_F64(z.y, CLASS_PINF|CLASS_NINF|CLASS_PZER|CLASS_NZER|CLASS_QNAN|CLASS_SNAN)) ? 
x : rr; } return (double2)(rr, ri); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ccoshF.cl000066400000000000000000000027511415221260100206030ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #define FLOAT_SPECIALIZATION #include "ep.h" extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 z); CONSTATTR float2 MATH_MANGLE(ccosh)(float2 z) { float x = BUILTIN_ABS_F32(z.x); float2 e = MATH_PRIVATE(epexpep)(sub(x, con(0x1.62e430p+0, -0x1.05c610p-28f))); float2 er = rcp(e); er = ldx(er, -4); float2 cx = fadd(e, er); float2 sx = fsub(e, er); float cy; float sy = MATH_MANGLE(sincos)(z.y, &cy); float cxhi, sxhi; if (FINITE_ONLY_OPT()) { cxhi = cx.hi; sxhi = sx.hi; } else { bool b = x >= 0x1.686fc0p+6f; cxhi = b ? AS_FLOAT(PINFBITPATT_SP32) : cx.hi; sxhi = b ? AS_FLOAT(PINFBITPATT_SP32) : sx.hi; } float rr = BUILTIN_FLDEXP_F32(cxhi * cy, 1); bool s = x >= 0x1.0p-12f; float ri = BUILTIN_FLDEXP_F32(BUILTIN_COPYSIGN_F32(s ? sxhi : x, z.x) * sy, s); if (!FINITE_ONLY_OPT()) { ri = ((x == 0.0f) | (z.y == 0.0f)) ? BUILTIN_COPYSIGN_F32(0.0f, z.y) : ri; rr = (BUILTIN_ISINF_F32(x) & BUILTIN_CLASS_F32(z.y, CLASS_PINF|CLASS_NINF|CLASS_PZER|CLASS_NZER|CLASS_QNAN|CLASS_SNAN)) ? x : rr; } return (float2)(rr, ri); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/cdivD.cl000066400000000000000000000054451415221260100204320ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/

#include "mathD.h"

// CP(A,B,C,D) evaluates A*C + B*D. The product B*D is rounded once (_bd), its
// rounding error is recovered with an FMA (_e), and that error is added back
// after A*C + _bd has been formed with a second FMA.
#define CP(A,B,C,D) ({ \
    double _a = A; \
    double _b = B; \
    double _c = C; \
    double _d = D; \
    double _bd = _b * _d; \
    double _e = BUILTIN_FMA_F64(_b, _d, -_bd); \
    double _f = BUILTIN_FMA_F64(_a, _c, _bd); \
    _f + _e; \
})

// Double-precision complex division zn / zd, returned as (real, imag).
//
// The quotient is formed as zn * conj(zd) / |zd|^2. The denominator is
// rescaled by a power of two chosen from the exponents of the larger
// denominator component (ed) and the larger numerator component (en).
CONSTATTR double2
MATH_MANGLE(cdiv)(double2 zn, double2 zd)
{
    double zdx = zd.x;
    double zdy = zd.y;

    // g selects the denominator component with the larger magnitude.
    bool g = BUILTIN_ABS_F64(zdx) > BUILTIN_ABS_F64(zdy);
    int ed = BUILTIN_FREXP_EXP_F64(g ? zdx : zdy);
    int en = BUILTIN_FREXP_EXP_F64(BUILTIN_MAX_F64(BUILTIN_ABS_F64(zn.x), BUILTIN_ABS_F64(zn.y)));

    // Three candidate exponent budgets; the smallest one, halved, is the
    // scale applied (twice, see below) to the denominator.
    int es1 = 1022 - ed;
    int es2 = 1022 - ed - ed;
    int es3 = 1022 - ed - en;
    int es = BUILTIN_MIN_S32(BUILTIN_MIN_S32(es1, es2), es3) >> 1;

    // First scaling: d2 = |zd * 2^es|^2 = |zd|^2 * 2^(2*es).
    zdx = BUILTIN_FLDEXP_F64(zdx, es);
    zdy = BUILTIN_FLDEXP_F64(zdy, es);
    double u = g ? zdx : zdy;
    double v = g ? zdy : zdx;
    double d2 = BUILTIN_FMA_F64(u, u, v*v);

    // Second scaling: the denominator components now carry 2^(2*es) as well,
    // so the factor cancels in tr/d2 and ti/d2.
    zdx = BUILTIN_FLDEXP_F64(zdx, es);
    zdy = BUILTIN_FLDEXP_F64(zdy, es);

    // tr = zn.x*zdx + zn.y*zdy, ti = zn.y*zdx - zn.x*zdy.
    double tr = CP(zn.x, zn.y, zdx, zdy);
    double ti = CP(zn.y, -zn.x, zdx, zdy);

    double rr = MATH_DIV(tr, d2);
    double ri = MATH_DIV(ti, d2);

    if (!FINITE_ONLY_OPT()) {
        // Recovery only runs when both result components came out NaN.
        // NOTE(review): the three cases have the same shape as the C99
        // Annex G complex-division example - confirm that is the intent.
        if (BUILTIN_ISNAN_F64(rr) && BUILTIN_ISNAN_F64(ri)) {
            if (d2 == 0.0 && (!BUILTIN_ISNAN_F64(zn.x) || !BUILTIN_ISNAN_F64(zn.y))) {
                // Zero denominator, numerator not entirely NaN: multiply the
                // numerator by an infinity signed like zd.x.
                double i = BUILTIN_COPYSIGN_F64(AS_DOUBLE(PINFBITPATT_DP64), zd.x);
                rr = i * zn.x;
                ri = i * zn.y;
            } else if ((BUILTIN_ISINF_F64(zn.x) || BUILTIN_ISINF_F64(zn.y)) &&
                       (BUILTIN_ISFINITE_F64(zd.x) && BUILTIN_ISFINITE_F64(zd.y))) {
                // Infinite numerator, finite denominator: replace each
                // numerator component by a signed 1 (infinite) or 0 (finite)
                // and scale the product by +inf.
                double znx = BUILTIN_COPYSIGN_F64(BUILTIN_ISINF_F64(zn.x) ? 1.0 : 0.0, zn.x);
                double zny = BUILTIN_COPYSIGN_F64(BUILTIN_ISINF_F64(zn.y) ? 1.0 : 0.0, zn.y);
                rr = AS_DOUBLE(PINFBITPATT_DP64) * MATH_MAD(znx, zd.x, zny * zd.y);
                ri = AS_DOUBLE(PINFBITPATT_DP64) * MATH_MAD(zny, zd.x, -znx * zd.y);
            } else if ((BUILTIN_ISINF_F64(zd.x) || BUILTIN_ISINF_F64(zd.y)) &&
                       (BUILTIN_ISFINITE_F64(zn.x) && BUILTIN_ISFINITE_F64(zn.y))) {
                // Finite numerator, infinite denominator: same boxing of the
                // denominator, with the product scaled by 0.0 so only the
                // sign of the zero is taken from it.
                zdx = BUILTIN_COPYSIGN_F64(BUILTIN_ISINF_F64(zd.x) ? 1.0 : 0.0, zd.x);
                zdy = BUILTIN_COPYSIGN_F64(BUILTIN_ISINF_F64(zd.y) ? 1.0 : 0.0, zd.y);
                rr = 0.0 * MATH_MAD(zn.x, zdx, zn.y * zdy);
                ri = 0.0 * MATH_MAD(zn.y, zdx, -zn.x * zdy);
            }
        }
    }

    return (double2)(rr, ri);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cdivF.cl000066400000000000000000000054231415221260100204300ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// CP(A,B,C,D) evaluates A*C + B*D. The product B*D is rounded once (_bd), its
// rounding error is recovered with an FMA (_e), and that error is added back
// after A*C + _bd has been formed with a second FMA.
#define CP(A,B,C,D) ({ \
    float _a = A; \
    float _b = B; \
    float _c = C; \
    float _d = D; \
    float _bd = _b * _d; \
    float _e = BUILTIN_FMA_F32(_b, _d, -_bd); \
    float _f = BUILTIN_FMA_F32(_a, _c, _bd); \
    _f + _e; \
})

// Single-precision complex division zn / zd, returned as (real, imag).
// Same structure as the double-precision version, with the exponent budget
// 126 in place of 1022.
CONSTATTR float2
MATH_MANGLE(cdiv)(float2 zn, float2 zd)
{
    float zdx = zd.x;
    float zdy = zd.y;

    // g selects the denominator component with the larger magnitude.
    bool g = BUILTIN_ABS_F32(zdx) > BUILTIN_ABS_F32(zdy);
    int ed = BUILTIN_FREXP_EXP_F32(g ? zdx : zdy);
    int en = BUILTIN_FREXP_EXP_F32(BUILTIN_MAX_F32(BUILTIN_ABS_F32(zn.x), BUILTIN_ABS_F32(zn.y)));

    // Smallest of three exponent budgets, halved, is the per-step scale.
    int es1 = 126 - ed;
    int es2 = 126 - ed - ed;
    int es3 = 126 - ed - en;
    int es = BUILTIN_MIN_S32(BUILTIN_MIN_S32(es1, es2), es3) >> 1;

    // First scaling: d2 = |zd|^2 * 2^(2*es).
    zdx = BUILTIN_FLDEXP_F32(zdx, es);
    zdy = BUILTIN_FLDEXP_F32(zdy, es);
    float u = g ? zdx : zdy;
    float v = g ? zdy : zdx;
    float d2 = BUILTIN_FMA_F32(u, u, v*v);

    // Second scaling so the numerator products carry the same 2^(2*es).
    zdx = BUILTIN_FLDEXP_F32(zdx, es);
    zdy = BUILTIN_FLDEXP_F32(zdy, es);

    // tr = zn.x*zdx + zn.y*zdy, ti = zn.y*zdx - zn.x*zdy.
    float tr = CP(zn.x, zn.y, zdx, zdy);
    float ti = CP(zn.y, -zn.x, zdx, zdy);

    float rr = MATH_DIV(tr, d2);
    float ri = MATH_DIV(ti, d2);

    if (!FINITE_ONLY_OPT()) {
        // Recovery only runs when both result components came out NaN.
        if (BUILTIN_ISNAN_F32(rr) && BUILTIN_ISNAN_F32(ri)) {
            if (d2 == 0.0f && (!BUILTIN_ISNAN_F32(zn.x) || !BUILTIN_ISNAN_F32(zn.y))) {
                // Zero denominator, numerator not entirely NaN.
                float i = BUILTIN_COPYSIGN_F32(AS_FLOAT(PINFBITPATT_SP32), zd.x);
                rr = i * zn.x;
                ri = i * zn.y;
            } else if ((BUILTIN_ISINF_F32(zn.x) || BUILTIN_ISINF_F32(zn.y)) &&
                       (BUILTIN_ISFINITE_F32(zd.x) && BUILTIN_ISFINITE_F32(zd.y))) {
                // Infinite numerator, finite denominator.
                float znx = BUILTIN_COPYSIGN_F32(BUILTIN_ISINF_F32(zn.x) ? 1.0f : 0.0f, zn.x);
                float zny = BUILTIN_COPYSIGN_F32(BUILTIN_ISINF_F32(zn.y) ? 1.0f : 0.0f, zn.y);
                rr = AS_FLOAT(PINFBITPATT_SP32) * MATH_MAD(znx, zd.x, zny * zd.y);
                ri = AS_FLOAT(PINFBITPATT_SP32) * MATH_MAD(zny, zd.x, -znx * zd.y);
            } else if ((BUILTIN_ISINF_F32(zd.x) || BUILTIN_ISINF_F32(zd.y)) &&
                       (BUILTIN_ISFINITE_F32(zn.x) && BUILTIN_ISFINITE_F32(zn.y))) {
                // Finite numerator, infinite denominator.
                zdx = BUILTIN_COPYSIGN_F32(BUILTIN_ISINF_F32(zd.x) ? 1.0f : 0.0f, zd.x);
                zdy = BUILTIN_COPYSIGN_F32(BUILTIN_ISINF_F32(zd.y) ? 1.0f : 0.0f, zd.y);
                rr = 0.0f * MATH_MAD(zn.x, zdx, zn.y * zdy);
                ri = 0.0f * MATH_MAD(zn.y, zdx, -zn.x * zdy);
            }
        }
    }

    return (float2)(rr, ri);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/ceilD.cl000066400000000000000000000006461415221260100204170ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathD.h"

// Double-precision ceil: forwards to BUILTIN_CEIL_F64.
CONSTATTR double
MATH_MANGLE(ceil)(double x)
{
    return BUILTIN_CEIL_F64(x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/ceilF.cl000066400000000000000000000006441415221260100204170ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// Single-precision ceil: forwards to BUILTIN_CEIL_F32.
CONSTATTR float
MATH_MANGLE(ceil)(float x)
{
    return BUILTIN_CEIL_F32(x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/ceilH.cl000066400000000000000000000007651415221260100204250ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

// Packed half2 ceil: forwards to BUILTIN_CEIL_2F16.
CONSTATTR half2
MATH_MANGLE2(ceil)(half2 x)
{
    return BUILTIN_CEIL_2F16(x);
}

// Scalar half ceil: forwards to BUILTIN_CEIL_F16.
CONSTATTR half
MATH_MANGLE(ceil)(half x)
{
    return BUILTIN_CEIL_F16(x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cexpD.cl000066400000000000000000000023351415221260100204370ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// Double-precision complex exponential of z = (x, y):
//   real = exp(x) * cos(y),  imag = exp(x) * sin(y).
//
// When x > 709.0 the exponential is taken at x - 1 and the constant e1
// (approximately e) is multiplied into the sine/cosine terms instead,
// presumably so exp() itself does not overflow before a small sin/cos factor
// can bring the product back into range.
CONSTATTR double2
MATH_MANGLE(cexp)(double2 z)
{
    double x = z.s0;
    double y = z.s1;

    double cy;
    double sy = MATH_MANGLE(sincos)(y, &cy);

    bool g = x > 709.0;
    // Double literals here: the float-suffixed forms belong to the
    // single-precision twin of this function.
    double ex = MATH_MANGLE(exp)(x - (g ? 1.0 : 0.0));
    const double e1 = 0x1.5bf0a8b145769p+1;
    cy *= g ? e1 : 1.0;
    sy *= g ? e1 : 1.0;

    double rr = ex * cy;
    double ri = ex * sy;

    if (!FINITE_ONLY_OPT()) {
        // b: y is infinite or NaN.
        bool b = BUILTIN_CLASS_F64(y, CLASS_NINF|CLASS_PINF|CLASS_QNAN|CLASS_SNAN);
        if (BUILTIN_CLASS_F64(x, CLASS_NINF)) {
            // NOTE(review): rr is forced to +0.0 even for finite y, which
            // discards the sign cos(y) would give the zero - confirm this is
            // intended.
            rr = 0.0;
            ri = b ? 0.0 : ri;
        }
        if (BUILTIN_CLASS_F64(x, CLASS_PINF)) {
            rr = b ? AS_DOUBLE(PINFBITPATT_DP64) : rr;
            ri = b ? AS_DOUBLE(QNANBITPATT_DP64) : ri;
            // A zero y passes through unchanged, keeping its sign.
            ri = y == 0.0 ? y : ri;
        }
        // NaN real part with zero imaginary part keeps the (signed) zero.
        ri = (BUILTIN_ISNAN_F64(x) & (y == 0.0)) ? y : ri;
    }

    return (double2)(rr, ri);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cexpF.cl000066400000000000000000000023201415221260100204330ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// Single-precision complex exponential of z = (x, y):
//   real = exp(x) * cos(y),  imag = exp(x) * sin(y).
//
// When x > 88.0f the exponential is taken at x - 1 and the constant e1
// (approximately e) is multiplied into the sine/cosine terms instead.
CONSTATTR float2
MATH_MANGLE(cexp)(float2 z)
{
    float x = z.s0;
    float y = z.s1;

    float cy;
    float sy = MATH_MANGLE(sincos)(y, &cy);

    bool g = x > 88.0f;
    float ex = MATH_MANGLE(exp)(x - (g ? 1.0f : 0.0f));
    const float e1 = 0x1.5bf0a8p+1f;
    cy *= g ? e1 : 1.0f;
    sy *= g ? e1 : 1.0f;

    float rr = ex * cy;
    float ri = ex * sy;

    if (!FINITE_ONLY_OPT()) {
        // b: y is infinite or NaN.
        bool b = BUILTIN_CLASS_F32(y, CLASS_NINF|CLASS_PINF|CLASS_QNAN|CLASS_SNAN);
        if (BUILTIN_CLASS_F32(x, CLASS_NINF)) {
            // NOTE(review): rr is forced to +0.0f even for finite y, which
            // discards the sign cos(y) would give the zero - confirm this is
            // intended.
            rr = 0.0f;
            ri = b ? 0.0f : ri;
        }
        if (BUILTIN_CLASS_F32(x, CLASS_PINF)) {
            rr = b ? AS_FLOAT(PINFBITPATT_SP32) : rr;
            ri = b ? AS_FLOAT(QNANBITPATT_SP32) : ri;
            // A zero y passes through unchanged, keeping its sign.
            ri = y == 0.0f ? y : ri;
        }
        // NaN real part with zero imaginary part keeps the (signed) zero.
        ri = (BUILTIN_ISNAN_F32(x) & (y == 0.0f)) ? y : ri;
    }

    return (float2)(rr, ri);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/clogD.cl000066400000000000000000000021341415221260100204210ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea);

// Double-precision complex logarithm of z = (x, y), returned as
// (real, imag) with imag = atan2(y, x).
//
// |x| and |y| are scaled by 2^-e, where e is the frexp exponent of the larger
// of the two, before being squared and summed with the ep.h helpers; the
// exponent 2*e is passed to lnep alongside the sum.
// NOTE(review): lnep is presumably an extended-precision natural log that
// re-applies the 2^ea factor - confirm against its definition.
CONSTATTR double2
MATH_MANGLE(clog)(double2 z)
{
    double x = z.s0;
    double y = z.s1;
    double a = BUILTIN_ABS_F64(x);
    double b = BUILTIN_ABS_F64(y);
    double t = BUILTIN_MAX_F64(a, b);
    int e = BUILTIN_FREXP_EXP_F64(t) ;
    a = BUILTIN_FLDEXP_F64(a, -e);
    b = BUILTIN_FLDEXP_F64(b, -e);

    // Half the log of the squared magnitude gives the log of the magnitude.
    double rr = 0.5 * MATH_PRIVATE(lnep)(add(sqr(a), sqr(b)), 2*e);
    double ri = MATH_MANGLE(atan2)(y, x);

    if (!FINITE_ONLY_OPT()) {
        // z == 0 yields -inf; any infinite component yields +inf (applied
        // last, so it wins over everything computed above).
        rr = ((x == 0.0) & (y == 0.0)) ? AS_DOUBLE(NINFBITPATT_DP64) : rr;
        rr = (BUILTIN_ISINF_F64(x) | BUILTIN_ISINF_F64(y)) ? AS_DOUBLE(PINFBITPATT_DP64) : rr;
    }

    return (double2)(rr, ri);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/clogF.cl000066400000000000000000000021541415221260100204250ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

#define FLOAT_SPECIALIZATION
#include "ep.h"

extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea);

// Single-precision complex logarithm of z = (x, y), returned as
// (real, imag) with imag = atan2(y, x).
//
// Same structure as the double-precision version, except that the larger of
// |x| and |y| is picked by comparing their bit patterns as unsigned integers
// rather than with a floating-point max.
CONSTATTR float2
MATH_MANGLE(clog)(float2 z)
{
    float x = z.s0;
    float y = z.s1;
    float a = BUILTIN_ABS_F32(x);
    float b = BUILTIN_ABS_F32(y);
    // Both operands have had their sign cleared by BUILTIN_ABS_F32 before
    // this unsigned comparison of the raw bits.
    float t = AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(a), AS_UINT(b)));
    int e = BUILTIN_FREXP_EXP_F32(t) ;
    a = BUILTIN_FLDEXP_F32(a, -e);
    b = BUILTIN_FLDEXP_F32(b, -e);

    // Half the log of the squared magnitude gives the log of the magnitude.
    float rr = 0.5f * MATH_PRIVATE(lnep)(add(sqr(a), sqr(b)), 2*e);
    float ri = MATH_MANGLE(atan2)(y, x);

    if (!FINITE_ONLY_OPT()) {
        // z == 0 yields -inf; any infinite component yields +inf (applied
        // last, so it wins over everything computed above).
        rr = ((x == 0.0f) & (y == 0.0f)) ? AS_FLOAT(NINFBITPATT_SP32) : rr;
        rr = (BUILTIN_ISINF_F32(x) | BUILTIN_ISINF_F32(y)) ? AS_FLOAT(PINFBITPATT_SP32) : rr;
    }

    return (float2)(rr, ri);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/copysignD.cl000066400000000000000000000006741415221260100213370ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// Double-precision copysign: forwards to BUILTIN_COPYSIGN_F64(x, y).
CONSTATTR double
MATH_MANGLE(copysign)(double x, double y)
{
    return BUILTIN_COPYSIGN_F64(x, y);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/copysignF.cl000066400000000000000000000006711415221260100213360ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// Single-precision copysign: forwards to BUILTIN_COPYSIGN_F32(x, y).
CONSTATTR float
MATH_MANGLE(copysign)(float x, float y)
{
    return BUILTIN_COPYSIGN_F32(x, y);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/copysignH.cl000066400000000000000000000010341415221260100213320ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathH.h"

// Packed half2 copysign: forwards to BUILTIN_COPYSIGN_2F16(x, y).
CONSTATTR half2
MATH_MANGLE2(copysign)(half2 x, half2 y)
{
    return BUILTIN_COPYSIGN_2F16(x, y);
}

// Scalar half copysign: forwards to BUILTIN_COPYSIGN_F16(x, y).
CONSTATTR half
MATH_MANGLE(copysign)(half x, half y)
{
    return BUILTIN_COPYSIGN_F16(x, y);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cosD.cl000066400000000000000000000014321415221260100202610ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"
#include "trigredD.h"

// Double-precision cosine.
//
// |x| is reduced by trigred into a hi/lo remainder and an index r.i; the
// sine and cosine of the remainder come from sincosred2. Odd r.i selects the
// negated sine, even r.i the cosine, and r.i > 1 flips the sign bit of the
// result. Non-finite input returns a quiet NaN unless FINITE_ONLY_OPT().
double
MATH_MANGLE(cos)(double x)
{
    double ax = BUILTIN_ABS_F64(x);
    struct redret r = MATH_PRIVATE(trigred)(ax);
    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
    sc.s = -sc.s;

    int2 c = AS_INT2((r.i & 1) != 0 ? sc.s : sc.c);
    // The sign bit of a double lives in the high 32-bit lane.
    c.hi ^= r.i > 1 ? (int)0x80000000 : 0;

    if (!FINITE_ONLY_OPT()) {
        c = BUILTIN_ISFINITE_F64(ax) ? c : AS_INT2(QNANBITPATT_DP64);
    }

    return AS_DOUBLE(c);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cosF.cl000066400000000000000000000015771415221260100202750ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"
#include "trigredF.h"

// Single-precision cosine.
//
// |x| is reduced by trigred; the sine and cosine of the remainder come from
// sincosred2 (hi and lo parts) when EXTRA_PRECISION is defined, otherwise
// from sincosred (hi part only). Odd r.i selects the negated sine, even r.i
// the cosine, and r.i > 1 flips the sign bit. Non-finite input returns a
// quiet NaN unless FINITE_ONLY_OPT().
float
MATH_MANGLE(cos)(float x)
{
    float ax = BUILTIN_ABS_F32(x);
    // ax is already a float, so it is passed to trigred directly.
    struct redret r = MATH_PRIVATE(trigred)(ax);

#if defined EXTRA_PRECISION
    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
#else
    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
#endif
    sc.s = -sc.s;

    float c = (r.i & 1) != 0 ? sc.s : sc.c;
    c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 0x80000000 : 0));

    if (!FINITE_ONLY_OPT()) {
        c = BUILTIN_ISFINITE_F32(ax) ? c : AS_FLOAT(QNANBITPATT_SP32);
    }

    return c;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cosH.cl000066400000000000000000000014651415221260100202730ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"
#include "trigredH.h"

UGEN(cos)

// Half-precision cosine. Same selection scheme as the wider versions: even
// r.i picks the cosine, odd r.i the negated sine, r.i > 1 flips the sign bit
// (0x8000 for a 16-bit value). Non-finite input returns a quiet NaN unless
// FINITE_ONLY_OPT().
REQUIRES_16BIT_INSTS half
MATH_MANGLE(cos)(half x)
{
    half ax = BUILTIN_ABS_F16(x);
    struct redret r = MATH_PRIVATE(trigred)(ax);
    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
    sc.s = -sc.s;

    short c = AS_SHORT((r.i & 1) == (short)0 ? sc.c : sc.s);
    c ^= r.i > 1 ? (short)0x8000 : (short)0;

    if (!FINITE_ONLY_OPT()) {
        c = BUILTIN_ISFINITE_F16(ax) ? c : (short)QNANBITPATT_HP16;
    }

    return AS_HALF(c);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cosbD.cl000066400000000000000000000026151415221260100204270ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"
#include "trigredD.h"

// FSUM2: H = rounded A + B, L = B - (H - A), the part of B the rounded sum
// did not absorb.
#define FSUM2(A, B, H, L) \
    do { \
        double __s = A + B; \
        double __t = B - (__s - A); \
        H = __s; \
        L = __t; \
    } while (0)

// FDIF2: H = rounded A - B, L = (A - H) - B, the matching correction term
// for the difference.
#define FDIF2(A, B, H, L) \
    do { \
        double __d = A - B; \
        double __e = (A - __d) - B; \
        H = __d; \
        L = __e; \
    } while (0)

// Private double-precision cosine helper taking an extra index offset n and
// an extra argument offset p.
//
// x is reduced by trigred; b records whether the reduced hi part is below p,
// and the index becomes (r.i - b - n) & 3. A hi/lo pi/4 constant (sign chosen
// by b) minus p is then added to the reduced argument in two-part
// arithmetic before sincosred2 is evaluated. The final selection matches the
// public cosine: odd index -> negated sine, index > 1 -> sign flip.
// NOTE(review): no non-finite handling is done here; presumably callers
// guarantee a finite x - confirm.
double
MATH_PRIVATE(cosb)(double x, int n, double p)
{
    struct redret r = MATH_PRIVATE(trigred)(x);
    bool b = r.hi < p;
    r.i = (r.i - b - n) & 3;

    // This is a properly signed extra precise pi/4
    double ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0)));
    double pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0)));

    double sh, sl;

    // (ph, pl) <- (ph, pl) - p
    FDIF2(ph, p, ph, sl);
    pl += sl;
    FSUM2(ph, pl, ph, pl);

    // (sh, sl) <- (ph, pl) + (r.hi, r.lo)
    FSUM2(ph, r.hi, sh, sl);
    sl += pl + r.lo;
    FSUM2(sh, sl, sh, sl);

    struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl);
    sc.s = -sc.s;

    int2 c = AS_INT2((r.i & 1) != 0 ? sc.s : sc.c);
    // Cast keeps the XOR operand the same signed type as the int lane, as in
    // the public double-precision cosine.
    c.hi ^= r.i > 1 ? (int)0x80000000 : 0;
    return AS_DOUBLE(c);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cosbF.cl000066400000000000000000000027071415221260100204330ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"
#include "trigredF.h"

// FSUM2: H = rounded A + B, L = B - (H - A), the part of B the rounded sum
// did not absorb.
#define FSUM2(A, B, H, L) \
    do { \
        float __s = A + B; \
        float __t = B - (__s - A); \
        H = __s; \
        L = __t; \
    } while (0)

// FDIF2: H = rounded A - B, L = (A - H) - B, the matching correction term
// for the difference.
#define FDIF2(A, B, H, L) \
    do { \
        float __d = A - B; \
        float __e = (A - __d) - B; \
        H = __d; \
        L = __e; \
    } while (0)

// Private single-precision cosine helper taking an extra index offset n and
// an extra argument offset p.
//
// With EXTRA_PRECISION the pi/4 constant and the reduced argument are
// combined in hi/lo arithmetic and evaluated by sincosred2; otherwise the
// adjustment is a plain float expression fed to sincosred. The index and
// sign selection match the double-precision helper.
float
MATH_PRIVATE(cosb)(float x, int n, float p)
{
    struct redret r = MATH_PRIVATE(trigred)(x);
    bool b = r.hi < p;
    r.i = (r.i - b - n) & 3;

#if defined EXTRA_PRECISION
    // 0x3f490fdb is pi/4 as a float bit pattern; the XOR picks its sign.
    float ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
    float pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0));

    float sh, sl;

    // (ph, pl) <- (ph, pl) - p
    FDIF2(ph, p, ph, sl);
    pl += sl;
    FSUM2(ph, pl, ph, pl);

    // (sh, sl) <- (ph, pl) + (r.hi, r.lo)
    FSUM2(ph, r.hi, sh, sl);
    sl += pl + r.lo;
    FSUM2(sh, sl, sh, sl);

    struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl);
#else
    r.hi = r.hi - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
#endif
    sc.s = -sc.s;

    float c = (r.i & 1) != 0 ? sc.s : sc.c;
    c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 0x80000000 : 0));
    return c;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/coshD.cl000066400000000000000000000014601415221260100204320ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x);

// Double-precision hyperbolic cosine.
//
// The constant pair subtracted from |x| is approximately ln(2). Assuming
// epexpep is an extended-precision exp, e is exp|x|/2 and rcp(e) scaled by
// 2^-2 is exp(-|x|)/2, so their sum is cosh|x|. Inputs at or above the
// threshold return +inf unless FINITE_ONLY_OPT().
CONSTATTR double
MATH_MANGLE(cosh)(double x)
{
    x = BUILTIN_ABS_F64(x);
    double2 e = MATH_PRIVATE(epexpep)(sub(x, con(0x1.62e42fefa39efp-1,0x1.abc9e3b39803fp-56)));
    double2 c = fadd(e, ldx(rcp(e), -2));
    double z = c.hi;

    if (!FINITE_ONLY_OPT()) {
        z = x >= 0x1.633ce8fb9f87ep+9 ? AS_DOUBLE(PINFBITPATT_DP64) : z;
    }

    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/coshF.cl000066400000000000000000000014241415221260100204340ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

#define FLOAT_SPECIALIZATION
#include "ep.h"

extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x);

// Single-precision hyperbolic cosine; same construction as the
// double-precision version. Inputs strictly above the threshold return +inf
// unless FINITE_ONLY_OPT().
CONSTATTR float
MATH_MANGLE(cosh)(float x)
{
    x = BUILTIN_ABS_F32(x);
    float2 e = MATH_PRIVATE(epexpep)(sub(x, con(0x1.62e430p-1f, -0x1.05c610p-29f)));
    float2 c = fadd(e, ldx(rcp(e), -2));
    float z = c.hi;

    if (!FINITE_ONLY_OPT()) {
        z = x > 0x1.65a9f8p+6f ? AS_FLOAT(PINFBITPATT_SP32) : z;
    }

    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/coshH.cl000066400000000000000000000010141415221260100204310ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

CONSTATTR UGEN(cosh)

// Half-precision hyperbolic cosine, evaluated in float: the input is
// multiplied by a constant approximately equal to log2(e) so that
// exp2(x) and exp2(-x) stand in for exp(hx) and exp(-hx); their mean is
// converted back to half.
CONSTATTR half
MATH_MANGLE(cosh)(half hx)
{
    float x = (float)hx * 0x1.715476p+0f;
    return (half)(0.5f * (BUILTIN_EXP2_F32(x) + BUILTIN_EXP2_F32(-x)));
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cospiD.cl000066400000000000000000000014331415221260100206130ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"
#include "trigpiredD.h"

// Double-precision cospi.
//
// |x| is reduced by trigpired and the sine/cosine of the remainder come from
// sincospired. Even r.i selects the cosine, odd r.i the negated sine, and
// r.i > 1 flips the sign bit. Non-finite input returns a quiet NaN unless
// FINITE_ONLY_OPT().
double
MATH_MANGLE(cospi)(double x)
{
    double ax = BUILTIN_ABS_F64(x);
    struct redret r = MATH_PRIVATE(trigpired)(ax);
    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
    sc.s = -sc.s;

    int2 c = AS_INT2((r.i & 1) == 0 ? sc.c : sc.s);
    // The sign bit of a double lives in the high 32-bit lane.
    c.hi ^= r.i > 1 ? (int)0x80000000 : 0;

    if (!FINITE_ONLY_OPT()) {
        c = BUILTIN_ISFINITE_F64(ax) ? c : AS_INT2(QNANBITPATT_DP64);
    }

    return AS_DOUBLE(c);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cospiF.cl000066400000000000000000000014401415221260100206130ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathF.h"
#include "trigpiredF.h"

// Single-precision cospi.
//
// The magnitude of x is reduced by trigpired, and sincospired supplies the
// sine and cosine of the reduced hi part. The reduction index picks the
// cosine (even) or the negated sine (odd); an index above 1 toggles the sign
// bit. A non-finite input produces a quiet NaN unless FINITE_ONLY_OPT().
CONSTATTR float
MATH_MANGLE(cospi)(float x)
{
    float mag = BUILTIN_ABS_F32(x);
    struct redret red = MATH_PRIVATE(trigpired)(mag);
    struct scret sc = MATH_PRIVATE(sincospired)(red.hi);

    float neg_sin = -sc.s;
    float picked = (red.i & 1) != 0 ? neg_sin : sc.c;

    // XOR mask for the float sign bit; zero leaves the value untouched.
    uint flip = red.i > 1 ? 0x80000000 : 0;
    float result = AS_FLOAT(AS_INT(picked) ^ flip);

    if (!FINITE_ONLY_OPT()) {
        if (!BUILTIN_ISFINITE_F32(mag)) {
            result = AS_FLOAT(QNANBITPATT_SP32);
        }
    }

    return result;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/cospiH.cl000066400000000000000000000015151415221260100206200ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"
#include "trigpiredH.h"

UGEN(cospi)

// Half-precision cospi.
//
// Same scheme as the single-precision version, carried out on 16-bit
// patterns: even index -> cosine, odd index -> negated sine, index above 1
// toggles bit 15. A non-finite input produces a quiet NaN unless
// FINITE_ONLY_OPT().
REQUIRES_16BIT_INSTS half
MATH_MANGLE(cospi)(half x)
{
    half mag = BUILTIN_ABS_F16(x);
    struct redret red = MATH_PRIVATE(trigpired)(mag);
    struct scret sc = MATH_PRIVATE(sincospired)(red.hi);

    half neg_sin = -sc.s;
    short bits = AS_SHORT((red.i & (short)1) == (short)0 ? sc.c : neg_sin);

    // XOR mask for the half sign bit; zero leaves the value untouched.
    short flip = red.i > (short)1 ? (short)0x8000 : (short)0;
    bits ^= flip;

    if (!FINITE_ONLY_OPT()) {
        if (!BUILTIN_ISFINITE_F16(mag)) {
            bits = (short)QNANBITPATT_HP16;
        }
    }

    return AS_HALF(bits);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/csinD.cl000066400000000000000000000007441415221260100204360ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathD.h"

// Double-precision complex sine, computed through csinh:
// the argument (-z.y, z.x) is i*z, and the result (r.y, -r.x) is -i*r,
// i.e. csin(z) = -i * csinh(i*z).
CONSTATTR double2
MATH_MANGLE(csin)(double2 z)
{
    double2 r = MATH_MANGLE(csinh)((double2)(-z.y, z.x));
    return (double2)(r.y, -r.x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/csinF.cl000066400000000000000000000007371415221260100204420ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// Single-precision complex sine, computed through csinh:
// the argument (-z.y, z.x) is i*z, and the result (r.y, -r.x) is -i*r,
// i.e. csin(z) = -i * csinh(i*z).
CONSTATTR float2
MATH_MANGLE(csin)(float2 z)
{
    float2 r = MATH_MANGLE(csinh)((float2)(-z.y, z.x));
    return (float2)(r.y, -r.x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/csinhD.cl000066400000000000000000000030761415221260100206070ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 z);

// Double-precision complex hyperbolic sine of z = (x, y):
//   real = sinh(x) * cos(y),  imag = cosh(x) * sin(y).
//
// The constant pair subtracted from |x| is approximately 2*ln(2) and the
// reciprocal is scaled by 2^-4. Assuming epexpep is an extended-precision
// exp and rcp/ldx/fadd/fsub are the ep.h helpers their names suggest, cx and
// sx hold cosh|x|/2 and sinh|x|/2; the BUILTIN_FLDEXP_F64(..., 1) calls
// restore the factor of two.
CONSTATTR double2
MATH_MANGLE(csinh)(double2 z)
{
    double x = BUILTIN_ABS_F64(z.x);
    double2 e = MATH_PRIVATE(epexpep)(sub(x, con(0x1.62e42fefa39efp+0,0x1.abc9e3b39803fp-55)));
    double2 er = rcp(e);
    er = ldx(er, -4);
    double2 cx = fadd(e, er);
    double2 sx = fsub(e, er);

    double cy;
    double sy = MATH_MANGLE(sincos)(z.y, &cy);

    double cxhi, sxhi;
    if (FINITE_ONLY_OPT()) {
        cxhi = cx.hi;
        sxhi = sx.hi;
    } else {
        // At or beyond this |x| both hyperbolic terms are forced to +inf.
        bool b = x >= 0x1.6395a2079b70cp+9;
        cxhi = b ? AS_DOUBLE(PINFBITPATT_DP64) : cx.hi;
        sxhi = b ? AS_DOUBLE(PINFBITPATT_DP64) : sx.hi;
    }

    // For tiny |x| the sinh term is replaced by x itself; 's' then doubles as
    // the exponent adjustment (0 when x is used directly, 1 otherwise).
    bool s = x >= 0x1.0p-27;
    double rr = BUILTIN_FLDEXP_F64(BUILTIN_COPYSIGN_F64(s ? sxhi : x, z.x) * cy, s);
    double ri = BUILTIN_FLDEXP_F64(cxhi * sy, 1);

    if (!FINITE_ONLY_OPT()) {
        // Zero or infinite x with infinite/NaN y: the real part is z.x.
        rr = (BUILTIN_CLASS_F64(x, CLASS_PZER|CLASS_NZER|CLASS_PINF|CLASS_NINF) &
              BUILTIN_CLASS_F64(z.y, CLASS_PINF|CLASS_NINF|CLASS_QNAN|CLASS_SNAN)) ? z.x : rr;
        // Infinite or NaN x with zero y: the imaginary part is the (signed)
        // zero z.y.
        ri = (BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF|CLASS_QNAN|CLASS_SNAN) & (z.y == 0.0)) ? z.y : ri;
    }

    return (double2)(rr, ri);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/csinhF.cl000066400000000000000000000030351415221260100206040ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

#define FLOAT_SPECIALIZATION
#include "ep.h"

extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 z);

// Single-precision complex hyperbolic sine of z = (x, y):
//   real = sinh(x) * cos(y),  imag = cosh(x) * sin(y).
// Same construction as the double-precision version.
CONSTATTR float2
MATH_MANGLE(csinh)(float2 z)
{
    float x = BUILTIN_ABS_F32(z.x);
    // Both halves of the constant pair carry the 'f' suffix so no
    // double-precision literal appears in single-precision code.
    float2 e = MATH_PRIVATE(epexpep)(sub(x, con(0x1.62e430p+0f, -0x1.05c610p-28f)));
    float2 er = rcp(e);
    er = ldx(er, -4);
    float2 cx = fadd(e, er);
    float2 sx = fsub(e, er);

    float cy;
    float sy = MATH_MANGLE(sincos)(z.y, &cy);

    float cxhi, sxhi;
    if (FINITE_ONLY_OPT()) {
        cxhi = cx.hi;
        sxhi = sx.hi;
    } else {
        // At or beyond this |x| both hyperbolic terms are forced to +inf.
        bool b = x >= 0x1.686fc0p+6f;
        cxhi = b ? AS_FLOAT(PINFBITPATT_SP32) : cx.hi;
        sxhi = b ? AS_FLOAT(PINFBITPATT_SP32) : sx.hi;
    }

    // For tiny |x| the sinh term is replaced by x itself; 's' then doubles as
    // the exponent adjustment (0 when x is used directly, 1 otherwise).
    bool s = x >= 0x1.0p-12f;
    float rr = BUILTIN_FLDEXP_F32(BUILTIN_COPYSIGN_F32(s ? sxhi : x, z.x) * cy, s);
    float ri = BUILTIN_FLDEXP_F32(cxhi * sy, 1);

    if (!FINITE_ONLY_OPT()) {
        // Zero or infinite x with infinite/NaN y: the real part is z.x.
        rr = (BUILTIN_CLASS_F32(x, CLASS_PZER|CLASS_NZER|CLASS_PINF|CLASS_NINF) &
              BUILTIN_CLASS_F32(z.y, CLASS_PINF|CLASS_NINF|CLASS_QNAN|CLASS_SNAN)) ? z.x : rr;
        // Infinite or NaN x with zero y: the imaginary part is the (signed)
        // zero z.y.
        ri = (BUILTIN_CLASS_F32(x, CLASS_PINF|CLASS_NINF|CLASS_QNAN|CLASS_SNAN) & (z.y == 0.0f)) ? z.y : ri;
    }

    return (float2)(rr, ri);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/csqrtD.cl000066400000000000000000000030061415221260100206300ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// Double-precision complex square root of z = (x, y).
//
// With a = |x|, b = |y| scaled by 2^-e (e = frexp exponent of the larger),
// the code forms p = sqrt((|z| + |x|) / 2) and q = |y| / (2p); the parity
// bit k keeps the exponent handed to the second square root even. For
// x >= 0 the result is (p, q), for x < 0 it is (q, p), and the imaginary
// part takes the sign of z.y.
CONSTATTR double2
MATH_MANGLE(csqrt)(double2 z)
{
    double a = BUILTIN_ABS_F64(z.x);
    double b = BUILTIN_ABS_F64(z.y);
    double t = BUILTIN_MAX_F64(a, b);

    if (!FINITE_ONLY_OPT()) {
        // Make t NaN whenever either component is NaN.
        t = (BUILTIN_ISNAN_F64(a) | BUILTIN_ISNAN_F64(b)) ? AS_DOUBLE(QNANBITPATT_DP64) : t;
    }

    int e = BUILTIN_FREXP_EXP_F64(t);
    double as = BUILTIN_FLDEXP_F64(a, -e);
    double bs = BUILTIN_FLDEXP_F64(b, -e);

    // o: t is zero, infinite or NaN; in that case p is simply t.
    bool o = BUILTIN_CLASS_F64(t, CLASS_NZER|CLASS_PZER|CLASS_NINF|CLASS_PINF|CLASS_QNAN|CLASS_SNAN);
    double p = MATH_FAST_SQRT(MATH_MAD(as, as, bs*bs));
    p = o ? t : p;
    int k = (e & 1) ^ 1;
    p = BUILTIN_FLDEXP_F64(p + as, k);
    p = BUILTIN_FLDEXP_F64(MATH_FAST_SQRT(p), (e >> 1) - k);
    p = o ? t : p;
    double q = BUILTIN_FLDEXP_F64(MATH_DIV(b, p), -1);
    // Avoid 0/0 when the input is zero.
    q = t == 0.0 ? t : q;

    bool l = z.x < 0.0;
    double rr = l ? q : p;
    double ri = l ? p : q;

    if (!FINITE_ONLY_OPT()) {
        // Infinite |y| gives (inf, inf); an infinite x, tested afterwards,
        // overrides one component with |x|.
        bool i = BUILTIN_ISINF_F64(b);
        rr = i ? b : rr;
        ri = i ? b : ri;
        ri = BUILTIN_CLASS_F64(z.x, CLASS_NINF) ? a : ri;
        rr = BUILTIN_CLASS_F64(z.x, CLASS_PINF) ? a : rr;
    }

    return (double2)(rr, BUILTIN_COPYSIGN_F64(ri, z.y));
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/csqrtF.cl000066400000000000000000000023341415221260100206350ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathF.h"

// Single-precision complex square root of z = (x, y).
//
// With a = |x|, b = |y| scaled by 2^-e (e = frexp exponent of the larger),
// the code forms p = sqrt((|z| + |x|) / 2) and q = |y| / (2p); the parity
// bit k keeps the exponent handed to the second square root even. For
// x >= 0 the result is (p, q), for x < 0 it is (q, p), and the imaginary
// part takes the sign of z.y.
CONSTATTR float2
MATH_MANGLE(csqrt)(float2 z)
{
    float a = BUILTIN_ABS_F32(z.x);
    float b = BUILTIN_ABS_F32(z.y);
    int e = BUILTIN_FREXP_EXP_F32(BUILTIN_MAX_F32(a, b));
    float as = BUILTIN_FLDEXP_F32(a, -e);
    float bs = BUILTIN_FLDEXP_F32(b, -e);
    float p = MATH_FAST_SQRT(MATH_MAD(as, as, bs*bs));
    int k = (e & 1) ^ 1;
    p = BUILTIN_FLDEXP_F32(p + as, k);
    p = BUILTIN_FLDEXP_F32(MATH_FAST_SQRT(p), (e >> 1) - k);
    float q = BUILTIN_FLDEXP_F32(MATH_DIV(b, p), -1);
    // Avoid keeping the result of a division by zero when p is zero.
    q = p == 0.0f ? p : q;

    bool l = z.x < 0.0f;
    float rr = l ? q : p;
    float ri = l ? p : q;

    if (!FINITE_ONLY_OPT()) {
        // Infinite |y| gives (inf, inf); an infinite x, tested afterwards,
        // overrides one component with |x|.
        bool i = BUILTIN_ISINF_F32(b);
        rr = i ? b : rr;
        ri = i ? b : ri;
        ri = BUILTIN_CLASS_F32(z.x, CLASS_NINF) ? a : ri;
        rr = BUILTIN_CLASS_F32(z.x, CLASS_PINF) ? a : rr;
    }

    return (float2)(rr, BUILTIN_COPYSIGN_F32(ri, z.y));
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/ctanD.cl000066400000000000000000000007441415221260100204270ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// Double-precision complex tangent, computed through ctanh:
// the argument (-z.y, z.x) is i*z, and the result (r.y, -r.x) is -i*r,
// i.e. ctan(z) = -i * ctanh(i*z).
CONSTATTR double2
MATH_MANGLE(ctan)(double2 z)
{
    double2 r = MATH_MANGLE(ctanh)((double2)(-z.y, z.x));
    return (double2)(r.y, -r.x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/ctanF.cl000066400000000000000000000007371415221260100204330ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathF.h"

// Single-precision complex tangent, computed through ctanh:
// the argument (-z.y, z.x) is i*z, and the result (r.y, -r.x) is -i*r,
// i.e. ctan(z) = -i * ctanh(i*z).
CONSTATTR float2
MATH_MANGLE(ctan)(float2 z)
{
    float2 r = MATH_MANGLE(ctanh)((float2)(-z.y, z.x));
    return (float2)(r.y, -r.x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/ctanhD.cl000066400000000000000000000033331415221260100205740ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 z);

// Double-precision complex hyperbolic tangent of z = (x, y).
//
// Below the |x| threshold the result is
//   real = cosh(x)*sinh(x) / d,  imag = cos(y)*sin(y) / d,
//   d    = cos(y)^2 + sinh(x)^2,
// with cosh/sinh built from an exponential of |x| minus a constant pair
// approximately equal to ln(2) (assuming epexpep is an extended-precision
// exp). At or above the threshold the real part is +/-1 and the imaginary
// part is 4*cos(y)*sin(y)*exp(-2|x|).
CONSTATTR double2
MATH_MANGLE(ctanh)(double2 z)
{
    double cy;
    double sy = MATH_MANGLE(sincos)(z.y, &cy);
    double cysy = cy*sy;
    double x = BUILTIN_ABS_F64(z.x);

    double rr, ri;
    if (x < 0x1.419ecb712c481p+4) {
        double2 e = MATH_PRIVATE(epexpep)(sub(x, con(0x1.62e42fefa39efp-1,0x1.abc9e3b39803fp-56)));
        double2 er = rcp(e);
        er = ldx(er, -2);
        double2 cx = fadd(e, er);
        double2 sx = fsub(e, er);

        double cxhi = cx.hi;
        // For tiny |x| use x itself in place of the computed sinh.
        double sxhi = x < 0x1.0p-27 ? x : sx.hi;

        double d = MATH_MAD(cy, cy, sxhi*sxhi);
        rr = BUILTIN_COPYSIGN_F64(MATH_DIV(cxhi*sxhi, d), z.x);
        ri = MATH_DIV(cysy, d);
    } else {
        rr = BUILTIN_COPYSIGN_F64(1.0, z.x);
        ri = 4.0 * cysy * MATH_MANGLE(exp)(-2.0 * x);
    }

    if (!FINITE_ONLY_OPT()) {
        bool xn = BUILTIN_ISNAN_F64(x);
        // yin: y is infinite or NaN.
        bool yin = BUILTIN_CLASS_F64(z.y, CLASS_NINF|CLASS_PINF|CLASS_QNAN|CLASS_SNAN);
        // ni: |x| is finite (zero, subnormal or normal) while y is not.
        bool ni = BUILTIN_CLASS_F64(x, CLASS_PZER|CLASS_PSUB|CLASS_PNOR) & yin;
        rr = (ni | xn) ? AS_DOUBLE(QNANBITPATT_DP64) : rr;
        ri = ni ? AS_DOUBLE(QNANBITPATT_DP64) : ri;
        // Infinite x with infinite/NaN y: imaginary part is zero.
        ri = (BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF) & yin) ? 0.0 : ri;
        // NaN x with zero y: imaginary part keeps the (signed) zero.
        ri = (xn & (z.y == 0.0)) ? z.y : ri;
    }

    return (double2)(rr, ri);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/ctanhF.cl000066400000000000000000000032741415221260100206020ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

#define FLOAT_SPECIALIZATION
#include "ep.h"

extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 z);

// Single-precision complex hyperbolic tangent of z = (x, y).
// Same construction as the double-precision version.
CONSTATTR float2
MATH_MANGLE(ctanh)(float2 z)
{
    float cy;
    float sy = MATH_MANGLE(sincos)(z.y, &cy);
    float cysy = cy*sy;
    float x = BUILTIN_ABS_F32(z.x);

    float rr, ri;
    if (x < 0x1.3687aap+3f) {
        // Both halves of the constant pair carry the 'f' suffix so no
        // double-precision literal appears in single-precision code.
        float2 e = MATH_PRIVATE(epexpep)(sub(x, con(0x1.62e430p-1f, -0x1.05c610p-29f)));
        float2 er = rcp(e);
        er = ldx(er, -2);
        float2 cx = fadd(e, er);
        float2 sx = fsub(e, er);

        float cxhi = cx.hi;
        // For tiny |x| use x itself in place of the computed sinh.
        float sxhi = x < 0x1.0p-12f ? x : sx.hi;

        float d = MATH_MAD(cy, cy, sxhi*sxhi);
        rr = BUILTIN_COPYSIGN_F32(MATH_DIV(cxhi*sxhi, d), z.x);
        ri = MATH_DIV(cysy, d);
    } else {
        rr = BUILTIN_COPYSIGN_F32(1.0f, z.x);
        ri = 4.0f * cysy * MATH_MANGLE(exp)(-2.0f * x);
    }

    if (!FINITE_ONLY_OPT()) {
        bool xn = BUILTIN_ISNAN_F32(x);
        // yin: y is infinite or NaN.
        bool yin = BUILTIN_CLASS_F32(z.y, CLASS_NINF|CLASS_PINF|CLASS_QNAN|CLASS_SNAN);
        // ni: |x| is finite (zero, subnormal or normal) while y is not.
        bool ni = BUILTIN_CLASS_F32(x, CLASS_PZER|CLASS_PSUB|CLASS_PNOR) & yin;
        rr = (ni | xn) ? AS_FLOAT(QNANBITPATT_SP32) : rr;
        ri = ni ? AS_FLOAT(QNANBITPATT_SP32) : ri;
        // Infinite x with infinite/NaN y: imaginary part is zero.
        ri = (BUILTIN_CLASS_F32(x, CLASS_PINF|CLASS_NINF) & yin) ? 0.0f : ri;
        // NaN x with zero y: imaginary part keeps the (signed) zero.
        ri = (xn & (z.y == 0.0f)) ? z.y : ri;
    }

    return (float2)(rr, ri);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/divD.cl000066400000000000000000000010621415221260100202560ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License.
See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// GEN(LN,UN) defines the two-argument double function MATH_MANGLE(LN) as a
// direct forward to BUILTIN_<UN>_F64(x, y).
// No instantiation is active in this file: every GEN(...) line below is
// commented out.
#define GEN(LN,UN) \
CONSTATTR double \
MATH_MANGLE(LN)(double x, double y) \
{ \
    return BUILTIN_##UN##_F64(x, y); \
}

// GEN(div_rte,DIV_RTE)
// GEN(div_rtn,DIV_RTN)
// GEN(div_rtp,DIV_RTP)
// GEN(div_rtz,DIV_RTZ)

ROCm-Device-Libs-rocm-5.0.0/ocml/src/divF.cl000066400000000000000000000010571415221260100202640ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// GEN(LN,UN) defines the two-argument float function MATH_MANGLE(LN) as a
// direct forward to BUILTIN_<UN>_F32(x, y).
// No instantiation is active in this file: every GEN(...) line below is
// commented out.
#define GEN(LN,UN) \
CONSTATTR float \
MATH_MANGLE(LN)(float x, float y) \
{ \
    return BUILTIN_##UN##_F32(x, y); \
}

// GEN(div_rte,DIV_RTE)
// GEN(div_rtn,DIV_RTN)
// GEN(div_rtp,DIV_RTP)
// GEN(div_rtz,DIV_RTZ)

ROCm-Device-Libs-rocm-5.0.0/ocml/src/divH.cl000066400000000000000000000010541415221260100202630ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathH.h"

// GEN(LN,UN) defines the two-argument half function MATH_MANGLE(LN) as a
// direct forward to BUILTIN_<UN>_F16(x, y).
// No instantiation is active in this file: every GEN(...) line below is
// commented out.
#define GEN(LN,UN) \
CONSTATTR half \
MATH_MANGLE(LN)(half x, half y) \
{ \
    return BUILTIN_##UN##_F16(x, y); \
}

// GEN(div_rte,DIV_RTE)
// GEN(div_rtn,DIV_RTN)
// GEN(div_rtp,DIV_RTP)
// GEN(div_rtz,DIV_RTZ)

ROCm-Device-Libs-rocm-5.0.0/ocml/src/ep.h000066400000000000000000000171001415221260100176250ustar00rootroot00000000000000
// Extended-precision helpers operating on (hi, lo) pairs stored in a
// two-element vector T2: the value represented is hi + lo.
//
// The including file selects the scalar type T by defining one of
// FLOAT_SPECIALIZATION, DOUBLE_SPECIALIZATION or HALF_SPECIALIZATION before
// the #include.  Every helper is `static` and `overloadable`, so the same
// name exists for several scalar/pair argument combinations.  All helper
// macros defined here are #undef'd again at the end of the header.
//
// Naming used below:
//   add/sub   - sum/difference with an error term valid for any operand order
//   fadd/fsub - cheaper variants whose error term is only exact when the
//               first operand dominates in magnitude (NOTE(review): callers
//               appear to be responsible for that ordering - confirm)
//   o*        - variants normalised with onrm(), which handles infinities

#define ATTR __attribute__((const, overloadable))

#if defined FLOAT_SPECIALIZATION
#define T float
#define T2 float2
#define FMA BUILTIN_FMA_F32
#define RCP MATH_FAST_RCP
#define DIV(X,Y) MATH_FAST_DIV(X,Y)
#define LDEXP BUILTIN_FLDEXP_F32
#define SQRT MATH_FAST_SQRT
#define ISINF(X) BUILTIN_ISINF_F32(X)
#define USE_FMA HAVE_FAST_FMA32()
#define HIGH(X) AS_FLOAT(AS_UINT(X) & 0xfffff000U)
#define SIGNBIT(X) (AS_INT(X) < 0)
#define SAMESIGN(X,Y) ((AS_INT(X)& 0x80000000) == (AS_INT(Y) & 0x80000000))
#endif

#if defined DOUBLE_SPECIALIZATION
#define T double
#define T2 double2
#define FMA BUILTIN_FMA_F64
#define RCP MATH_FAST_RCP
#define DIV(X,Y) MATH_FAST_DIV(X,Y)
#define LDEXP BUILTIN_FLDEXP_F64
#define SQRT MATH_FAST_SQRT
#define ISINF(X) BUILTIN_ISINF_F64(X)
#define USE_FMA true
#define HIGH(X) AS_DOUBLE(AS_ULONG(X) & 0xfffffffff8000000UL)
#define SIGNBIT(X) (AS_INT2(X).hi < 0)
#define SAMESIGN(X,Y) ((AS_INT2(X).hi & 0x80000000) == (AS_INT2(Y).hi & 0x80000000))
#endif

#if defined HALF_SPECIALIZATION
#define T half
#define T2 half2
#define FMA BUILTIN_FMA_F16
#define RCP MATH_FAST_RCP
#define DIV(X,Y) MATH_FAST_DIV(X,Y)
#define LDEXP BUILTIN_FLDEXP_F16
#define SQRT MATH_FAST_SQRT
#define ISINF(X) BUILTIN_ISINF_F16(X)
#define USE_FMA true
#define HIGH(X) AS_HALF(AS_USHORT(X) & (ushort)0xffc0U)
#define SIGNBIT(X) (AS_SHORT(X) < (short)0)
#define SAMESIGN(X,Y) ((AS_USHORT(X) & (ushort)0x8000) == (AS_USHORT(Y) & (ushort)0x8000))
#endif

// |a|: negates the pair when the sign bit of a.hi is set.
static ATTR T2
absv(T2 a)
{
    return SIGNBIT(a.hi) ? -a : a;
}

// Returns a if a.hi and b.hi have the same sign bit, otherwise -a.
static ATTR T2
csgn(T2 a, T2 b)
{
    return SAMESIGN(a.hi, b.hi) ? a : -a;
}

// Builds a pair from its parts: a becomes .hi and b becomes .lo.
static ATTR T2
con(T a, T b)
{
    return (T2)(b, a);
}

// a + b as a pair, three-operation form.
static ATTR T2
fadd(T a, T b)
{
    T s = a + b;
    return con(s, b - (s - a));
}

// Renormalises a pair by re-adding its two halves.
static ATTR T2
nrm(T2 a)
{
    return fadd(a.hi, a.lo);
}

// Renormalisation that tolerates infinities: an infinite a.hi is kept as the
// high part, and the low part is forced to zero whenever the high part is
// infinite.
static ATTR T2
onrm(T2 a)
{
    T s = a.hi + a.lo;
    T t = a.lo - (s - a.hi);
    s = ISINF(a.hi) ? a.hi : s;
    return con(s, ISINF(s) ? (T)0 : t);
}

// a - b as a pair, three-operation form.
static ATTR T2
fsub(T a, T b)
{
    T d = a - b;
    return con(d, (a - d) - b);
}

// a + b as a pair, six-operation form.
static ATTR T2
add(T a, T b)
{
    T s = a + b;
    T d = s - a;
    return con(s, (a - (s - d)) + (b - d));
}

// a - b as a pair, six-operation form.
static ATTR T2
sub(T a, T b)
{
    T d = a - b;
    T e = d - a;
    return con(d, (a - (d - e)) - (b + e));
}

// a * b as a pair.  With FMA the low part is fma(a, b, -p); otherwise both
// operands are split with HIGH() and the partial products are accumulated.
static ATTR T2
mul(T a, T b)
{
    T p = a * b;
    if (USE_FMA) {
        return con(p, FMA(a, b, -p));
    } else {
        T ah = HIGH(a);
        T al = a - ah;
        T bh = HIGH(b);
        T bl = b - bh;
        // p is the product computed above; it is not recomputed here.
        return con(p, ((ah*bh - p) + ah*bl + al*bh) + al*bl);
    }
}

// a * a as a pair; same two strategies as mul(T, T).
static ATTR T2
sqr(T a)
{
    T p = a * a;
    if (USE_FMA) {
        return con(p, FMA(a, a, -p));
    } else {
        T ah = HIGH(a);
        T al = a - ah;
        // (T)2 keeps the whole expression in type T for every specialization.
        return con(p, ((ah*ah - p) + (T)2*ah*al) + al*al);
    }
}

// pair + scalar.
static ATTR T2
add(T2 a, T b)
{
    T2 s = add(a.hi, b);
    s.lo += a.lo;
    return nrm(s);
}

// pair + scalar, fast form.
static ATTR T2
fadd(T2 a, T b)
{
    T2 s = fadd(a.hi, b);
    s.lo += a.lo;
    return nrm(s);
}

// scalar + pair.
static ATTR T2
add(T a, T2 b)
{
    T2 s = add(a, b.hi);
    s.lo += b.lo;
    return nrm(s);
}

// scalar + pair, fast form.
static ATTR T2
fadd(T a, T2 b)
{
    T2 s = fadd(a, b.hi);
    s.lo += b.lo;
    return nrm(s);
}

// pair + pair: high and low halves are each summed as pairs, then merged
// with two renormalisations.
static ATTR T2
add(T2 a, T2 b)
{
    T2 s = add(a.hi, b.hi);
    T2 t = add(a.lo, b.lo);
    s.lo += t.hi;
    s = nrm(s);
    s.lo += t.lo;
    return nrm(s);
}

// pair + pair, fast form: the low halves are simply added into the error term.
static ATTR T2
fadd(T2 a, T2 b)
{
    T2 s = fadd(a.hi, b.hi);
    s.lo += a.lo + b.lo;
    return nrm(s);
}

// pair - scalar.
static ATTR T2
sub(T2 a, T b)
{
    T2 d = sub(a.hi, b);
    d.lo += a.lo;
    return nrm(d);
}

// pair - scalar, fast form.
static ATTR T2
fsub(T2 a, T b)
{
    T2 d = fsub(a.hi, b);
    d.lo += a.lo;
    return nrm(d);
}

// scalar - pair.
static ATTR T2
sub(T a, T2 b)
{
    T2 d = sub(a, b.hi);
    d.lo -= b.lo;
    return nrm(d);
}

// scalar - pair, fast form.
static ATTR T2
fsub(T a, T2 b)
{
    T2 d = fsub(a, b.hi);
    d.lo -= b.lo;
    return nrm(d);
}

// pair - pair: high and low halves are each subtracted as pairs, then merged
// with two renormalisations.
static ATTR T2
sub(T2 a, T2 b)
{
    T2 d = sub(a.hi, b.hi);
    T2 e = sub(a.lo, b.lo);
    d.lo += e.hi;
    d = nrm(d);
    d.lo += e.lo;
    return nrm(d);
}

// pair - pair, fast form.
static ATTR T2
fsub(T2 a, T2 b)
{
    T2 d = fsub(a.hi, b.hi);
    d.lo = d.lo + a.lo - b.lo;
    return nrm(d);
}

// Scales both halves of the pair by 2^e.
static ATTR T2
ldx(T2 a, int e)
{
    return con(LDEXP(a.hi, e), LDEXP(a.lo, e));
}

// pair * scalar.
static ATTR T2
mul(T2 a, T b)
{
    T2 p = mul(a.hi, b);
    if (USE_FMA) {
        p.lo = FMA(a.lo, b, p.lo);
    } else {
        p.lo += a.lo * b;
    }
    return nrm(p);
}

// pair * scalar, normalised with onrm().
static ATTR T2
omul(T2 a, T b)
{
    T2 p = mul(a.hi, b);
    if (USE_FMA) {
        p.lo = FMA(a.lo, b, p.lo);
    } else {
        p.lo += a.lo * b;
    }
    return onrm(p);
}

// scalar * pair.
static ATTR T2
mul(T a, T2 b)
{
    T2 p = mul(a, b.hi);
    if (USE_FMA) {
        p.lo = FMA(a, b.lo, p.lo);
    } else {
        p.lo += a * b.lo;
    }
    return nrm(p);
}

// scalar * pair, normalised with onrm().
static ATTR T2
omul(T a, T2 b)
{
    T2 p = mul(a, b.hi);
    if (USE_FMA) {
        p.lo = FMA(a, b.lo, p.lo);
    } else {
        p.lo += a * b.lo;
    }
    return onrm(p);
}

// pair * pair: exact product of the high halves plus the two cross terms
// (the lo*lo term is dropped).
static ATTR T2
mul(T2 a, T2 b)
{
    T2 p = mul(a.hi, b.hi);
    if (USE_FMA) {
        p.lo = FMA(a.lo, b.hi, FMA(a.hi, b.lo, p.lo));
    } else {
        p.lo += a.hi*b.lo + a.lo*b.hi;
    }
    return nrm(p);
}

// pair * pair, normalised with onrm().
static ATTR T2
omul(T2 a, T2 b)
{
    T2 p = mul(a.hi, b.hi);
    if (USE_FMA) {
        p.lo += FMA(a.hi, b.lo, a.lo*b.hi);
    } else {
        p.lo += a.hi*b.lo + a.lo*b.hi;
    }
    return onrm(p);
}

// scalar / scalar as a pair: first quotient from RCP(b), then one correction
// step computed from the residual a - qhi*b.
static ATTR T2
div(T a, T b)
{
    T r = RCP(b);
    T qhi = a * r;
    T2 p = mul(qhi, b);
    T2 d = fsub(a, p.hi);
    d.lo -= p.lo;
    T qlo = (d.hi + d.lo) * r;
    return fadd(qhi, qlo);
}

// pair / scalar, one correction step.
static ATTR T2
div(T2 a, T b)
{
    T r = RCP(b);
    T qhi = a.hi * r;
    T2 p = mul(qhi, b);
    T2 d = fsub(a.hi, p.hi);
    d.lo = d.lo + a.lo - p.lo;
    T qlo = (d.hi + d.lo) * r;
    return fadd(qhi, qlo);
}

// scalar / pair, one correction step.
static ATTR T2
div(T a, T2 b)
{
    T r = RCP(b.hi);
    T qhi = a * r;
    T2 p = mul(qhi, b);
    T2 d = fsub(a, p.hi);
    d.lo -= p.lo;
    T qlo = (d.hi + d.lo) * r;
    return fadd(qhi, qlo);
}

// pair / pair, fast form: a single correction step.
static ATTR T2
fdiv(T2 a, T2 b)
{
    T r = RCP(b.hi);
    T qhi = a.hi * r;
    T2 p = mul(qhi, b);
    T2 d = fsub(a.hi, p.hi);
    d.lo = d.lo - p.lo + a.lo;
    T qlo = (d.hi + d.lo) * r;
    return fadd(qhi, qlo);
}

// pair / pair: three quotient digits (qhi, qmi, qlo), each taken from the
// running residual.
static ATTR T2
div(T2 a, T2 b)
{
    T y = RCP(b.hi);
    T qhi = a.hi * y;
    T2 r = fsub(a, mul(qhi, b));
    T qmi = r.hi * y;
    r = fsub(r, mul(qmi, b));
    T qlo = r.hi * y;
    T2 q = fadd(qhi, qmi);
    q.lo += qlo;
    return nrm(q);
}

// 1 / scalar as a pair, one correction step.
static ATTR T2
rcp(T b)
{
    T qhi = RCP(b);
    T2 p = mul(qhi, b);
    T2 d = fsub((T)1, p.hi);
    d.lo -= p.lo;
    T qlo = (d.hi + d.lo) * qhi;
    return fadd(qhi, qlo);
}

// 1 / pair, fast form: a single correction step.
static ATTR T2
frcp(T2 b)
{
    T qhi = RCP(b.hi);
    T2 p = mul(qhi, b);
    T2 d = fsub((T)1, p.hi);
    d.lo -= p.lo;
    T qlo = (d.hi + d.lo) * qhi;
    return fadd(qhi, qlo);
}

// 1 / pair: three quotient digits, each taken from the running residual.
static ATTR T2
rcp(T2 b)
{
    T qhi = RCP(b.hi);
    T2 r = fsub((T)1, mul(qhi, b));
    T qmi = r.hi * qhi;
    r = fsub(r, mul(qmi, b));
    T qlo = r.hi * qhi;
    T2 q = fadd(qhi, qmi);
    q.lo += qlo;
    return nrm(q);
}

// pair squared: exact square of the high half plus 2*hi*lo.
static ATTR T2
sqr(T2 a)
{
    T2 p = sqr(a.hi);
    if (USE_FMA) {
        p.lo = FMA(a.hi, (T)2 * a.lo, p.lo);
    } else {
        p.lo = p.lo + (T)2 * a.lo * a.hi;
    }
    return nrm(p);
}

// Square root of a scalar as a pair: shi = SQRT(a) corrected by
// (a - shi^2) / (2*shi).  The correction is forced to 0 when a == 0, where
// the division would be 0/0.
static ATTR T2
root2(T a)
{
    T shi = SQRT(a);
    T2 e = fsub(a, sqr(shi));
    T slo = DIV(e.hi, (T)2 * shi);
    return fadd(shi, a == (T)0 ? (T)0 : slo);
}

// Square root of a pair; same scheme as root2(T), seeded from a.hi.
static ATTR T2
root2(T2 a)
{
    T shi = SQRT(a.hi);
    T2 e = fsub(a, sqr(shi));
    T slo = DIV(e.hi, (T)2 * shi);
    return fadd(shi, a.hi == (T)0 ? (T)0 : slo);
}

// Do not leak the helper macros into the including translation unit.
// (COPYSIGN is not defined by this header; its #undef is kept as-is.)
#undef ATTR
#undef T
#undef T2
#undef FMA
#undef RCP
#undef DIV
#undef LDEXP
#undef SQRT
#undef ISINF
#undef USE_FMA
#undef HIGH
#undef COPYSIGN
#undef SIGNBIT
#undef SAMESIGN

ROCm-Device-Libs-rocm-5.0.0/ocml/src/epcsqrtepD.cl000066400000000000000000000014001415221260100214760ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

// Extended-precision complex square root.
// z packs two hi/lo pairs: z.lo is the first operand and z.hi the second
// (used below as the real and imaginary parts).  The result packs the two
// output pairs the same way.
CONSTATTR double4
MATH_PRIVATE(epcsqrtep)(double4 z)
{
    double2 re = z.lo;
    double2 im = z.hi;

    // u = sqrt((sqrt(re^2 + im^2) + |re|) / 2)
    double2 modulus = root2(add(sqr(re), sqr(im)));
    double2 u = root2(fadd(modulus, absv(re)) * 0.5);

    // v = |im / (2u)|, except that a zero im with a zero u yields im itself.
    double2 v = absv(fdiv(im, u) * 0.5);
    if ((im.hi == 0.0) & (u.hi == 0.0)) {
        v = im;
    }

    // Non-negative re: (u, v).  Negative re: (v, u).  The second output
    // always takes the sign of im.
    double2 s;
    double2 t;
    if (re.hi >= 0.0) {
        s = u;
        t = csgn(v, im);
    } else {
        s = v;
        t = csgn(u, im);
    }

    return (double4)(s, t);
}

ROCm-Device-Libs-rocm-5.0.0/ocml/src/epcsqrtepF.cl000066400000000000000000000013731415221260100215110ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

#define FLOAT_SPECIALIZATION
#include "ep.h"

// Extended-precision complex square root, single-precision pairs.
// z.lo and z.hi are the two input hi/lo pairs (used below as the real and
// imaginary parts); the result packs the two output pairs the same way.
CONSTATTR float4
MATH_PRIVATE(epcsqrtep)(float4 z)
{
    float2 re = z.lo;
    float2 im = z.hi;

    // u = sqrt((sqrt(re^2 + im^2) + |re|) / 2)
    float2 modulus = root2(add(sqr(re), sqr(im)));
    float2 u = root2(fadd(modulus, absv(re)) * 0.5f);

    // v = |im / (2u)|, except that a zero im with a zero u yields im itself.
    float2 v = absv(fdiv(im, u) * 0.5f);
    if ((im.hi == 0.0f) & (u.hi == 0.0f)) {
        v = im;
    }

    // Non-negative re: (u, v).  Negative re: (v, u).  The second output
    // always takes the sign of im.
    float2 s;
    float2 t;
    if (re.hi >= 0.0f) {
        s = u;
        t = csgn(v, im);
    } else {
        s = v;
        t = csgn(u, im);
    }

    return (float4)(s, t);
}

ROCm-Device-Libs-rocm-5.0.0/ocml/src/epexpepD.cl000066400000000000000000000022641415221260100211470ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

// exp() of a hi/lo pair, returned as a hi/lo pair.
//
// Steps, as coded below:
//   dn = rint(x.hi * 0x1.71547652b82fep+0)   (the constant is ~1.4427 = 1/ln 2)
//   t  = x - dn*ln2, with ln 2 applied as three separate pieces so the
//        subtraction can be carried in pair arithmetic
//   p  = polynomial in t.hi (Horner form)
//   r  = 1 + (t + t^2 * p)
//   result = r scaled by 2^dn
CONSTATTR double2
MATH_PRIVATE(epexpep)(double2 x)
{
    double dn = BUILTIN_RINT_F64(x.hi * 0x1.71547652b82fep+0);
    // Leading piece of dn*ln2 is removed with a MAD, the two trailing pieces
    // with pair subtractions.
    double2 t = fsub(fsub(fadd(MATH_MAD(dn, -0x1.62e42fefa3000p-1, x.hi), x.lo), dn*0x1.3de6af278e000p-42), dn*0x1.9cc01f97b57a0p-83);

    double th = t.hi;
    double p = MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th,
               MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th,
               MATH_MAD(th,
                   0x1.ade156a5dcb37p-26, 0x1.28af3fca7ab0cp-22), 0x1.71dee623fde64p-19), 0x1.a01997c89e6b0p-16),
                   0x1.a01a014761f6ep-13), 0x1.6c16c1852b7b0p-10), 0x1.1111111122322p-7), 0x1.55555555502a1p-5),
                   0x1.5555555555511p-3), 0x1.000000000000bp-1);

    double2 r = fadd(1.0, fadd(t, mul(sqr(t), p)));

    // Apply the 2^dn scale to both halves of the pair.
    return ldx(r, (int)dn);
}

ROCm-Device-Libs-rocm-5.0.0/ocml/src/epexpepF.cl000066400000000000000000000016111415221260100211440ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathF.h"

#define FLOAT_SPECIALIZATION
#include "ep.h"

// exp() of a hi/lo pair, returned as a hi/lo pair (single precision).
//
// Steps, as coded below:
//   fn = rint(x.hi * 0x1.715476p+0f)   (the constant is ~1.4427 = 1/ln 2)
//   t  = x - fn*ln2, with ln 2 applied as three separate pieces
//   p  = polynomial in t.hi (Horner form)
//   r  = 1 + (t + t^2 * p)
//   result = r scaled by 2^fn
CONSTATTR float2
MATH_PRIVATE(epexpep)(float2 x)
{
    float fn = BUILTIN_RINT_F32(x.hi * 0x1.715476p+0f);
    // Leading piece of fn*ln2 is removed with a MAD, the two trailing pieces
    // with pair subtractions.
    float2 t = fsub(fsub(fadd(MATH_MAD(fn, -0x1.62e400p-1f, x.hi), x.lo), fn*0x1.7f7800p-20f), fn*0x1.473de6p-34f);

    float th = t.hi;
    float p = MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th,
                  0x1.6850e4p-10f, 0x1.123bccp-7f), 0x1.555b98p-5f), 0x1.55548ep-3f),
                  0x1.fffff8p-2f);

    float2 r = fadd(1.0f, fadd(t, mul(sqr(t), p)));

    // Apply the 2^fn scale to both halves of the pair.
    return ldx(r, (int)fn);
}

ROCm-Device-Libs-rocm-5.0.0/ocml/src/eplnD.cl000066400000000000000000000026121415221260100204340ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

// Natural logarithm of a scalar, returned as a hi/lo pair.
//
// a is split into mantissa m and exponent e; when the mantissa is below 2/3
// it is doubled and the exponent reduced by one.  With x = (m-1)/(m+1)
// computed as a pair and s = x^2, the result is assembled according to the
// formula in the comment inside the body.
CONSTATTR double2
MATH_PRIVATE(epln)(double a)
{
    double m = BUILTIN_FREXP_MANT_F64(a);
    // b is 1 when the mantissa is below 2/3, else 0.
    int b = m < (2.0/3.0);
    m = BUILTIN_FLDEXP_F64(m, b);
    int e = BUILTIN_FREXP_EXP_F64(a) - b;

    double2 x = div(m - 1.0, fadd(1.0, m));
    double2 s = sqr(x);
    double t = s.hi;
    double p = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
               MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                   0x1.dee674222de17p-4, 0x1.a6564968915a9p-4), 0x1.e25e43abe935ap-4), 0x1.110ef47e6c9c2p-3),
                   0x1.3b13bcfa74449p-3), 0x1.745d171bf3c30p-3), 0x1.c71c71c7792cep-3), 0x1.24924924920dap-2),
                   0x1.999999999999cp-2);

    // ln(2)*e + 2*x + x^3(c3 + x^2*p)
    double2 r = add(mul(con(0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56), (double)e),
                    fadd(ldx(x,1),
                         mul(mul(s, x),
                             fadd(con(0x1.5555555555555p-1,0x1.543b0d5df274dp-55),
                                  mul(s, p)))));

    return r;
}

ROCm-Device-Libs-rocm-5.0.0/ocml/src/eplnF.cl000066400000000000000000000020701415221260100204340ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

#define FLOAT_SPECIALIZATION
#include "ep.h"

// Natural logarithm of a scalar, returned as a hi/lo pair (single precision).
//
// a is split into mantissa m and exponent e; when the mantissa is below 2/3
// it is doubled and the exponent reduced by one.  With x = (m-1)/(m+1)
// computed as a pair and s = x^2, the result is assembled according to the
// formula in the comment inside the body.
CONSTATTR float2
MATH_PRIVATE(epln)(float a)
{
    float m = BUILTIN_FREXP_MANT_F32(a);
    // b is 1 when the mantissa is below 2/3, else 0.
    int b = m < (2.0f/3.0f);
    m = BUILTIN_FLDEXP_F32(m, b);
    int e = BUILTIN_FREXP_EXP_F32(a) - b;

    float2 x = div(m - 1.0f, fadd(1.0f, m));
    float2 s = sqr(x);
    float t = s.hi;
    float p = MATH_MAD(t, MATH_MAD(t, 0x1.ed89c2p-3f, 0x1.23e988p-2f), 0x1.999bdep-2f);

    // ln(2)*e + 2*x + x^3(c3 + x^2*p)
    float2 r = add(mul(con(0x1.62e430p-1f, -0x1.05c610p-29f), (float)e),
                   fadd(ldx(x,1),
                        mul(mul(s, x),
                            fadd(con(0x1.555554p-1f,0x1.e72020p-29f),
                                 mul(s, p)))));

    return r;
}

ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfD.cl000066400000000000000000000047471415221260100202650ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathD.h"

// Error function, double precision.
//
// Works on ax = |x| and restores the sign of x at the end.
//   ax < 1 : erf = ax + ax * P(ax^2)
//   else   : erf = 1 - exp(-(ax + ax * Q(ax)))
// P and Q are the Horner-form polynomials below.
CONSTATTR double
MATH_MANGLE(erf)(double x)
{
    double ax = BUILTIN_ABS_F64(x);
    double ret;

    if (ax < 1.0) {
        double t = ax * ax;
        double p = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                   MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                   MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                       -0x1.ab15c51d2ebebp-31, 0x1.d6e3ddfeb1f49p-27), -0x1.5bfe76384472p-23), 0x1.b97e44280cfb9p-20),
                       -0x1.f4ca204c771c5p-17), 0x1.f9a2b75531772p-14), -0x1.c02db0149d904p-11), 0x1.565bccf7e2856p-8),
                       -0x1.b82ce311ee09bp-6), 0x1.ce2f21a0408d1p-4), -0x1.812746b0379b2p-2), 0x1.06eba8214db68p-3);
        ret = MATH_MAD(ax, p, ax);
    } else {
        double p = MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax,
                   MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax,
                   MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax,
                   MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax,
                   MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax,
                   MATH_MAD(ax, MATH_MAD(ax,
                       0x1.98d37c14b24bep-58, -0x1.145a3502a41cdp-51), 0x1.62deed735f9ecp-46), -0x1.1ffe55552ca22p-41),
                       0x1.4b9ba7074b644p-37), -0x1.20345a78ce24p-33), 0x1.88b7a0cefddd8p-30), -0x1.aded48c94b617p-27),
                       0x1.803aa312306dp-24), -0x1.1b0106f4c5a9bp-21), 0x1.58c0e7cfd79aep-19), -0x1.59e386410fdf7p-17),
                       0x1.192fc1f9b1786p-15), -0x1.62cf3f4634b2ep-14), 0x1.314dfb42f7e4bp-13), -0x1.2cb68c047288ap-14),
                       -0x1.038ff7bbcce25p-11), 0x1.a9466ae1babaep-10), -0x1.58be1e65a6063p-13), -0x1.39bc16738ee3ap-6),
                       0x1.a4fbc28146b69p-4), 0x1.45f2da69750c4p-1), 0x1.06ebb919fcca8p-3);
        p = MATH_MAD(ax, p, ax);
        ret = 1.0 - MATH_MANGLE(exp)(-p);
    }

    // erf is odd: give the result the sign of the input.
    ret = BUILTIN_COPYSIGN_F64(ret, x);
    return ret;
}

ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfF.cl000066400000000000000000000022701415221260100202540ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathF.h"

// Error function, single precision.
//
// Works on ax = |x| and restores the sign of x at the end.
//   ax < 1 : erf = ax + ax * P(ax^2)
//   else   : erf = 1 - exp(-(ax + ax * Q(ax)))
// P and Q are the Horner-form polynomials below.  The final multiply-add of
// each branch uses BUILTIN_FMA_F32 explicitly rather than MATH_MAD.
CONSTATTR float
MATH_MANGLE(erf)(float x)
{
    float ax = BUILTIN_ABS_F32(x);
    float ret;

    if (ax < 1.0f) {
        float t = ax*ax;
        float p = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      -0x1.268bc2p-11f, 0x1.420828p-8f), -0x1.b5937p-6f),
                      0x1.ce077cp-4f), -0x1.81266p-2f), 0x1.06eba0p-3f);
        ret = BUILTIN_FMA_F32(ax, p, ax);
    } else {
        float p = MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax,
                  MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax,
                      0x1.1d3156p-16f, -0x1.8d129p-12f), 0x1.f9a6d2p-9f),
                      -0x1.8c3164p-6f), 0x1.b4e9c8p-4f), 0x1.4515fap-1f),
                      0x1.078e50p-3f);
        p = BUILTIN_FMA_F32(ax, p, ax);
        ret = 1.0f - MATH_MANGLE(exp)(-p);
    }

    // erf is odd: give the result the sign of the input.
    ret = BUILTIN_COPYSIGN_F32(ret, x);
    return ret;
}

ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfH.cl000066400000000000000000000007061415221260100202600ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

// UGEN is a macro from the project headers; NOTE(review): it presumably
// generates the vector (half2) overload of erf - confirm in mathH.h.
CONSTATTR UGEN(erf)

// Error function, half precision: the argument is widened to float, the
// MATH_UPMANGLE(erf) implementation is called, and the result is narrowed
// back to half.
CONSTATTR half
MATH_MANGLE(erf)(half x)
{
    return (half)MATH_UPMANGLE(erf)((float)x);
}

ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfcD.cl000066400000000000000000000320471415221260100204220ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathD.h"

#if !defined EXTRA_ACCURACY

CONSTATTR extern double MATH_PRIVATE(erfcx)(double);

// Complementary error function, double precision (default build).
//
// Uses erfc(|x|) = exp(-x^2) * erfcx(|x|).  -x*x is formed as a rounded
// product x2h plus its rounding error x2l, and exp(x2h) is corrected by the
// factor (1 + x2l).  The result is forced to 0 above the cut-off
// 0x1.b39dc41e48bfcp+4, and negative arguments use erfc(-x) = 2 - erfc(x).
CONSTATTR double
MATH_MANGLE(erfc)(double x)
{
    double ax = BUILTIN_ABS_F64(x);
    double x2h = -x*x;
    double x2l = MATH_MAD(-x, x, -x2h);
    double e = MATH_MANGLE(exp)(x2h);
    e = MATH_MAD(e, x2l, e);
    double ret = e * MATH_PRIVATE(erfcx)(ax);
    // Double literal: this is the double-precision routine.
    ret = ax > 0x1.b39dc41e48bfcp+4 ? 0.0 : ret;
    double nret = 2.0 - ret;
    return x < 0.0 ? nret : ret;
}

#else

// Partially based on ideas from the Sun implementation

/*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

/* double erf(double x)
 * double erfc(double x)
 *                           x
 *                    2      |\
 *     erf(x)  =  ---------  | exp(-t*t)dt
 *                 sqrt(pi) \|
 *                           0
 *
 *     erfc(x) =  1-erf(x)
 *  Note that
 *              erf(-x) = -erf(x)
 *              erfc(-x) = 2 - erfc(x)
 *
 * Method:
 *      1. For |x| in [0, 0.84375]
 *          erf(x)  = x + x*R(x^2)
 *          erfc(x) = 1 - erf(x)           if x in [-.84375,0.25]
 *                  = 0.5 + ((0.5-x)-x*R)  if x in [0.25,0.84375]
 *         where R = P/Q where P is an odd poly of degree 8 and
 *         Q is an odd poly of degree 10.
 *                                               -57.90
 *                      | R - (erf(x)-x)/x | <= 2
 *
 *
 *         Remark. The formula is derived by noting
 *          erf(x) = (2/sqrt(pi))*(x - x^3/3 + x^5/10 - x^7/42 + ....)
 *         and that
 *          2/sqrt(pi) = 1.128379167095512573896158903121545171688
 *         is close to one. The interval is chosen because the fix
 *         point of erf(x) is near 0.6174 (i.e., erf(x)=x when x is
 *         near 0.6174), and by some experiment, 0.84375 is chosen to
 *         guarantee the error is less than one ulp for erf.
 *
 *      2. For |x| in [0.84375,1.25], let s = |x| - 1, and
 *         c = 0.84506291151 rounded to single (24 bits)
 *              erf(x)  = sign(x) * (c  +  P1(s)/Q1(s))
 *              erfc(x) = (1-c)  -  P1(s)/Q1(s) if x > 0
 *                        1+(c+P1(s)/Q1(s))    if x < 0
 *              |P1/Q1 - (erf(|x|)-c)| <= 2**-59.06
 *         Remark: here we use the taylor series expansion at x=1.
 *              erf(1+s) = erf(1) + s*Poly(s)
 *                       = 0.845.. + P1(s)/Q1(s)
 *         That is, we use rational approximation to approximate
 *                      erf(1+s) - (c = (single)0.84506291151)
 *         Note that |P1/Q1|< 0.078 for x in [0.84375,1.25]
 *         where
 *              P1(s) = degree 6 poly in s
 *              Q1(s) = degree 6 poly in s
 *
 *      3. For x in [1.25,1/0.35(~2.857143)],
 *              erfc(x) = (1/x)*exp(-x*x-0.5625+R1/S1)
 *              erf(x)  = 1 - erfc(x)
 *         where
 *              R1(z) = degree 7 poly in z, (z=1/x^2)
 *              S1(z) = degree 8 poly in z
 *
 *      4. For x in [1/0.35,28]
 *              erfc(x) = (1/x)*exp(-x*x-0.5625+R2/S2) if x > 0
 *                      = 2.0 - (1/x)*exp(-x*x-0.5625+R2/S2) if -6<x<0
 *                      = 2.0 - tiny            (if x <= -6)
 *              erf(x)  = sign(x)*(1.0 - erfc(x)) if x < 6, else
 *              erf(x)  = sign(x)*(1.0 - tiny)
 *         where
 *              R2(z) = degree 6 poly in z, (z=1/x^2)
 *              S2(z) = degree 7 poly in z
 *
 *      Note1:
 *         To compute exp(-x*x-0.5625+R/S), let s be a single
 *         precision number and s := x; then
 *              -x*x = -s*s + (s-x)*(s+x)
 *              exp(-x*x-0.5626+R/S) =
 *                      exp(-s*s-0.5625)*exp((s-x)*(s+x)+R/S);
 *      Note2:
 *         Here 4 and 5 make use of the asymptotic series
 *                        exp(-x*x)
 *              erfc(x) ~ ---------- * ( 1 + Poly(1/x^2) )
 *                        x*sqrt(pi)
 *         We use rational approximation to approximate
 *              g(s)=f(1/x^2) = log(erfc(x)*x) - x*x + 0.5625
 *         Here is the error bound for R1/S1 and R2/S2
 *              |R1/S1 - f(x)|  < 2**(-62.57)
 *              |R2/S2 - f(x)|  < 2**(-61.52)
 *
 *      5. For inf > x >= 28
 *              erf(x)  = sign(x) *(1 - tiny)  (raise inexact)
 *              erfc(x) = tiny*tiny (raise underflow) if x > 0
 *                      = 2 - tiny if x<0
 *
 *      7. Special case:
 *              erf(0)  = 0, erf(inf)  = 1, erf(-inf) = -1,
 *              erfc(0) = 1, erfc(inf) = 0, erfc(-inf) = 2,
 *              erfc/erf(NaN) is NaN
 */

// Complementary error function, double precision (EXTRA_ACCURACY build).
//
// The argument range is cut into intervals, each with its own Horner-form
// polynomial:
//   x <  0x1.e861fbb24c00ap-2 :
//       x > -1      : 1 - x * P(x^2)
//       x > -1.75   : P(-x - 1)
//       x > -2.5    : P(-x - 1.75)
//       x > -4      : 2 - P(-x - 2.5)^16
//       x > -5.9375 : 2 - P(-x - 4)^16
//       otherwise   : 2
//   x >= 0x1.e861fbb24c00ap-2 :
//       x < 1, 1.5, 1.75 : P(x - 0.75), P(x - 1.25), P(x - 1.625)
//       x < 27.21875     : exp(-x*x - 0.5625 + P(1/x^2)) / x, with x*x
//                          split through xh (x with its low 32 bits cleared)
//       otherwise        : 0, or x itself when x is NaN
CONSTATTR double
MATH_MANGLE(erfc)(double x)
{
    double ret;

    if (x < 0x1.e861fbb24c00ap-2) {
        if (x > -1.0) {
            double t = x * x;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      -0x1.abae491c443a9p-31, 0x1.d71b0f1b10a64p-27), -0x1.5c0726f04dcfbp-23), 0x1.b97fd3d992938p-20),
                      -0x1.f4ca4d6f3e30fp-17), 0x1.f9a2baa8fedd2p-14), -0x1.c02db03dd71d4p-11), 0x1.565bccf92b2f9p-8),
                      -0x1.b82ce311fa93ep-6), 0x1.ce2f21a040d16p-4), -0x1.812746b0379bdp-2), 0x1.20dd750429b6dp+0);
            ret = MATH_MAD(-x, ret, 1.0);
        } else if (x > -1.75) {
            double t = -x - 1.0;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t,
                      -0x1.6c922ed03eb9dp-17, 0x1.97d42571bbb38p-14), -0x1.41761e0138c87p-12), 0x1.7f635425509dep-13),
                      0x1.30fe6b148c32fp-10), -0x1.e682366d34981p-10), -0x1.39b7dcc1aeec8p-8), 0x1.f0ab5db978c52p-7),
                      0x1.2e3e92d3304b4p-8), -0x1.1b613d8e18405p-4), 0x1.1b614a01845b4p-4), 0x1.1b614b15ab5c1p-3),
                      -0x1.a911f0970fc8dp-2), 0x1.a911f096fbf43p-2), 0x1.d7bb3d3a08445p+0);
        } else if (x > -2.5) {
            double t = -x - 1.75;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t,
                      0x1.1f145e2e90ae8p-18, -0x1.04595429d0b58p-15), 0x1.566284cadc629p-14), -0x1.daefe4f2fa8e2p-17),
                      -0x1.cbee5eda62503p-12), 0x1.d416c2aa2275ap-11), 0x1.7eeb86b197684p-11), -0x1.8d11b66138741p-8),
                      0x1.25b37e361d1c9p-7), 0x1.b22258f45515dp-8), -0x1.8a0da54b7e9dep-5), 0x1.7148c3d5d2293p-4),
                      -0x1.7a4a8a2bdfeb2p-4), 0x1.b05530322115bp-5), 0x1.fc9683bfc6ab7p+0);
        } else if (x > -4.0) {
            double t = -x - 2.5;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      -0x1.708f6d0e65c33p-32, 0x1.dbd0618847c60p-28), -0x1.c3001cf83cd69p-26), -0x1.4dca746dfe625p-22),
                      0x1.a8e79a95d6f67p-20), 0x1.8d8d7711fc864p-16), -0x1.99fe2d9d9b69bp-13), -0x1.b3b1f1e28669cp-12),
                      0x1.01d3d83753fb1p-7), -0x1.e842cf8341e6ap-10), -0x1.a49bb4ab1d7d9p-3), 0x1.3a50e1b16e339p-1);
            // Three squarings give ret^8; the MAD below squares once more: 2 - ret^16.
            ret = ret*ret;
            ret = ret*ret;
            ret = ret*ret;
            ret = MATH_MAD(-ret, ret, 2.0);
        } else if (x > -5.9375) {
            double t = -x - 4.0;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t,
                      0x1.5b22d2cd54932p-26, -0x1.3e056a1040a29p-24), -0x1.2d8f6bf8af04ap-19), 0x1.4c20d337a4541p-16),
                      0x1.d9d0971c8f96dp-16), -0x1.0a33e01adb0ddp-10), 0x1.63716fb40eab9p-9), 0x1.7d6f6bbcfc7e0p-6),
                      -0x1.5687476feec74p-3), 0x1.4cb2bacd30820p-2);
            // Three squarings give ret^8; the MAD below squares once more: 2 - ret^16.
            ret = ret*ret;
            ret = ret*ret;
            ret = ret*ret;
            ret = MATH_MAD(-ret, ret, 2.0);
        } else {
            ret = 2.0;
        }
    } else {
        if (x < 1.0) {
            double t = x - 0.75;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t,
                      -0x1.57d59f658aba7p-16, 0x1.362e0b222318ep-14), 0x1.bc4dcd34fdd6dp-14), -0x1.470d403e0efe6p-11),
                      -0x1.86196ce26e31fp-13), 0x1.0410341ee1473p-8), -0x1.2db338db4ad88p-9), -0x1.2e0afac283b7fp-6),
                      0x1.b847796a479d8p-6), 0x1.b42a1890465d3p-5), -0x1.349b5eaa155b6p-3), -0x1.b6e8591f65270p-6),
                      0x1.edc5644353c2dp-2), -0x1.492e42d78d2c5p-1), 0x1.27c6d14c5e341p-2);
        } else if (x < 1.5) {
            double t = x - 1.25;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      0x1.9c25dae26e5a8p-18, 0x1.692456873fac4p-19), -0x1.d3ef7e77785bap-15), 0x1.baaa993d5590fp-15),
                      0x1.53b075bbc5b61p-12), -0x1.a00787b6af397p-11), -0x1.cc224fab0d8a4p-11), 0x1.75672d1e80999p-8),
                      -0x1.db43c97b37ceap-9), -0x1.5d0003afa1e92p-6), 0x1.8281ce0b36c0dp-5), 0x1.93a9a7bb80513p-8),
                      -0x1.571d01c5c56c8p-3), 0x1.2ebf3dcc9f22fp-2), -0x1.e4652fadcb6b2p-3), 0x1.3bcd133aa0ffcp-4);
        } else if (x < 1.75) {
            double t = x - 1.625;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      0x1.02ad00dd8cbb4p-13, 0x1.70ffb4c1c5cbfp-12), -0x1.71c6788c68de8p-10), 0x1.2e4d6f91e46c7p-11),
                      0x1.954aa9df71457p-8), -0x1.d857f3fbcac79p-7), 0x1.17d430d63aaf5p-9), 0x1.974c0368aecfcp-5),
                      -0x1.d6631e1a2977fp-4), 0x1.0bcfca219477bp-3), -0x1.499d478bca733p-4), 0x1.612d893085125p-6);
        } else if (x < 27.21875) {
            double t = MATH_RCP(x*x);
            if (x < 2.75)
                ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                          0x1.ee796b0cccbebp+11, -0x1.f287322c462d4p+13), 0x1.d9e0700d3d82dp+14), -0x1.1a96768b6b29fp+15),
                          0x1.dafa2508a60dcp+14), -0x1.2bbd8e3460b89p+14), 0x1.27fd8cab24e6ep+13), -0x1.d7a7a4e4c3b93p+11),
                          0x1.37a4a4d018456p+10), -0x1.60173b9f73257p+8), 0x1.6253e7ca4b16fp+6), -0x1.51d02c514c31cp+4),
                          0x1.4e9a1546b2716p+2), -0x1.86ed776e3a5e5p+0), 0x1.3fb9e1ef8c40ap-1), -0x1.fffcb9ff22596p-2),
                          -0x1.43424dfcdbdcep-7);
            else
                ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                          0x1.bba05f5648454p+38, -0x1.401ff919f9865p+39), 0x1.b23350c3b39a1p+38), -0x1.70d6cf6eca08ep+37),
                          0x1.b9e665656eee6p+35), -0x1.8f73b118a9b93p+33), 0x1.1da829fcea796p+31), -0x1.5090992846e0ep+28),
                          0x1.548adac0440f5p+25), -0x1.3694e9079941ep+22), 0x1.0e5ce4af6bb84p+19), -0x1.dda4fee0ea545p+15),
                          0x1.c3f3a46f6fac8p+12), -0x1.dc5f4d89f0ae7p+9), 0x1.1f825da9dcbacp+7), -0x1.98193f7900492p+4),
                          0x1.60fffd6b1743dp+2), -0x1.8aaaaa9e2e8dep+0), 0x1.3fffffffedba9p-1), -0x1.fffffffffff1fp-2),
                          -0x1.4341239e86f47p-7);

            // xh is x with its low 32 bits cleared, so that
            // -x*x = -xh*xh + (x - xh)*(-(x + xh)); the two exponentials
            // are evaluated separately and multiplied.
            double xh = AS_DOUBLE(AS_LONG(x) & 0xffffffff00000000L);
            ret = MATH_DIV(MATH_MANGLE(exp)(MATH_MAD(x - xh, -(x + xh), ret)), x) *
                  MATH_MANGLE(exp)(MATH_MAD(xh, -xh, -0.5625));
        } else {
            // Large x gives 0; NaN is passed through.
            ret = BUILTIN_ISNAN_F64(x) ? x : 0.0;
        }
    }

    return ret;
}

#endif

ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfcF.cl000066400000000000000000000110141415221260100204130ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

#if !defined EXTRA_ACCURACY

CONSTATTR extern float MATH_PRIVATE(erfcx)(float);

// Complementary error function, single precision (default build).
//
// Uses erfc(|x|) = exp(-x^2) * erfcx(|x|).  -x*x is formed as a rounded
// product x2h plus its rounding error x2l (via FMA), and exp(x2h) is
// corrected by the factor (1 + x2l).  The result is forced to 0 above the
// cut-off 0x1.41bbf8p+3f, and negative arguments use erfc(-x) = 2 - erfc(x).
CONSTATTR float
MATH_MANGLE(erfc)(float x)
{
    float ax = BUILTIN_ABS_F32(x);
    float x2h = -x*x;
    float x2l = BUILTIN_FMA_F32(-x, x, -x2h);
    float e = MATH_MANGLE(exp)(x2h);
    e = BUILTIN_FMA_F32(e, x2l, e);
    float ret = e * MATH_PRIVATE(erfcx)(ax);
    ret = ax > 0x1.41bbf8p+3f ? 0.0f : ret;
    float nret = 2.0f - ret;
    return x < 0.0f ? nret : ret;
}

#else

// Some of this implementation is based on ideas from Sun LLVM

/*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
* ====================================================
 */

// Complementary error function, single precision (EXTRA_ACCURACY build).
//
// The argument range is cut into intervals, each with its own Horner-form
// polynomial:
//   x <  0x1.e861fcp-2f :
//       x > -1          : 1 - x * P(x^2)
//       x > -2          : P(-x - 1)
//       x > -3.74609375 : 2 - P(-x - 2)^16
//       otherwise       : 2
//   x >= 0x1.e861fcp-2f :
//       x < 1, 1.5, 1.75 : P(x - 0.75), P(x - 1.25), P(x - 1.625)
//       x < 10.0234375   : exp(-x*x - 0.5625 + P(1/x^2)) / x, with x*x
//                          split through xh (x with its low 13 bits cleared)
//       otherwise        : 0, or x itself when x is NaN
CONSTATTR float
MATH_MANGLE(erfc)(float x)
{
    float ret;

    if (x < 0x1.e861fcp-2f) {
        if (x > -1.0f) {
            float t = x * x;
            // Statement terminated with ';' (the original had a stray ','
            // that only worked through the comma operator).
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      0x1.496a32p-14f, -0x1.a3f700p-11f), 0x1.5405b2p-8f),
                      -0x1.b7f90ep-6f), 0x1.ce2cf8p-4f), -0x1.81273ep-2f),
                      0x1.20dd74p+0f);
            ret = MATH_MAD(-x, ret, 1.0f);
        } else if (x > -2.0f) {
            float t = -x - 1.0f;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      -0x1.e72c84p-9f, 0x1.fe43a0p-6f), -0x1.6c8eecp-4f),
                      0x1.3db6cep-4f), 0x1.1760e0p-3f), -0x1.a8d6d0p-2f),
                      0x1.a90f56p-2f), 0x1.d7bb3ep+0f);
        } else if (x > -3.74609375f) {
            float t = -x - 2.0f;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      -0x1.19665ap-13f, -0x1.d8e18ap-14f), 0x1.13b7c0p-7f),
                      -0x1.cf36a8p-7f), -0x1.9460fap-3f), 0x1.6e23c8p-1f);
            // Three squarings give ret^8; the MAD below squares once more: 2 - ret^16.
            ret = ret*ret;
            ret = ret*ret;
            ret = ret*ret;
            ret = MATH_MAD(-ret, ret, 2.0f);
        } else {
            return 2.0f;
        }
    } else {
        if (x < 1.0f) {
            float t = x - 0.75f;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      0x1.b3ca9ap-6f, 0x1.a27606p-5f), -0x1.3489bcp-3f),
                      -0x1.b5b5f0p-6f), 0x1.edc50cp-2f), -0x1.492e58p-1f),
                      0x1.27c6d2p-2f);
        } else if (x < 1.5f) {
            float t = x - 1.25f;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      -0x1.558b4ep-6f, 0x1.7f4316p-5f), 0x1.9362c6p-8f),
                      -0x1.5716acp-3f), 0x1.2ebf30p-2f), -0x1.e4653cp-3f),
                      0x1.3bcd14p-4f);
        } else if (x < 1.75f) {
            float t = x - 1.625f;
            ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                  MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      -0x1.d1cd9cp-7f, 0x1.2d8f6cp-9f), 0x1.9742c6p-5f),
                      -0x1.d66472p-4f), 0x1.0bcfcep-3f), -0x1.499d46p-4f),
                      0x1.612d8ap-6f);
        } else if (x < 10.0234375f) {
            float t = MATH_FAST_RCP(x*x);
            if (x < 2.75f)
                ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                          0x1.ecf46ap-1f, -0x1.d8a006p+0f), 0x1.ab72d8p+0f),
                          -0x1.05ed12p+0f), 0x1.2691fep-1f), -0x1.fd0ddcp-2f),
                          -0x1.45b16ep-7f);
            else
                ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                      MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                          0x1.107a4cp+4f, -0x1.7fa404p+3f), 0x1.22b8c8p+2f),
                          -0x1.7faf0cp+0f), 0x1.3f746ep-1f), -0x1.fffc90p-2f),
                          -0x1.4341a6p-7f);

            // xh is x with its low 13 bits cleared, so that
            // -x*x = -xh*xh + (xh - x)*(xh + x); the two exponentials are
            // evaluated separately and multiplied.
            float xh = AS_FLOAT(AS_INT(x) & 0xffffe000);
            ret = MATH_FAST_DIV(MATH_MANGLE(exp)(MATH_MAD(xh - x, xh + x, ret)), x) *
                  MATH_MANGLE(exp)(MATH_MAD(xh, -xh, -0.5625f));
        } else {
            // Large x gives 0; NaN is passed through.
            ret = BUILTIN_ISNAN_F32(x) ? x : 0.0f;
        }
    }

    return ret;
}

#endif

ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfcH.cl000066400000000000000000000007111415221260100204170ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

// UGEN is a macro from the project headers; NOTE(review): it presumably
// generates the vector (half2) overload of erfc - confirm in mathH.h.
CONSTATTR UGEN(erfc)

// Complementary error function, half precision: the argument is widened to
// float, the MATH_UPMANGLE(erfc) implementation is called, and the result is
// narrowed back to half.
CONSTATTR half
MATH_MANGLE(erfc)(half x)
{
    return (half)MATH_UPMANGLE(erfc)((float)x);
}

ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfcinvD.cl000066400000000000000000000131621415221260100211340ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(erfcinv)(double y) { double ret; if (y > 0.625) { ret = MATH_MANGLE(erfinv)(1.0 - y); } else if (y > 0x1.0p-10) { double t = -MATH_MANGLE(log)(y * (2.0 - y)) - 3.125; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.1267a785a1166p-69, -0x1.a6581051dd484p-63), 0x1.2b2956fc047a4p-60), 0x1.ad835aed5cc07p-57), -0x1.25e0612eae68fp-53), 0x1.a0cab63f02a91p-57), 0x1.d9227af501adbp-48), -0x1.6c3ad559a9b4ep-45), -0x1.6cafa36036318p-44), 0x1.72879641e158fp-39), -0x1.c89d755f7fff8p-37), -0x1.dc51171ddae3ap-35), 0x1.20f512744ae65p-30), -0x1.1a9e5f4bcfcd8p-28), -0x1.f36ce926b83e8p-26), 0x1.c6b4f6c7cfa1ep-22), -0x1.6e8a53e0c2026p-20), -0x1.d1d1f7bf4570bp-17), 0x1.879c2a20cc3e2p-13), -0x1.8457694844d14p-11), -0x1.8b6c33114edadp-8), 0x1.ebd80d9b13e14p-3), 0x1.a755e7c99ae86p+0); ret = BUILTIN_FMA_F64(-y, ret, ret); } else { double s = MATH_SQRT(-MATH_MANGLE(log)(y)); double t = MATH_RCP(s); if (y > 0x1.0p-19) { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.8b3cfc98a5212p+4, -0x1.907bcdab54a4ep+6), 0x1.7659cf8216d7dp+7), -0x1.ac222777f664dp+7), 0x1.4f2f8e33151acp+7), -0x1.7d7d1eb301c4cp+6), 0x1.48e630c1c77e7p+5), -0x1.c63e7d0e327f6p+3), 0x1.225b286aeb0dfp+2), -0x1.82a4acc22b05dp+0), -0x1.0a88271680e57p-5), 0x1.001f6acebb122p+0); } else if (y > 0x1.0p-40) { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.0fdcb40bf066dp+9, -0x1.870ddeaa832dbp+10), 0x1.035c39e0428c4p+11), -0x1.a4d3c54a3ec14p+10), 0x1.d382aee6efae8p+9), -0x1.79f9e26565bc1p+8), 
0x1.d00e058ce9abap+6), -0x1.c7d1e01821eb3p+4), 0x1.9d930ba7a3111p+2), -0x1.af47941dd2baap+0), -0x1.787ecc823998bp-6), 0x1.000fae5fb73e3p+0); } else if (y > 0x1.0p-82) { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.c9e5b8e31c18ep+13, -0x1.c866153b1bce6p+14), 0x1.a386b3b4fb25cp+14), -0x1.d7bf378e7b5fbp+13), 0x1.6b416de0a7a75p+12), -0x1.9757c1cf44e90p+10), 0x1.5b56ededbaa8cp+8), -0x1.da79924b4d155p+5), 0x1.2ba25315d612bp+3), -0x1.de5808fbd786dp+0), -0x1.04e014b9fc507p-6), 0x1.000788df1c89fp+0); } else if (y > 0x1.0p-200) { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.ff518aae00301p+18, -0x1.5781ef98c6aa9p+19), 0x1.a9511b21c7715p+18), -0x1.41d8f1455b21ep+17), 0x1.4d4a3d4025a4cp+15), -0x1.f640fe7077996p+12), 0x1.1faf674f42181p+10), -0x1.080c5cd81d791p+7), 0x1.c0ae370098ef4p+3), -0x1.08ebd67dc005ap+1), -0x1.5cf3329e72289p-7), 0x1.00035e75f27e2p+0); } else if (y > 0x1.0p-400) { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.d554f00bf9d81p+20, 0x1.8456711ff3627p+20), -0x1.26c90acc5daafp+19), 0x1.106501cdef815p+17), -0x1.57a4c95601c04p+14), 0x1.3ca627cbaede6p+11), -0x1.c716e091922fbp+7), 0x1.292f8f6e8bc75p+4), -0x1.1b469c212bd5fp+1), -0x1.04977fb6d0462p-7), 0x1.0001dc9f52f8ap+0); } else if (y > 0x1.0p-900) { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.21913925f3a73p+25, 0x1.4aa2fba282b9bp+24), -0x1.5a2a3f9742896p+22), 0x1.b8ee3895772e8p+19), -0x1.7f2ce0b036be4p+16), 0x1.e62ab1bcbb738p+12), -0x1.e0ed2965d2a06p+8), 0x1.b0c16705263e5p+4), -0x1.334f9a732ecc7p+1), -0x1.65f60412f9578p-8), 0x1.0000e0bda43b5p+0); } else { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.e3d70f1fdc7bep+11, 
0x1.28d9acd5b9596p+10), -0x1.554c1ce591414p+7), 0x1.15b1e5a1fe7f5p+4), -0x1.1aa8e6f616c69p+1), -0x1.f6803b3b4d6ccp-8), 0x1.00019ac5bed2ap+0); } ret = s * ret; } if (!FINITE_ONLY_OPT()) { ret = ((y < 0.0) | (y > 2.0)) ? AS_DOUBLE(QNANBITPATT_DP64) : ret; ret = y == 0.0 ? AS_DOUBLE(PINFBITPATT_DP64) : ret; ret = y == 2.0 ? AS_DOUBLE(NINFBITPATT_DP64) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfcinvF.cl000066400000000000000000000037361415221260100211440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(erfcinv)(float y) { float ret; if (y > 0.625f) { ret = MATH_MANGLE(erfinv)(1.0f - y); } else if (y > 0x1.0p-10f) { float t = -MATH_MANGLE(log)(y * (2.0f - y)) - 3.125f; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.7ee662p-31f, -0x1.3f5a80p-28f), -0x1.b638f0p-26f), 0x1.c9ccc6p-22f), -0x1.72f8aep-20f), -0x1.d21aa6p-17f), 0x1.87aebcp-13f), -0x1.8455d4p-11f), -0x1.8b6ca4p-8f), 0x1.ebd80cp-3f), 0x1.a755e8p+0f); ret = MATH_MAD(-y, ret, ret); } else { float s = MATH_FAST_SQRT(-MATH_MANGLE(log)(y)); float t = MATH_FAST_RCP(s); if (y > 0x1.0p-42f) { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.57221ep+0f, 0x1.7f6144p+1f), -0x1.98dd40p+1f), 0x1.2c9066p+1f), -0x1.3a07eap+0f), -0x1.ba546cp-5f), 0x1.004e66p+0f); } else { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.649c6ap+4f, 0x1.8fa8fap+4f), -0x1.a112d8p+3f), 0x1.309d98p+2f), -0x1.919488p+0f), -0x1.c084ecp-6f), 0x1.00143ep+0f); } ret = s * ret; } if (!FINITE_ONLY_OPT()) { ret = ((y < 0.0f) | (y > 2.0f)) ? 
AS_FLOAT(QNANBITPATT_SP32) : ret; ret = y == 0.0f ? AS_FLOAT(PINFBITPATT_SP32) : ret; ret = y == 2.0f ? AS_FLOAT(NINFBITPATT_SP32) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfcinvH.cl000066400000000000000000000007221415221260100211360ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(erfcinv) CONSTATTR half MATH_MANGLE(erfcinv)(half x) { return (half)MATH_UPMANGLE(erfcinv)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfcxD.cl000066400000000000000000000130461415221260100206100ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_PRIVATE(erfcx)(double x) { double n = x - 4.0; double d = x + 4.0; double r = MATH_FAST_RCP(d); double q = n * r; double e = MATH_MAD(-q, x, MATH_MAD(q + 1.0, -4.0, x)); q = BUILTIN_FMA_F64(r, e, q); double p = MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, -0x1.1f39d54df3c0ep-27, -0x1.1166337cfa789p-27), 0x1.b45f1d9802b82p-24), 0x1.d90488a03dcdbp-25), -0x1.b87b02eba62d8p-21), 0x1.5104ba56e15f1p-22), 0x1.7f29f71c907dep-18), -0x1.78f5c2cd770fbp-17), -0x1.995fb76d0a51ap-16), 0x1.3be2ec022d0edp-13), -0x1.a1deb2fdbf62ep-13), -0x1.8d4ac3689fc43p-11), 0x1.49c67192d909bp-8), -0x1.09623852ff07p-6), 0x1.3079edfadea8fp-5), -0x1.0fb06dff6591p-4), 0x1.7fee004de8f32p-4), -0x1.9ddb23c3dbeb3p-4), 0x1.16ecefcfa693p-4), 0x1.f7f5df66fb8a3p-7), -0x1.1df1ad154a2a8p-3), 0x1.dd2c8b74febf8p-3); double tx = x + x; d = 1.0 + tx; r = MATH_FAST_RCP(d); q = MATH_MAD(p, r, r); e = MATH_MAD(-q, tx, 1.0) + (p - q); q = MATH_MAD(r, e, q); return q; } #if !defined EXTRA_ACCURACY CONSTATTR double MATH_MANGLE(erfcx)(double x) { double ax = BUILTIN_ABS_F64(x); double ret; if (ax < 0x1.b39dc41e48bfcp+4) { ret = MATH_PRIVATE(erfcx)(ax); } else { double r = MATH_RCP(ax); double t = r*r; double p = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -29.53125, 6.5625), -1.875), 0.75), -0.5), 1.0); ret = 0x1.20dd750429b6dp-1 * r * p; } if (x < 0.0) { double x2h = x*x; double x2l = MATH_MAD(x, x, -x2h); double e = MATH_MANGLE(exp)(x2h); ret = MATH_MAD(2.0, MATH_MAD(e, x2l, e), -ret); ret = x < -0x1.aa0f4d2e063cep+4 ? 
AS_DOUBLE(PINFBITPATT_DP64) : ret; } return ret; } #else CONSTATTR double MATH_MANGLE(erfcx)(double x) { double ax = BUILTIN_ABS_F64(x); double ret; if (ax < 1.0) { ret = MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, 0x1.997339112da12p-29, -0x1.9a1485b7ae337p-27), 0x1.9548ab4c5bb56p-26), -0x1.2f88b47e02dc3p-24), 0x1.282114351c39ap-22), -0x1.e533a426aadd7p-21), 0x1.723131b8ef11ep-19), -0x1.188f6b08d66b9p-17), 0x1.a00995a561233p-16), -0x1.2aeb04681fed5p-14), 0x1.a01b9d82bcaa5p-13), -0x1.182d3bb1ac2c8p-11), 0x1.6c16a932f49d1p-10), -0x1.c74aef6905182p-9), 0x1.111111f403407p-7), -0x1.390379458257cp-6), 0x1.5555554b34536p-5), -0x1.6023e8de7793p-4), 0x1.5555555597342p-3), -0x1.341f6bc020c17p-2), 0x1.fffffffffe5aep-2), -0x1.812746b037cadp-1), 0x1.000000000001dp0), -0x1.20dd750429b6ap0), 0x1.0p0); } else if (ax < 5120.0) { double t = MATH_DIV(ax - 4.0, ax + 4.0); ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0.14981549849751462e-8, -0.69954933359042387e-8), -0.15965692247743744e-7), 0.92967132363414431e-7), 0.70214215034531004e-7), -0.80204958740421079e-6), 0.29923810132862422e-6), 0.56895739871851154e-5), -0.11226090578381133e-4), -0.2438781785281914e-4), 0.00015062360829881126), -0.00019926094025574419), -0.00075777387606136804), 0.0050319709983606006), -0.016197733946788412), 0.037167515387099868), -0.066330365824435124), 0.093732835010698844), -0.10103906603561565), 0.068097054254223675), 0.015379652102604634), -0.13962111684055725), 1.2329951186255526); ret = MATH_DIV(ret, MATH_MAD(ax, 2.0, 1.0)); } else { const 
double one_over_sqrtpi = 0x1.20dd750429b6dp-1; double z = MATH_RCP(x * x); ret = MATH_DIV(one_over_sqrtpi, x) * MATH_MAD(z, MATH_MAD(z, 0.375, -0.5), 1.0); } if (x <= -1.0) { double x2h = ax * ax; double x2l = BUILTIN_FMA_F64(ax, ax, -x2h); ret = MATH_MANGLE(exp)(x2h) * MATH_MANGLE(exp)(x2l) * 2.0 - ret; ret = x < -27.0 ? AS_DOUBLE(PINFBITPATT_DP64) : ret; } return ret; } #endif ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfcxF.cl000066400000000000000000000070331415221260100206110ustar00rootroot00000000000000 #include "mathF.h" CONSTATTR float MATH_PRIVATE(erfcx)(float x) { float n = x - 2.0f; float d = x + 2.0f; float r = MATH_FAST_RCP(d); float q = n * r; float e = BUILTIN_FMA_F32(-q, x, BUILTIN_FMA_F32(q + 1.0f, -2.0f, x)); q = BUILTIN_FMA_F32(r, e, q); float p = MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, MATH_MAD(q, -0x1.adf188p-12f, -0x1.45aea6p-10f), 0x1.5a5f68p-10f), 0x1.1b44cep-7f), -0x1.082b62p-7f), -0x1.bc143p-5f), 0x1.4ffc54p-3f), -0x1.5407fap-3f), -0x1.7bf616p-4f), 0x1.1ba038p-2); float tx = x + x; d = 1.0f + tx; r = MATH_FAST_RCP(d); q = BUILTIN_FMA_F32(p, r, r); e = BUILTIN_FMA_F32(-q, tx, 1.0f) + (p - q); q = BUILTIN_FMA_F32(r, e, q); return q; } #if !defined EXTRA_ACCURACY CONSTATTR float MATH_MANGLE(erfcx)(float x) { float ax = BUILTIN_ABS_F32(x); float ret; if (ax < 0x1.41bbf8p+3f) { ret = MATH_PRIVATE(erfcx)(ax); } else { float r = MATH_FAST_RCP(0x1.0p-2f * ax); float t = r*r * 0x1.0p-4f; float p = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 6.5625f, -1.875f), 0.75f), -0.5f), 1.0f); ret = 0x1.20dd76p-3f * r * p; } if (x < 0.0f) { float x2h = x*x; float x2l = BUILTIN_FMA_F32(x, x, -x2h); float e = MATH_MANGLE(exp)(x2h); ret = BUILTIN_FMA_F32(2.0f, BUILTIN_FMA_F32(e, x2l, e), -ret); ret = x < -0x1.2d6abcp+3f ? 
AS_FLOAT(PINFBITPATT_SP32) : ret; } return ret; } #else CONSTATTR float MATH_MANGLE(erfcx)(float x) { float ax = BUILTIN_ABS_F32(x); float ret; if (ax < 1.0f) { ret = MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, -0x1.77d64p-11f, 0x1.269372p-9f), -0x1.c27dd4p-9f), 0x1.d3d3c4p-8f), -0x1.35d6cap-6f), 0x1.5bb082p-5f), -0x1.60e46ep-4f), 0x1.54d3e4p-3f), -0x1.340edap-2f), 0x1.00049ap-1f), -0x1.81286p-1f), 0x1.ffffcap-1f), -0x1.20dd7p+0f), 0x1.0p+0f); } else if (ax < 32.0f) { float t = MATH_DIV(ax - 4.0f, ax + 4.0f); ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0.00416076401f, -0.0167250745f), 0.0378070959f), -0.0661972834f), 0.0935599947f), -0.101052745f), 0.0681148962f), 0.0153801711f), -0.139621619f), 1.23299511f); ret = MATH_DIV(ret, MATH_MAD(ax, 2.0f, 1.0f)); } else { const float one_over_sqrtpi = 0x1.20dd76p-1f; float z = MATH_RCP(x * x); ret = MATH_DIV(one_over_sqrtpi, x) * MATH_MAD(z, MATH_MAD(z, 0.375f, -0.5f), 1.0f); } if (x <= -1.0f) { float x2h, x2l; if (HAVE_FAST_FMA32()) { x2h = ax * ax; x2l = BUILTIN_FMA_F32(ax, ax, -x2h); } else { float xh = AS_FLOAT(AS_UINT(ax) & 0xfffff000U); float xl = ax - xh; x2h = xh*xh; x2l = (ax + xh)*xl; } ret = MATH_MANGLE(exp)(x2h) * MATH_MANGLE(exp)(x2l) * 2.0f - ret; ret = x < -10.0f ? 
AS_FLOAT(PINFBITPATT_SP32) : ret; } return ret; } #endif ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfcxH.cl000066400000000000000000000002141415221260100206050ustar00rootroot00000000000000 #include "mathH.h" CONSTATTR UGEN(erfcx) CONSTATTR half MATH_MANGLE(erfcx)(half x) { return (half)MATH_UPMANGLE(erfcx)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfinvD.cl000066400000000000000000000130261415221260100207700ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(erfinv)(double x) { double ax = BUILTIN_ABS_F64(x); double ret; if (ax < 0.375) { double t = ax*ax; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.c5ec06cd8002bp-2, -0x1.bb7dd47aef0d6p-1), 0x1.d189992eccdb6p-1), -0x1.10ec180cde957p-1), 0x1.05cce379dd66fp-2), -0x1.6b9067e3dae74p-5), 0x1.5f7f0487c11a3p-5), 0x1.e0fbf22b2350cp-6), 0x1.2ce26322b7f90p-5), 0x1.5ebeeee81dd31p-5), 0x1.a7cacb897f0d4p-5), 0x1.0a130d62cba32p-4), 0x1.62847c8653359p-4), 0x1.053c2c0a5e083p-3), 0x1.db29fb2feec72p-3), 0x1.c5bf891b4ef6ap-1); ret = ax * ret; } else if (ax < 0x1.fffep-1) { double w = -MATH_MANGLE(log)(BUILTIN_FMA_F64(-ax, ax, 1.0)); if (w < 6.25) { w = w - 3.125; ret = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, -0x1.135d2e746e627p-68, -0x1.8ddf93324d327p-63), 0x1.7b83eef0b7c9fp-60), 0x1.9ba72cd589b91p-57), -0x1.33689090a6b96p-53), 0x1.82e11898132e0p-56), 
0x1.de4acfd9e26bap-48), -0x1.6d33eed66c487p-45), -0x1.6f2167040d8e2p-44), 0x1.72a22c2d77e20p-39), -0x1.c8859c4e5c0afp-37), -0x1.dc583d118a561p-35), 0x1.20f47ccf46b3cp-30), -0x1.1a9e38dc84d60p-28), -0x1.f36cd6d3d46a9p-26), 0x1.c6b4f5d03b787p-22), -0x1.6e8a5434ae8a2p-20), -0x1.d1d1f7b8736f6p-17), 0x1.879c2a212f024p-13), -0x1.845769484fca8p-11), -0x1.8b6c33114f909p-8), 0x1.ebd80d9b13e28p-3), 0x1.a755e7c99ae86p+0); } else if (w < 16.0) { w = MATH_SQRT(w) - 3.25; ret = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, 0x1.3040f87dbd932p-29, 0x1.85cbe52878635p-24), -0x1.2777453dd3955p-22), 0x1.395abcd554c6cp-26), 0x1.936388a3790adp-20), -0x1.0d5db812b5083p-18), 0x1.8860cd5d652f6p-19), 0x1.a29a0cacdfb23p-17), -0x1.8cef1f80281f2p-15), 0x1.1e684d0b9188ap-14), 0x1.932cd54c8a222p-16), -0x1.7448a89ef8aa3p-12), 0x1.f3cc55ad40c25p-11), -0x1.ba924132f38b1p-10), 0x1.468eeca533cf8p-9), -0x1.ebadabb891bbdp-9), 0x1.5ffcfe5b76afcp-8), 0x1.0158a6d641d39p+0), 0x1.8abcc380d5a48p+1); } else { w = MATH_SQRT(w) - 5.0; ret = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, -0x1.dcec3a7785389p-36, -0x1.18feec0e38727p-32), 0x1.9e6bf2dda45e3p-30), -0x1.0468fb24e2f5fp-28), 0x1.05ac6a8fba182p-27), -0x1.0102e495fb9c0p-26), 0x1.f4c20e1334af8p-26), -0x1.22d220fdf9c3ep-24), 0x1.ebc8bb824cb54p-23), -0x1.0a8d40ea372ccp-20), 0x1.2fbd29d093d2bp-18), -0x1.4a3497e1e0facp-16), 0x1.3ebf4eb00938fp-14), -0x1.c2f36a8fc5d53p-13), -0x1.22ea5df04047cp-13), 0x1.02a30d1fba0dcp+0), 0x1.3664ddd1ad7fbp+2); } ret = ax * ret; } else { double s = MATH_SQRT(-MATH_MANGLE(log)(1.0 - ax)); double t = MATH_RCP(s); if (ax < 0x1.fffffffep-1) { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 
MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.c4bd831a51669p+7, -0x1.66af45b757c26p+9), 0x1.061b293ee1671p+10), -0x1.d4aa0fd7248e9p+9), 0x1.1eebb0088748dp+9), -0x1.ff4cb6c165efep+7), 0x1.59c379a609255p+6), -0x1.762b2677680c6p+4), 0x1.7626132cf7c5ap+2), -0x1.a298cc231a949p+0), -0x1.9fa2d429b22cap-6), 0x1.00131c4b15d15p+0); } else { ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.e1f462cc8e58ap+7, -0x1.dd260d25bee8dp+8), 0x1.af7dab6c206e6p+8), -0x1.d97c75a0f5809p+7), 0x1.632c20bf45d30p+6), -0x1.8e4908179a727p+4), 0x1.89538a73a2c3cp+2), -0x1.aad8569b3607dp+0), -0x1.80d1bec4b54cbp-6), 0x1.001006f90ea2cp+0); } ret = s * ret; } if (!FINITE_ONLY_OPT()) { ret = ax > 1.0 ? AS_DOUBLE(QNANBITPATT_DP64) : ret; ret = ax == 1.0 ? AS_DOUBLE(PINFBITPATT_DP64) : ret; } return BUILTIN_COPYSIGN_F64(ret, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfinvF.cl000066400000000000000000000037061415221260100207760ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(erfinv)(float x) { float ax = BUILTIN_ABS_F32(x); float p; if (ax < 0.375f) { float t = ax*ax; p = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.48b6cap-3f, -0x1.a2930ap-6f), 0x1.65b0b4p-4f), 0x1.5581aep-4f), 0x1.05aa56p-3f), 0x1.db2748p-3f), 0x1.c5bf8ap-1f); } else { float w; if (HAVE_FAST_FMA32()) { w = BUILTIN_FMA_F32(-ax, ax, 1.0f); } else { w = (1.0f - ax) * (1.0f + ax); } w = -MATH_MANGLE(log)(w); if (w < 5.0f) { w = w - 2.5f; p = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, 0x1.e2cb10p-26f, 0x1.70966cp-22f), -0x1.d8e6aep-19f), -0x1.26b582p-18f), 0x1.ca65b6p-13f), -0x1.48a810p-10f), -0x1.11c9dep-8f), 0x1.f91ec6p-3f), 0x1.805c5ep+0f); } else { w = MATH_SQRT(w) - 3.0f; p = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, -0x1.a3e136p-13f, 0x1.a76ad6p-14f), 0x1.61b8e4p-10f), -0x1.e17bcep-9f), 0x1.7824f6p-8f), -0x1.f38baep-8f), 0x1.354afcp-7f), 0x1.006db6p+0f), 0x1.6a9efcp+1f); } } float ret = p*ax; if (!FINITE_ONLY_OPT()) { ret = ax > 1.0f ? AS_FLOAT(QNANBITPATT_SP32) : ret; ret = ax == 1.0f ? AS_FLOAT(PINFBITPATT_SP32) : ret; } return BUILTIN_COPYSIGN_F32(ret, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/erfinvH.cl000066400000000000000000000007171415221260100207770ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(erfinv) CONSTATTR half MATH_MANGLE(erfinv)(half x) { return (half)MATH_UPMANGLE(erfinv)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/exp10D.cl000066400000000000000000000005611415221260100204340ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define COMPILING_EXP10 #include "expD_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/exp10F.cl000066400000000000000000000005611415221260100204360ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define COMPILING_EXP10 #include "expF_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/exp10H.cl000066400000000000000000000007311415221260100204370ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(exp10) CONSTATTR half MATH_MANGLE(exp10)(half x) { return (half)BUILTIN_EXP2_F32((float)x * 0x1.a934f0p+1f); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/exp2D.cl000066400000000000000000000005601415221260100203540ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define COMPILING_EXP2 #include "expD_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/exp2F.cl000066400000000000000000000005601415221260100203560ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define COMPILING_EXP2 #include "expF_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/exp2H.cl000066400000000000000000000006711415221260100203630ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(exp2) CONSTATTR half MATH_MANGLE(exp2)(half x) { return BUILTIN_EXP2_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/expD.cl000066400000000000000000000005571415221260100203000ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define COMPILING_EXP #include "expD_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/expD_base.h000066400000000000000000000033721415221260100211210ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double #if defined COMPILING_EXP2 MATH_MANGLE(exp2)(double x) #elif defined COMPILING_EXP10 MATH_MANGLE(exp10)(double x) #else MATH_MANGLE(exp)(double x) #endif { #if defined(COMPILING_EXP2) double dn = BUILTIN_RINT_F64(x); double f = x - dn; double t = MATH_MAD(f, 0x1.62e42fefa39efp-1, f * 0x1.abc9e3b39803fp-56); #elif defined(COMPILING_EXP10) double dn = BUILTIN_RINT_F64(x * 0x1.a934f0979a371p+1); double f = MATH_MAD(-dn, -0x1.9dc1da994fd21p-59, MATH_MAD(-dn, 0x1.34413509f79ffp-2, x)); double t = MATH_MAD(f, 0x1.26bb1bbb55516p+1, f * -0x1.f48ad494ea3e9p-53); #else double dn = BUILTIN_RINT_F64(x * 0x1.71547652b82fep+0); double t = MATH_MAD(-dn, 0x1.abc9e3b39803fp-56, MATH_MAD(-dn, 0x1.62e42fefa39efp-1, x)); #endif double p = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.ade156a5dcb37p-26, 0x1.28af3fca7ab0cp-22), 0x1.71dee623fde64p-19), 0x1.a01997c89e6b0p-16), 0x1.a01a014761f6ep-13), 0x1.6c16c1852b7b0p-10), 0x1.1111111122322p-7), 0x1.55555555502a1p-5), 0x1.5555555555511p-3), 0x1.000000000000bp-1), 1.0), 1.0); double z = BUILTIN_FLDEXP_F64(p, (int)dn); if (!FINITE_ONLY_OPT()) { z = x > 1024.0 ? AS_DOUBLE(PINFBITPATT_DP64) : z; } z = x < -1075.0 ? 0.0 : z; return z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/expF.cl000066400000000000000000000005571415221260100203020ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define COMPILING_EXP #include "expF_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/expF_base.h000066400000000000000000000120211415221260100211120ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" // Algorithm: // // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) // // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer // n = 64*m + j, 0 <= j < 64 // // e^x = 2^((64*m + j + f)/64) // = (2^m) * (2^(j/64)) * 2^(f/64) // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) // // f = x*(64/ln(2)) - n // r = f*(ln(2)/64) = x - n*(ln(2)/64) // // e^x = (2^m) * (2^(j/64)) * e^r // // (2^(j/64)) is precomputed // // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! // e^r = 1 + q // // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 
// // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) PUREATTR float #if defined COMPILING_EXP2 MATH_MANGLE(exp2)(float x) #elif defined COMPILING_EXP10 MATH_MANGLE(exp10)(float x) #else MATH_MANGLE(exp)(float x) #endif { if (DAZ_OPT()) { if (UNSAFE_MATH_OPT()) { #if defined COMPILING_EXP2 return BUILTIN_EXP2_F32(x); #elif defined COMPILING_EXP10 return BUILTIN_EXP2_F32(x * 0x1.a92000p+1f) * BUILTIN_EXP2_F32(x * 0x1.4f0978p-11f); #else return BUILTIN_EXP2_F32(x * 0x1.715476p+0f); #endif } else { #if defined COMPILING_EXP2 return BUILTIN_EXP2_F32(x); #else float ph, pl; if (HAVE_FAST_FMA32()) { #if defined COMPILING_EXP const float c = 0x1.715476p+0f; const float cc = 0x1.4ae0bep-26f; // c+cc are 49 bits #else const float c = 0x1.a934f0p+1f; const float cc = 0x1.2f346ep-24f; #endif ph = x * c; pl = BUILTIN_FMA_F32(x, cc, BUILTIN_FMA_F32(x, c, -ph)); } else { #if defined COMPILING_EXP const float ch = 0x1.714000p+0f; const float cl = 0x1.47652ap-12f; // ch + cl are 36 bits #else const float ch = 0x1.a92000p+1f; const float cl = 0x1.4f0978p-11f; #endif float xh = AS_FLOAT(AS_INT(x) & 0xfffff000); float xl = x - xh; ph = xh * ch; pl = MATH_MAD(xh, cl, MATH_MAD(xl, ch, xl*cl)); } float e = BUILTIN_RINT_F32(ph); float a = ph - e + pl; float r = BUILTIN_FLDEXP_F32(BUILTIN_EXP2_F32(a), (int)e); #if defined COMPILING_EXP r = x < -0x1.5d58a0p+6f ? 0.0f : r; r = x > 0x1.62e430p+6f ? AS_FLOAT(PINFBITPATT_SP32) : r; #else r = x < -0x1.2f7030p+5f ? 0.0f : r; r = x > 0x1.344136p+5f ? AS_FLOAT(PINFBITPATT_SP32): r; #endif return r; #endif } } else { if (UNSAFE_MATH_OPT()) { #if defined COMPILING_EXP2 bool s = x < -0x1.f80000p+6f; return BUILTIN_EXP2_F32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); #elif defined COMPILING_EXP10 bool s = x < -0x1.2f7030p+5f; x += s ? 0x1.0p+5f : 0.0f; return BUILTIN_EXP2_F32(x * 0x1.a92000p+1f) * BUILTIN_EXP2_F32(x * 0x1.4f0978p-11f) * (s ? 0x1.9f623ep-107f : 1.0f); #else bool s = x < -0x1.5d58a0p+6f; return BUILTIN_EXP2_F32((x + (s ? 
0x1.0p+6f : 0.0f)) * 0x1.715476p+0f) * (s ? 0x1.969d48p-93f : 1.0f); #endif } else { #if defined COMPILING_EXP2 bool s = x < -0x1.f80000p+6f; return BUILTIN_EXP2_F32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); #else float ph, pl; if (HAVE_FAST_FMA32()) { #if defined COMPILING_EXP const float c = 0x1.715476p+0f; const float cc = 0x1.4ae0bep-26f; // c+cc are 49 bits #else const float c = 0x1.a934f0p+1f; const float cc = 0x1.2f346ep-24f; #endif ph = x * c; pl = BUILTIN_FMA_F32(x, cc, BUILTIN_FMA_F32(x, c, -ph)); } else { #if defined COMPILING_EXP const float ch = 0x1.714000p+0f; const float cl = 0x1.47652ap-12f; // ch + cl are 36 bits #else const float ch = 0x1.a92000p+1f; const float cl = 0x1.4f0978p-11f; #endif float xh = AS_FLOAT(AS_INT(x) & 0xfffff000); float xl = x - xh; ph = xh * ch; pl = MATH_MAD(xh, cl, MATH_MAD(xl, ch, xl*cl)); } float e = BUILTIN_RINT_F32(ph); float a = ph - e + pl; float r = BUILTIN_FLDEXP_F32(BUILTIN_EXP2_F32(a), (int)e); #if defined COMPILING_EXP r = x < -0x1.9d1da0p+6f ? 0.0f : r; r = x > 0x1.62e430p+6f ? AS_FLOAT(PINFBITPATT_SP32) : r; #else r = x < -0x1.66d3e8p+5f ? 0.0f : r; r = x > 0x1.344136p+5f ? AS_FLOAT(PINFBITPATT_SP32): r; #endif return r; #endif } } } ROCm-Device-Libs-rocm-5.0.0/ocml/src/expH.cl000066400000000000000000000007251415221260100203010ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(exp) CONSTATTR half MATH_MANGLE(exp)(half x) { return (half)BUILTIN_EXP2_F32((float)x * 0x1.715476p+0f); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/expepD.cl000066400000000000000000000026731415221260100206260ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" CONSTATTR double MATH_PRIVATE(expep)(double2 x) { #if defined EXTRA_ACCURACY double dn = BUILTIN_RINT_F64(x.hi * 0x1.71547652b82fep+0); double2 t = fsub(fsub(sub(x, dn*0x1.62e42fefa3000p-1), dn*0x1.3de6af278e000p-42), dn*0x1.9cc01f97b57a0p-83); double th = t.hi; double p = MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, 0x1.ade156a5dcb37p-26, 0x1.28af3fca7ab0cp-22), 0x1.71dee623fde64p-19), 0x1.a01997c89e6b0p-16), 0x1.a01a014761f6ep-13), 0x1.6c16c1852b7b0p-10), 0x1.1111111122322p-7), 0x1.55555555502a1p-5), 0x1.5555555555511p-3), 0x1.000000000000bp-1); double2 r = fadd(t, mul(sqr(t), p)); double z = 1.0 + r.hi; z = BUILTIN_FLDEXP_F64(z, (int)dn); z = x.hi > 710.0 ? AS_DOUBLE(PINFBITPATT_DP64) : z; z = x.hi < -745.0 ? 0.0 : z; #else double z = MATH_MANGLE(exp)(x.hi); double zz = MATH_MAD(z, x.lo, z); z = BUILTIN_ISINF_F64(z)? z : zz; #endif return z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/expepF.cl000066400000000000000000000023551415221260100206250ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #define FLOAT_SPECIALIZATION #include "ep.h" CONSTATTR float MATH_PRIVATE(expep)(float2 x) { #if defined EXTRA_ACCURACY float fn = BUILTIN_RINT_F32(x.hi * 0x1.715476p+0f); float2 t = fsub(fsub(sub(x, fn*0x1.62e400p-1f), fn*0x1.7f7800p-20f), fn*0x1.473de6p-34f); float th = t.hi; float p = MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, 0x1.6850e4p-10f, 0x1.123bccp-7f), 0x1.555b98p-5f), 0x1.55548ep-3f), 0x1.fffff8p-2f); float2 r = fadd(t, mul(sqr(t), p)); float z = 1.0f + r.hi; z = BUILTIN_FLDEXP_F32(z, (int)fn); z = x.hi > 89.0f ? AS_FLOAT(PINFBITPATT_SP32) : z; z = x.hi < -104.0f ? 0.0f : z; #else float d = x.hi == 0x1.62e430p+6f ? 0x1.0p-17f : 0.0f; x.hi -= d; x.lo += d; float z = MATH_MANGLE(exp)(x.hi); float zz = BUILTIN_FMA_F32(z, x.lo, z); z = BUILTIN_ISINF_F32(z) ? z : zz; #endif return z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/expm1D.cl000066400000000000000000000031751415221260100205350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x); CONSTATTR double MATH_MANGLE(expm1)(double x) { #if defined EXTRA_ACCURACY double2 e = sub(MATH_PRIVATE(epexpep)(con(x, 0.0)), 1.0); double z = e.hi; #else double dn = BUILTIN_RINT_F64(x * 0x1.71547652b82fep+0); double t = MATH_MAD(-dn, 0x1.abc9e3b39803fp-56, MATH_MAD(-dn, 0x1.62e42fefa39efp-1, x)); double p = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.1f32ea9d67f34p-29, 0x1.af4eb2a1b768bp-26), 0x1.27e500e0ac05bp-22), 0x1.71de01b889c29p-19), 0x1.a01a0197bcfd8p-16), 0x1.a01a01ac1a723p-13), 0x1.6c16c16c18931p-10), 0x1.1111111110056p-7), 0x1.5555555555552p-5), 0x1.5555555555557p-3), 0x1.0000000000000p-1); p = MATH_MAD(t, t*p, t); int e = dn == 1024.0 ? 1023 : (int)dn; double s = BUILTIN_FLDEXP_F64(1.0, e); double z = MATH_MAD(s, p, s - 1.0); z = dn == 1024.0 ? 2.0*z : z; #endif if (!FINITE_ONLY_OPT()) { z = x > 0x1.62e42fefa39efp+9 ? AS_DOUBLE(PINFBITPATT_DP64) : z; } z = x < -37.0 ? -1.0 : z; return z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/expm1F.cl000066400000000000000000000024661415221260100205410ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #define FLOAT_SPECIALIZATION #include "ep.h" extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x); CONSTATTR float MATH_MANGLE(expm1)(float x) { #if defined EXTRA_ACCURACY float2 e = sub(MATH_PRIVATE(epexpep)(con(x, 0.0f)), 1.0f); float z = e.hi; #else float fn = BUILTIN_RINT_F32(x * 0x1.715476p+0f); float t = BUILTIN_FMA_F32(-fn, -0x1.05c610p-29f, BUILTIN_FMA_F32(-fn, 0x1.62e430p-1f, x)); float p = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.a26762p-13f, 0x1.6d2e00p-10f), 0x1.110ff2p-7f), 0x1.555502p-5f), 0x1.555556p-3f), 0x1.000000p-1f); p = BUILTIN_FMA_F32(t, t*p, t); int e = fn == 128.0f ? 127 : (int)fn; float s = BUILTIN_FLDEXP_F32(1.0f, e); float z = BUILTIN_FMA_F32(s, p, s - 1.0f); z = fn == 128.0 ? 2.0f*z : z; #endif if (!FINITE_ONLY_OPT()) { z = x > 0x1.62e42ep+6f ? AS_FLOAT(PINFBITPATT_SP32) : z; } z = x < -17.0f ? -1.0f : z; return z; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/expm1H.cl000066400000000000000000000012031415221260100205270ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(expm1) CONSTATTR half MATH_MANGLE(expm1)(half x) { half ret; ret = (half)(BUILTIN_EXP2_F32((float)x * 0x1.715476p+0f) - 1.0f); half p = BUILTIN_FMA_F16(x, x*BUILTIN_FMA_F16(x, 0x1.555556p-3h, 0.5h), x); ret = BUILTIN_ABS_F16(x) < 0x1.0p-6h ? 
p : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fabsD.cl000066400000000000000000000006461415221260100204160ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(fabs)(double x) { return BUILTIN_ABS_F64(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fabsF.cl000066400000000000000000000006441415221260100204160ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(fabs)(float x) { return BUILTIN_ABS_F32(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fabsH.cl000066400000000000000000000007631415221260100204220ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(fabs)(half2 x) { return BUILTIN_ABS_2F16(x); } CONSTATTR half MATH_MANGLE(fabs)(half x) { return BUILTIN_ABS_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fdimD.cl000066400000000000000000000007301415221260100204140ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(fdim)(double x, double y) { return (x <= y && !BUILTIN_ISUNORDERED_F64(x, y)) ? 0.0 : (x - y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fdimF.cl000066400000000000000000000007261415221260100204230ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(fdim)(float x, float y) { return (x <= y && !BUILTIN_ISUNORDERED_F32(x, y)) ? 0.0f : (x - y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fdimH.cl000066400000000000000000000007511415221260100204230ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR BGEN(fdim) CONSTATTR half MATH_MANGLE(fdim)(half x, half y) { return (x <= y && !BUILTIN_ISUNORDERED_F16(x, y)) ? 0.0h : (x - y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/floorD.cl000066400000000000000000000006511415221260100206200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(floor)(double x) { return BUILTIN_FLOOR_F64(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/floorF.cl000066400000000000000000000006471415221260100206270ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(floor)(float x) { return BUILTIN_FLOOR_F32(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/floorH.cl000066400000000000000000000007711415221260100206270ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(floor)(half2 x) { return BUILTIN_FLOOR_2F16(x); } CONSTATTR half MATH_MANGLE(floor)(half x) { return BUILTIN_FLOOR_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmaD.cl000066400000000000000000000012511415221260100202370ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(fma)(double a, double b, double c) { return BUILTIN_FMA_F64(a, b, c); } #define GEN(LN,UN) \ CONSTATTR double \ MATH_MANGLE(LN)(double a, double b, double c) \ { \ return BUILTIN_##UN##_F64(a, b, c); \ } // GEN(fma_rte,FMA_RTE) // GEN(fma_rtn,FMA_RTN) // GEN(fma_rtp,FMA_RTP) // GEN(fma_rtz,FMA_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmaF.cl000066400000000000000000000014151415221260100202430ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float2 MATH_MANGLE2(fma)(float2 a, float2 b, float2 c) { return BUILTIN_FMA_2F32(a, b, c); } CONSTATTR float MATH_MANGLE(fma)(float a, float b, float c) { return BUILTIN_FMA_F32(a, b, c); } #define GEN(LN,UN) \ CONSTATTR float \ MATH_MANGLE(LN)(float a, float b, float c) \ { \ return BUILTIN_##UN##_F32(a, b, c); \ } // GEN(fma_rte,FMA_RTE) // GEN(fma_rtn,FMA_RTN) // GEN(fma_rtp,FMA_RTP) // GEN(fma_rtz,FMA_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmaH.cl000066400000000000000000000014011415221260100202400ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(fma)(half2 a, half2 b, half2 c) { return BUILTIN_FMA_2F16(a, b, c); } CONSTATTR half MATH_MANGLE(fma)(half a, half b, half c) { return BUILTIN_FMA_F16(a, b, c); } #define GEN(LN,UN) \ CONSTATTR half \ MATH_MANGLE(LN)(half a, half b, half c) \ { \ return BUILTIN_##UN##_F16(a, b, c); \ } // GEN(fma_rte,FMA_RTE) // GEN(fma_rtn,FMA_RTN) // GEN(fma_rtp,FMA_RTP) // GEN(fma_rtz,FMA_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmaxD.cl000066400000000000000000000006631415221260100204350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(fmax)(double x, double y) { return BUILTIN_MAX_F64(x, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmaxF.cl000066400000000000000000000011301415221260100204250ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(fmax)(float x, float y) { float ret; if (DAZ_OPT() & !FINITE_ONLY_OPT()) { // XXX revist this later ret = BUILTIN_CMAX_F32(x, y); } else { ret = BUILTIN_MAX_F32(x, y); } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmaxH.cl000066400000000000000000000010121415221260100204260ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(fmax)(half2 x, half2 y) { return BUILTIN_MAX_2F16(x, y); } CONSTATTR half MATH_MANGLE(fmax)(half x, half y) { return BUILTIN_MAX_F16(x, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fminD.cl000066400000000000000000000006631415221260100204330ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(fmin)(double x, double y) { return BUILTIN_MIN_F64(x, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fminF.cl000066400000000000000000000011311415221260100204240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(fmin)(float x, float y) { float ret; if (DAZ_OPT() & !FINITE_ONLY_OPT()) { // XXX revisit this later ret = BUILTIN_CMIN_F32(x, y); } else { ret = BUILTIN_MIN_F32(x, y); } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fminH.cl000066400000000000000000000010121415221260100204240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(fmin)(half2 x, half2 y) { return BUILTIN_MIN_2F16(x, y); } CONSTATTR half MATH_MANGLE(fmin)(half x, half y) { return BUILTIN_MIN_F16(x, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmodD.cl000066400000000000000000000005661415221260100204310ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define COMPILING_FMOD #include "remainderD_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmodF.cl000066400000000000000000000005661415221260100204330ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define COMPILING_FMOD #include "remainderF_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmodH.cl000066400000000000000000000006401415221260100204260ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR BGEN(fmod) #define COMPILING_FMOD #include "remainderH_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmuladdD.cl000066400000000000000000000007251415221260100211150ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(fmuladd)(double a, double b, double c) { #pragma OPENCL FP_CONTRACT ON return a * b + c; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmuladdF.cl000066400000000000000000000011241415221260100211110ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float2 MATH_MANGLE2(fmuladd)(float2 a, float2 b, float2 c) { #pragma OPENCL FP_CONTRACT ON return a * b + c; } CONSTATTR float MATH_MANGLE(fmuladd)(float a, float b, float c) { #pragma OPENCL FP_CONTRACT ON return a * b + c; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fmuladdH.cl000066400000000000000000000011151415221260100211130ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(fmuladd)(half2 a, half2 b, half2 c) { #pragma OPENCL FP_CONTRACT ON return a * b + c; } CONSTATTR half MATH_MANGLE(fmuladd)(half a, half b, half c) { #pragma OPENCL FP_CONTRACT ON return a * b + c; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fpclassifyD.cl000066400000000000000000000012611415221260100216400ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR int MATH_MANGLE(fpclassify)(double x) { int ret = BUILTIN_ISINF_F64(x) ? FP_INFINITE : FP_NAN; ret = BUILTIN_CLASS_F64(x, CLASS_PZER|CLASS_NZER) ? FP_ZERO : ret; ret = BUILTIN_CLASS_F64(x, CLASS_PSUB|CLASS_NSUB) ? FP_SUBNORMAL : ret; ret = BUILTIN_CLASS_F64(x, CLASS_PNOR|CLASS_NNOR) ? 
FP_NORMAL : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fpclassifyF.cl000066400000000000000000000012601415221260100216410ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR int MATH_MANGLE(fpclassify)(float x) { int ret = BUILTIN_ISINF_F32(x) ? FP_INFINITE : FP_NAN; ret = BUILTIN_CLASS_F32(x, CLASS_PZER|CLASS_NZER) ? FP_ZERO : ret; ret = BUILTIN_CLASS_F32(x, CLASS_PSUB|CLASS_NSUB) ? FP_SUBNORMAL : ret; ret = BUILTIN_CLASS_F32(x, CLASS_PNOR|CLASS_NNOR) ? FP_NORMAL : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fpclassifyH.cl000066400000000000000000000013041415221260100216420ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" REQUIRES_16BIT_INSTS CONSTATTR int MATH_MANGLE(fpclassify)(half x) { int ret = BUILTIN_ISINF_F16(x) ? FP_INFINITE : FP_NAN; ret = BUILTIN_CLASS_F16(x, CLASS_PZER|CLASS_NZER) ? FP_ZERO : ret; ret = BUILTIN_CLASS_F16(x, CLASS_PSUB|CLASS_NSUB) ? FP_SUBNORMAL : ret; ret = BUILTIN_CLASS_F16(x, CLASS_PNOR|CLASS_NNOR) ? FP_NORMAL : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fractD.cl000066400000000000000000000014341415221260100205760ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" double MATH_MANGLE(fract)(double x, __private double *ip) { double i = BUILTIN_FLOOR_F64(x); double f; if (__oclc_ISA_version < 8000) { f = BUILTIN_MIN_F64(x - i, 0x1.fffffffffffffp-1); if (!FINITE_ONLY_OPT()) { f = BUILTIN_ISNAN_F64(x) ? x : f; f = BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF) ? 0.0 : f; } } else { f = BUILTIN_FRACTION_F64(x); } *ip = i; return f; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fractF.cl000066400000000000000000000013731415221260100206020ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" float MATH_MANGLE(fract)(float x, __private float *ip) { float i = BUILTIN_FLOOR_F32(x); float f; if (__oclc_ISA_version < 8000) { f = BUILTIN_MIN_F32(x - i, 0x1.fffffep-1f); if (!FINITE_ONLY_OPT()) { f = BUILTIN_ISNAN_F32(x) ? x : f; f = BUILTIN_ISINF_F32(x) ? 0.0f : f; } } else { f = BUILTIN_FRACTION_F32(x); } *ip = i; return f; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/fractH.cl000066400000000000000000000012471415221260100206040ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" REQUIRES_16BIT_INSTS half2 MATH_MANGLE2(fract)(half2 x, __private half2 *ip) { *ip = BUILTIN_FLOOR_2F16(x); return (half2)(BUILTIN_FRACTION_F16(x.lo), BUILTIN_FRACTION_F16(x.hi)); } REQUIRES_16BIT_INSTS half MATH_MANGLE(fract)(half x, __private half *ip) { *ip = BUILTIN_FLOOR_F16(x); return BUILTIN_FRACTION_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/frexpD.cl000066400000000000000000000011341415221260100206200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" double MATH_MANGLE(frexp)(double x, __private int *ep) { int e = BUILTIN_FREXP_EXP_F64(x); double r = BUILTIN_FREXP_MANT_F64(x); bool c = BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF|CLASS_SNAN|CLASS_QNAN); *ep = c ? 0 : e; return c ? x : r; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/frexpF.cl000066400000000000000000000011311415221260100206170ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" float MATH_MANGLE(frexp)(float x, __private int *ep) { int e = BUILTIN_FREXP_EXP_F32(x); float r = BUILTIN_FREXP_MANT_F32(x); bool c = BUILTIN_CLASS_F32(x, CLASS_PINF|CLASS_NINF|CLASS_SNAN|CLASS_QNAN); *ep = c ? 0 : e; return c ? 
x : r; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/frexpH.cl000066400000000000000000000015401415221260100206250ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" REQUIRES_16BIT_INSTS half2 MATH_MANGLE2(frexp)(half2 x, __private int2 *ep) { int elo, ehi; half2 r; r.lo = MATH_MANGLE(frexp)(x.lo, &elo); r.hi = MATH_MANGLE(frexp)(x.hi, &ehi); *ep = (int2)(elo, ehi); return r; } REQUIRES_16BIT_INSTS half MATH_MANGLE(frexp)(half x, __private int *ep) { int e = (int)BUILTIN_FREXP_EXP_F16(x); half r = BUILTIN_FREXP_MANT_F16(x); bool c = BUILTIN_CLASS_F16(x, CLASS_PINF|CLASS_NINF|CLASS_SNAN|CLASS_QNAN); *ep = c ? 0 : e; return c ? x : r; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/hypotD.cl000066400000000000000000000017141415221260100206430ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(hypot)(double x, double y) { double a = BUILTIN_ABS_F64(x); double b = BUILTIN_ABS_F64(y); double t = BUILTIN_MAX_F64(a, b); int e = BUILTIN_FREXP_EXP_F64(t); a = BUILTIN_FLDEXP_F64(a, -e); b = BUILTIN_FLDEXP_F64(b, -e); double ret = BUILTIN_FLDEXP_F64(MATH_FAST_SQRT(MATH_MAD(a, a, b*b)), e); if (!FINITE_ONLY_OPT()) { ret = (BUILTIN_ISNAN_F64(x) | BUILTIN_ISNAN_F64(y)) ? AS_DOUBLE(QNANBITPATT_DP64) : ret; ret = (BUILTIN_ISINF_F64(x) | BUILTIN_ISINF_F64(y)) ? 
AS_DOUBLE(PINFBITPATT_DP64) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/hypotF.cl000066400000000000000000000014741415221260100206500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(hypot)(float x, float y) { float a = BUILTIN_ABS_F32(x); float b = BUILTIN_ABS_F32(y); float t = BUILTIN_MAX_F32(a, b); int e = BUILTIN_FREXP_EXP_F32(t) ; a = BUILTIN_FLDEXP_F32(a, -e); b = BUILTIN_FLDEXP_F32(b, -e); float ret = BUILTIN_FLDEXP_F32(MATH_FAST_SQRT(MATH_MAD(a, a, b*b)), e); if (!FINITE_ONLY_OPT()) { ret = BUILTIN_ISINF_F32(t) ? AS_FLOAT(PINFBITPATT_SP32) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/hypotH.cl000066400000000000000000000013331415221260100206440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR BGEN(hypot) CONSTATTR half MATH_MANGLE(hypot)(half x, half y) { float fx = (float)x; float fy = (float)y; float d2 = BUILTIN_MAD_F32(fx, fx, fy*fy); half ret = (half)BUILTIN_SQRT_F32(d2); if (!FINITE_ONLY_OPT()) { ret = (BUILTIN_ISINF_F16(x) | BUILTIN_ISINF_F16(y)) ? 
AS_HALF((ushort)PINFBITPATT_HP16) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/i0D.cl000066400000000000000000000051051415221260100200060ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" double MATH_MANGLE(i0)(double x) { x = BUILTIN_ABS_F64(x); double ret; if (x < 8.0) { double t = 0.25 * x * x; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.dd78750ff79b2p-97, 0x1.4394559531e65p-89), 0x1.6f7123f151c79p-81), 0x1.3d9e7c5528048p-73), 0x1.e736f323a0cabp-66), 0x1.4196ce3b298c5p-58), 0x1.69caac7bf9255p-51), 0x1.5601878c06ac8p-44), 0x1.0b313291f5e48p-37), 0x1.522a43f5dcb54p-31), 0x1.522a43f659634p-25), 0x1.02e85c0898945p-19), 0x1.23456789abcf3p-14), 0x1.c71c71c71c71cp-10), 0x1.c71c71c71c71cp-6), 0x1.0000000000000p-2), 0x1.0000000000000p+0), ret = MATH_MAD(t, ret, 1.0f); } else { double t = MATH_RCP(x); ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.cc967bacb549dp+49, -0x1.5ba7722975981p+50), 0x1.df0f836763276p+49), -0x1.9042a430f3f43p+48), 0x1.c630541c4f568p+46), -0x1.7366be5a9784fp+44), 0x1.c5669a48f574ep+41), -0x1.a664cac47f0eap+38), 0x1.308250566988cp+35), -0x1.56874c2ddb061p+31), 0x1.2da58968da2aap+27), -0x1.9faaa33f0d6bcp+22), 0x1.be0a8f2bc76ddp+17), -0x1.7123c68c3cb02p+12), 0x1.d402150cc72aap+6), -0x1.7a8ae85359520p+0), 0x1.bd7e0b6a753cdp-4), 0x1.6d6ce3774506dp-5), 0x1.debdd3d2f7cf9p-6), 
0x1.cb94db8d452d5p-6), 0x1.9884533daea3dp-5), 0x1.9884533d4362fp-2); double xs = x - 709.0; double e1 = MATH_MANGLE(exp)(x > 709.0 ? xs : x); double e2 = x > 709.0 ? 0x1.d422d2be5dc9bp+1022 : 1.0; ret = e1 * MATH_MANGLE(rsqrt)(x) * ret * e2; } if (!FINITE_ONLY_OPT()) { ret = BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_QNAN|CLASS_SNAN) ? x : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/i0F.cl000066400000000000000000000027451415221260100200170ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" float MATH_MANGLE(i0)(float x) { x = BUILTIN_ABS_F32(x); float ret; if (x < 8.0f) { float t = 0.25f * x * x; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.38d760p-43f, 0x1.7fd5c6p-38f), 0x1.66ffc8p-31f), 0x1.4ecb6ep-25f), 0x1.033c70p-19f), 0x1.233bb2p-14f), 0x1.c71db2p-10f), 0x1.c71c5ep-6f), 0x1.000000p-2f), 0x1.000000p+0f); ret = MATH_MAD(t, ret, 1.0f); } else { float t = MATH_FAST_RCP(x); ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.c49916p-2f, -0x1.110f5ep-5f), 0x1.2a130ap-5f), 0x1.c68702p-6f), 0x1.9890aep-5f), 0x1.988450p-2f); float xs = x - 88.0f; float e1 = MATH_MANGLE(exp)(x > 88.0f ? xs : x); float e2 = x > 88.0f ? 0x1.f1056ep+126f : 1.0f; ret = e1 * BUILTIN_RSQRT_F32(x) * ret * e2; } if (!FINITE_ONLY_OPT()) { ret = BUILTIN_CLASS_F32(x, CLASS_PINF|CLASS_QNAN|CLASS_SNAN) ? 
x : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/i0H.cl000066400000000000000000000006571415221260100200210ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" UGEN(i0) half MATH_MANGLE(i0)(half x) { return (half)MATH_UPMANGLE(i0)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/i1D.cl000066400000000000000000000050771415221260100200170ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" double MATH_MANGLE(i1)(double x) { double a = BUILTIN_ABS_F64(x); double ret; if (a < 8.0) { a *= 0.5; double t = a * a; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.fc892c836e80ap-93, 0x1.432352d94a857p-85), 0x1.588ae4f7b7a4ap-77), 0x1.15e96e9231b49p-69), 0x1.8bdcb5f2184d1p-62), 0x1.e26237a1e02fep-55), 0x1.f176aca1a831fp-48), 0x1.ab81e97c83e75p-41), 0x1.2c9758e3649ffp-34), 0x1.522a43f5ed306p-28), 0x1.27e4fb778d591p-22), 0x1.845c8a0ce4edap-17), 0x1.6c16c16c16c26p-12), 0x1.c71c71c71c71cp-8), 0x1.5555555555555p-4), 0x1.0000000000000p-1); ret = MATH_MAD(t, a*ret, a); } else { double t = MATH_RCP(a); ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 
-0x1.c9d8d43214423p+49, 0x1.5c072e12fb4bap+50), -0x1.e26cff438b6f6p+49), 0x1.952224c61a221p+48), -0x1.cdc7c873cf435p+46), 0x1.7b1e32a15fb86p+44), -0x1.d07dbd6696f1cp+41), 0x1.b227934f2ced2p+38), -0x1.39f23e6685444p+35), 0x1.6229383f6f890p+31), -0x1.38bf1ceeee865p+27), 0x1.b01a348b749b8p+22), -0x1.d0e043ef0916ap+17), 0x1.81b06f82cfbacp+12), -0x1.ea879b2a6508bp+6), 0x1.85cffc8d54f52p+0), -0x1.09f107ee0f7e2p-3), -0x1.d61631539fb0dp-5), -0x1.4f1e01d904ebap-5), -0x1.7efc0ced79c58p-5), -0x1.32633e6e0f07ap-3), 0x1.9884533d43674p-2); double xs = x - 709.0; double e1 = MATH_MANGLE(exp)(x > 709.0 ? xs : x); double e2 = x > 709.0 ? 0x1.d422d2be5dc9bp+1022 : 1.0; ret = e1 * MATH_MANGLE(rsqrt)(x) * ret * e2; } if (!FINITE_ONLY_OPT()) { ret = BUILTIN_CLASS_F64(a, CLASS_PINF|CLASS_QNAN|CLASS_SNAN) ? a : ret; } return BUILTIN_COPYSIGN_F64(ret, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/i1F.cl000066400000000000000000000027451415221260100200200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" float MATH_MANGLE(i1)(float x) { float a = BUILTIN_ABS_F32(x); float ret; if (a < 8.0f) { a *= 0.5f; float t = a * a; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.882dd2p-40f, 0x1.af97f6p-35f), 0x1.66a3eap-28f), 0x1.251b32p-22f), 0x1.84cbb6p-17f), 0x1.6c0d4ap-12f), 0x1.c71d3ap-8f), 0x1.555550p-4f), 0x1.000000p-1f); ret = MATH_MAD(t, a*ret, a); } else { float t = MATH_FAST_RCP(a); ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.06de32p-1f, 0x1.043b22p-5f), -0x1.925276p-5f), -0x1.7c15c8p-5f), -0x1.3266ccp-3f), 0x1.988456p-2f); float as = a - 88.0f; float e1 = MATH_MANGLE(exp)(a > 88.0f ? as : a); float e2 = a > 88.0f ? 
0x1.f1056ep+126f : 1.0f; ret = e1 * BUILTIN_RSQRT_F32(a) * ret * e2; } if (!FINITE_ONLY_OPT()) { ret = BUILTIN_CLASS_F32(a, CLASS_PINF|CLASS_QNAN|CLASS_SNAN) ? a : ret; } return BUILTIN_COPYSIGN_F32(ret, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/i1H.cl000066400000000000000000000006571415221260100200220ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" UGEN(i1) half MATH_MANGLE(i1)(half x) { return (half)MATH_UPMANGLE(i1)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ilogbD.cl000066400000000000000000000011511415221260100205670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR int MATH_MANGLE(ilogb)(double x) { int r = BUILTIN_FREXP_EXP_F64(x) - 1; if (!FINITE_ONLY_OPT()) { r = BUILTIN_ISNAN_F64(x) ? FP_ILOGBNAN : r; r = BUILTIN_ISINF_F64(x) ? INT_MAX : r; } r = x == 0.0 ? FP_ILOGB0 : r; return r; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ilogbF.cl000066400000000000000000000011521415221260100205720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR int MATH_MANGLE(ilogb)(float x) { int r = BUILTIN_FREXP_EXP_F32(x) - 1; if (!FINITE_ONLY_OPT()) { r = BUILTIN_ISNAN_F32(x) ? 
FP_ILOGBNAN : r; r = BUILTIN_ISINF_F32(x) ? INT_MAX : r; } r = x == 0.0f ? FP_ILOGB0 : r; return r; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ilogbH.cl000066400000000000000000000014171415221260100206000ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" REQUIRES_16BIT_INSTS CONSTATTR int2 MATH_MANGLE2(ilogb)(half2 x) { return (int2)(MATH_MANGLE(ilogb)(x.lo), MATH_MANGLE(ilogb)(x.hi)); } REQUIRES_16BIT_INSTS CONSTATTR int MATH_MANGLE(ilogb)(half x) { int r = (int)BUILTIN_FREXP_EXP_F16(x) - 1; if (!FINITE_ONLY_OPT()) { r = BUILTIN_ISNAN_F16(x) ? FP_ILOGBNAN : r; r = BUILTIN_ISINF_F16(x) ? INT_MAX : r; } r = x == 0.0h ? FP_ILOGB0 : r; return r; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isfiniteD.cl000066400000000000000000000006541415221260100213140ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR int MATH_MANGLE(isfinite)(double x) { return BUILTIN_ISFINITE_F64(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isfiniteF.cl000066400000000000000000000006531415221260100213150ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR int MATH_MANGLE(isfinite)(float x) { return BUILTIN_ISFINITE_F32(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isfiniteH.cl000066400000000000000000000012271415221260100213150ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" REQUIRES_16BIT_INSTS CONSTATTR short2 MATH_MANGLE2(isfinite)(half2 x) { return (short2) (BUILTIN_ISFINITE_F16(x.lo) ? (short)-1 : (short)0, BUILTIN_ISFINITE_F16(x.hi) ? (short)-1 : (short)0); } REQUIRES_16BIT_INSTS CONSTATTR int MATH_MANGLE(isfinite)(half x) { return BUILTIN_ISFINITE_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isinfD.cl000066400000000000000000000006461415221260100206130ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR int MATH_MANGLE(isinf)(double x) { return BUILTIN_ISINF_F64(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isinfF.cl000066400000000000000000000006441415221260100206130ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR int MATH_MANGLE(isinf)(float x) { return BUILTIN_ISINF_F32(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isinfH.cl000066400000000000000000000012101415221260100206030ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" REQUIRES_16BIT_INSTS CONSTATTR short2 MATH_MANGLE2(isinf)(half2 x) { return (short2) (BUILTIN_ISINF_F16(x.lo) ? (short)-1 : (short)0, BUILTIN_ISINF_F16(x.hi) ? (short)-1 : (short)0); } REQUIRES_16BIT_INSTS CONSTATTR int MATH_MANGLE(isinf)(half x) { return BUILTIN_ISINF_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isnanD.cl000066400000000000000000000006451415221260100206120ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR int MATH_MANGLE(isnan)(double x) { return BUILTIN_ISNAN_F64(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isnanF.cl000066400000000000000000000006441415221260100206130ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR int MATH_MANGLE(isnan)(float x) { return BUILTIN_ISNAN_F32(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isnanH.cl000066400000000000000000000011361415221260100206120ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR short2 MATH_MANGLE2(isnan)(half2 x) { return (short2) (BUILTIN_ISNAN_F16(x.lo) ? (short)-1 : (short)0, BUILTIN_ISNAN_F16(x.hi) ? (short)-1 : (short)0); } CONSTATTR int MATH_MANGLE(isnan)(half x) { return BUILTIN_ISNAN_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isnormalD.cl000066400000000000000000000007001415221260100213160ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR int MATH_MANGLE(isnormal)(double x) { return BUILTIN_CLASS_F64(x, CLASS_PNOR|CLASS_NNOR); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isnormalF.cl000066400000000000000000000006761415221260100213340ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR int MATH_MANGLE(isnormal)(float x) { return BUILTIN_CLASS_F32(x, CLASS_PNOR|CLASS_NNOR); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/isnormalH.cl000066400000000000000000000013231415221260100213240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" REQUIRES_16BIT_INSTS CONSTATTR short2 MATH_MANGLE2(isnormal)(half2 x) { return (short2) (BUILTIN_CLASS_F16(x.lo, CLASS_PNOR|CLASS_NNOR) ? (short)-1 : (short)0, BUILTIN_CLASS_F16(x.hi, CLASS_PNOR|CLASS_NNOR) ? (short)-1 : (short)0); } REQUIRES_16BIT_INSTS CONSTATTR int MATH_MANGLE(isnormal)(half x) { return BUILTIN_CLASS_F16(x, CLASS_PNOR|CLASS_NNOR); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/j0D.cl000066400000000000000000000054651415221260100200200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" extern double MATH_PRIVATE(cosb)(double, int, double); extern CONSTATTR double MATH_PRIVATE(bp0)(double); extern CONSTATTR double MATH_PRIVATE(ba0)(double); double MATH_MANGLE(j0)(double x) { x = BUILTIN_ABS_F64(x); const double b0 = 1.65625; const double b1 = 3.125; const double b2 = 4.6875; const double b3 = 6.265625; const double b4 = 7.84375; const double b5 = 9.421875; const double b6 = 10.984375; const double b7 = 12.578125; double ret; if (x <= b7) { // Ty to maintain relative accuracy here USE_TABLE(double, p, M64_J0); double ch, cl; if (x <= b3) { if (x <= b0) { ch = 0.0; cl = 0.0; } else if (x <= b1) { ch = 0x1.33d152e971b40p+1; cl = -0x1.0f539d7da258ep-53; p += 1*15; } else if (x <= b2) { ch = 0x1.ea75575af6f09p+1; cl = -0x1.60155a9d1b256p-53; p += 2*15; } else { ch = 0x1.6148f5b2c2e45p+2; cl = 0x1.75054cd60a517p-54; p += 3*15; } } else { if (x <= b4) { ch = 0x1.c0ff5f3b47250p+2; cl = -0x1.b226d9d243827p-54; p += 4*15; } else if (x <= b5) { ch = 0x1.14eb56cccdecap+3; cl = -0x1.51970714c7c25p-52; p += 5*15; } else if (x <= b6) { ch = 0x1.458d0d0bdfc29p+3; cl = 0x1.02610a51562b6p-51; p += 6*15; } else { ch = 0x1.79544008272b6p+3; cl = 0x1.444fd5821d5b1p-52; p += 7*15; } } x = x - ch - cl; ret = MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, p[14], p[13]), p[12]), p[11]), p[10]), p[9]), p[8]), p[7]), p[6]), p[5]), p[4]), p[3]), p[2]), p[1]), p[0]); } else { double r = MATH_RCP(x); double r2 = r*r; double p = MATH_PRIVATE(bp0)(r2) * r; ret = 0x1.9884533d43651p-1 * MATH_FAST_SQRT(r) * MATH_PRIVATE(ba0)(r2) * MATH_PRIVATE(cosb)(x, 0, p); ret = BUILTIN_CLASS_F64(x, CLASS_PINF) ? 
0.0 : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/j0F.cl000066400000000000000000000050421415221260100200110ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" extern float MATH_PRIVATE(cosb)(float, int, float); extern CONSTATTR float MATH_PRIVATE(bp0)(float); extern CONSTATTR float MATH_PRIVATE(ba0)(float); float MATH_MANGLE(j0)(float x) { x = BUILTIN_ABS_F32(x); const float b0 = 1.65625f; const float b1 = 3.125f; const float b2 = 4.6875f; const float b3 = 6.265625f; const float b4 = 7.84375f; const float b5 = 9.421875f; const float b6 = 10.984375f; const float b7 = 12.578125f; float ret; if (x <= b7) { // Ty to maintain relative accuracy here USE_TABLE(float, p, M32_J0); float ch, cl; if (x <= b3) { if (x <= b0) { ch = 0x0.000000p+0f; cl = 0x0.000000p+0f; } else if (x <= b1) { ch = 0x1.33d152p+1f; cl = 0x1.d2e368p-24f; p += 1*9; } else if (x <= b2) { ch = 0x1.ea7558p+1f; cl = -0x1.4a121ep-24f; p += 2*9; } else { ch = 0x1.6148f6p+2f; cl = -0x1.34f46ep-24f; p += 3*9; } } else { if (x <= b4) { ch = 0x1.c0ff60p+2f; cl = -0x1.8971b6p-23f; p += 4*9; } else if (x <= b5) { ch = 0x1.14eb56p+3f; cl = 0x1.999bdap-22f; p += 5*9; } else if (x <= b6) { ch = 0x1.458d0ep+3f; cl = -0x1.e8407ap-22f; p += 6*9; } else { ch = 0x1.795440p+3f; cl = 0x1.04e56cp-26f; p += 7*9; } } x = x - ch - cl; ret = MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, p[8], p[7]), p[6]), p[5]), p[4]), p[3]), p[2]), p[1]), p[0]); } else { float r = MATH_RCP(x); float r2 = r*r; float p = MATH_PRIVATE(bp0)(r2) * r; ret = 0x1.988454p-1f * BUILTIN_RSQRT_F32(x) * MATH_PRIVATE(ba0)(r2) * MATH_PRIVATE(cosb)(x, 0, p); ret = BUILTIN_CLASS_F32(x, CLASS_PINF) ? 
0.0f : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/j0H.cl000066400000000000000000000006571415221260100200220ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" UGEN(j0) half MATH_MANGLE(j0)(half x) { return (half)MATH_UPMANGLE(j0)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/j1D.cl000066400000000000000000000055711415221260100200170ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" extern double MATH_PRIVATE(cosb)(double, int, double); extern CONSTATTR double MATH_PRIVATE(bp1)(double); extern CONSTATTR double MATH_PRIVATE(ba1)(double); double MATH_MANGLE(j1)(double x) { const double b0 = 1.09375; const double b1 = 2.84375; const double b2 = 4.578125; const double b3 = 6.171875; const double b4 = 7.78125; const double b5 = 9.359375; const double b6 = 10.953125; const double b7 = 12.515625; double ax = BUILTIN_ABS_F64(x); double ret; if (ax <= b7) { // Ty to maintain relative accuracy here USE_TABLE(double, p, M64_J1); double ch, cl; if (ax <= b3) { if (ax <= b0) { ch = 0.0; cl = 0.0; } else if (ax <= b1) { ch = 0x1.d757d1fec8a3ap+0; cl = 0x1.616d820cfdaebp-58; p += 1*15; } else if (ax <= b2) { ch = 0x1.ea75575af6f09p+1; cl = -0x1.60155a9d1b256p-53; p += 2*15; } else { ch = 0x1.55365bc032467p+2; cl = 0x1.5c646a75d7539p-53; p += 3*15; } } else { if (ax <= b4) { ch = 0x1.c0ff5f3b47250p+2; cl = -0x1.b226d9d243827p-54; p += 4*15; } else if (ax <= b5) { ch = 
0x1.112980f0b88a1p+3; cl = -0x1.63e17ec20a31dp-53; p += 5*15; } else if (ax <= b6) { ch = 0x1.458d0d0bdfc29p+3; cl = 0x1.02610a51562b6p-51; p += 6*15; } else { ch = 0x1.76979797ee5acp+3; cl = 0x1.9a84d3a5fedc2p-51; p += 7*15; } } ax = ax - ch - cl; ret = MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, p[14], p[13]), p[12]), p[11]), p[10]), p[9]), p[8]), p[7]), p[6]), p[5]), p[4]), p[3]), p[2]), p[1]), p[0]); } else { double r = MATH_RCP(ax); double r2 = r*r; double p = MATH_PRIVATE(bp1)(r2) * r; ret = 0x1.9884533d43651p-1 * MATH_FAST_SQRT(r) * MATH_PRIVATE(ba1)(r2) * MATH_PRIVATE(cosb)(ax, 1, p); ret = BUILTIN_CLASS_F64(ax, CLASS_PINF) ? 0.0 : ret; } if (x < 0.0) ret = -ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/j1F.cl000066400000000000000000000051351415221260100200150ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" extern float MATH_PRIVATE(cosb)(float, int, float); extern CONSTATTR float MATH_PRIVATE(bp1)(float); extern CONSTATTR float MATH_PRIVATE(ba1)(float); float MATH_MANGLE(j1)(float x) { const float b0 = 1.09375f; const float b1 = 2.84375f; const float b2 = 4.578125f; const float b3 = 6.171875f; const float b4 = 7.78125f; const float b5 = 9.359375f; const float b6 = 10.953125f; const float b7 = 12.515625f; float ax = BUILTIN_ABS_F32(x); float ret; if (ax <= b7) { // Ty to maintain relative accuracy here USE_TABLE(float, p, M32_J1); float ch, cl; if (ax <= b3) { if (ax <= b0) { ch = 0.0f; cl = 0.0f; } else if (ax <= b1) { ch = 0x1.d757d2p+0f; cl = -0x1.375c60p-32f; p += 1*9; } else if (ax <= b2) { ch = 0x1.ea7558p+1f; cl = -0x1.4a121ep-24f; p += 2*9; } else { ch = 0x1.55365cp+2f; cl = -0x1.fe6dccp-25f; p += 3*9; } } else { if (ax <= b4) { ch = 0x1.c0ff60p+2f; cl = -0x1.8971b6p-23f; p += 4*9; } else if (ax <= b5) { ch = 0x1.112980p+3f; cl = 0x1.e17114p-22f; p += 5*9; } else if (ax <= b6) { ch = 0x1.458d0ep+3f; cl = -0x1.e8407ap-22f; p += 6*9; } else { ch = 0x1.769798p+3f; cl = -0x1.a04694p-23f; p += 7*9; } } ax = ax - ch - cl; ret = MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, p[8], p[7]), p[6]), p[5]), p[4]), p[3]), p[2]), p[1]), p[0]); } else { float r = MATH_RCP(ax); float r2 = r*r; float p = MATH_PRIVATE(bp1)(r2) * r; ret = 0x1.988454p-1f * BUILTIN_RSQRT_F32(ax) * MATH_PRIVATE(ba1)(r2) * MATH_PRIVATE(cosb)(ax, 1, p); ret = BUILTIN_CLASS_F32(ax, CLASS_PINF) ? 
0.0f : ret; } if (x < 0.0f) ret = -ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/j1H.cl000066400000000000000000000006571415221260100200230ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" UGEN(j1) half MATH_MANGLE(j1)(half x) { return (half)MATH_UPMANGLE(j1)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ldexpD.cl000066400000000000000000000006641415221260100206170ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(ldexp)(double x, int n) { return BUILTIN_FLDEXP_F64(x, n); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ldexpF.cl000066400000000000000000000006621415221260100206170ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(ldexp)(float x, int n) { return BUILTIN_FLDEXP_F32(x, n); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ldexpH.cl000066400000000000000000000012171415221260100206160ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/

#include "mathH.h"

// half2 ldexp: applies the scalar half ldexp to the .lo and .hi lanes
// independently and packs the two results.
REQUIRES_16BIT_INSTS CONSTATTR half2
MATH_MANGLE2(ldexp)(half2 x, int2 n)
{
    half lane_lo = MATH_MANGLE(ldexp)(x.lo, n.lo);
    half lane_hi = MATH_MANGLE(ldexp)(x.hi, n.hi);
    return (half2)(lane_lo, lane_hi);
}

// Scalar half ldexp: the exponent is clamped to [SHRT_MIN, SHRT_MAX] before
// being handed to BUILTIN_FLDEXP_F16.
REQUIRES_16BIT_INSTS CONSTATTR half
MATH_MANGLE(ldexp)(half x, int n)
{
    int clamped = BUILTIN_CLAMP_S32(n, SHRT_MIN, SHRT_MAX);
    return BUILTIN_FLDEXP_F16(x, clamped);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/len3D.cl000066400000000000000000000026141415221260100203410ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// Double-precision length of the 3-vector (x, y, z): sqrt(x^2 + y^2 + z^2).
//
// The magnitudes are ordered with a min/max network, scaled down by the
// exponent of the largest one before squaring, and the square root is scaled
// back up afterwards (presumably to keep the squares in range).
// Unless FINITE_ONLY_OPT() is set, a NaN input yields a quiet NaN and any
// infinite input yields +Inf; the infinity check is applied last, so it wins.
CONSTATTR double
MATH_MANGLE(len3)(double x, double y, double z)
{
    double mx = BUILTIN_ABS_F64(x);
    double my = BUILTIN_ABS_F64(y);
    double mz = BUILTIN_ABS_F64(z);

    // Order the three magnitudes: vmax, vmid, vmin.
    double hi_xy = BUILTIN_MAX_F64(mx, my);
    double lo_xy = BUILTIN_MIN_F64(mx, my);
    double vmax  = BUILTIN_MAX_F64(hi_xy, mz);
    double rest  = BUILTIN_MIN_F64(hi_xy, mz);
    double vmid  = BUILTIN_MAX_F64(lo_xy, rest);
    double vmin  = BUILTIN_MIN_F64(lo_xy, rest);

    // Scale everything by 2^-e, where e is the exponent of the largest value.
    int e = BUILTIN_FREXP_EXP_F64(vmax);
    int down = -e;
    vmax = BUILTIN_FLDEXP_F64(vmax, down);
    vmid = BUILTIN_FLDEXP_F64(vmid, down);
    vmin = BUILTIN_FLDEXP_F64(vmin, down);

    double tail  = MATH_MAD(vmid, vmid, vmin*vmin);
    double sumsq = MATH_MAD(vmax, vmax, tail);
    double ret   = BUILTIN_FLDEXP_F64(MATH_FAST_SQRT(sumsq), e);

    if (!FINITE_ONLY_OPT()) {
        bool any_nan = BUILTIN_ISNAN_F64(x) |
                       BUILTIN_ISNAN_F64(y) |
                       BUILTIN_ISNAN_F64(z);
        bool any_inf = BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF) |
                       BUILTIN_CLASS_F64(y, CLASS_PINF|CLASS_NINF) |
                       BUILTIN_CLASS_F64(z, CLASS_PINF|CLASS_NINF);

        ret = any_nan ? AS_DOUBLE(QNANBITPATT_DP64) : ret;
        ret = any_inf ? AS_DOUBLE(PINFBITPATT_DP64) : ret;
    }

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/len3F.cl000066400000000000000000000025121415221260100203400ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// Single-precision length of the 3-vector (x, y, z): sqrt(x^2 + y^2 + z^2).
//
// The absolute values are ordered by comparing their bit patterns as unsigned
// integers, scaled down by the exponent of the largest one before squaring,
// and the square root is scaled back up afterwards.
// Unless FINITE_ONLY_OPT() is set, any infinite input forces +Inf.
CONSTATTR float
MATH_MANGLE(len3)(float x, float y, float z)
{
    // Bit patterns of |x|, |y|, |z|.
    uint ux = AS_UINT(BUILTIN_ABS_F32(x));
    uint uy = AS_UINT(BUILTIN_ABS_F32(y));
    uint uz = AS_UINT(BUILTIN_ABS_F32(z));

    // Order the three patterns: umax, umid, umin.
    uint hi_xy = BUILTIN_MAX_U32(ux, uy);
    uint lo_xy = BUILTIN_MIN_U32(ux, uy);
    uint umax  = BUILTIN_MAX_U32(hi_xy, uz);
    uint rest  = BUILTIN_MIN_U32(hi_xy, uz);
    uint umid  = BUILTIN_MAX_U32(lo_xy, rest);
    uint umin  = BUILTIN_MIN_U32(lo_xy, rest);

    float vmax = AS_FLOAT(umax);
    float vmid = AS_FLOAT(umid);
    float vmin = AS_FLOAT(umin);

    // Scale everything by 2^-e, where e is the exponent of the largest value.
    int e = BUILTIN_FREXP_EXP_F32(vmax);
    int down = -e;
    vmax = BUILTIN_FLDEXP_F32(vmax, down);
    vmid = BUILTIN_FLDEXP_F32(vmid, down);
    vmin = BUILTIN_FLDEXP_F32(vmin, down);

    float tail  = MATH_MAD(vmid, vmid, vmin*vmin);
    float sumsq = MATH_MAD(vmax, vmax, tail);
    float ret   = BUILTIN_FLDEXP_F32(MATH_FAST_SQRT(sumsq), e);

    if (!FINITE_ONLY_OPT()) {
        bool any_inf = BUILTIN_ISINF_F32(x) |
                       BUILTIN_ISINF_F32(y) |
                       BUILTIN_ISINF_F32(z);
        ret = any_inf ? AS_FLOAT(PINFBITPATT_SP32) : ret;
    }

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/len3H.cl000066400000000000000000000014541415221260100203460ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathH.h"

// Half-precision length of the 3-vector (x, y, z).
//
// The sum of squares and the square root are evaluated in float and the
// result is converted back to half. Unless FINITE_ONLY_OPT() is set, any
// infinite input forces +Inf.
REQUIRES_16BIT_INSTS CONSTATTR half
MATH_MANGLE(len3)(half x, half y, half z)
{
    float wx = (float)x;
    float wy = (float)y;
    float wz = (float)z;

    float tail  = MATH_MAD(wy, wy, wz*wz);
    float sumsq = MATH_MAD(wx, wx, tail);
    half ret = (half)BUILTIN_SQRT_F32(sumsq);

    if (!FINITE_ONLY_OPT()) {
        bool any_inf = BUILTIN_ISINF_F16(x) |
                       BUILTIN_ISINF_F16(y) |
                       BUILTIN_ISINF_F16(z);
        ret = any_inf ? AS_HALF((ushort)PINFBITPATT_HP16) : ret;
    }

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/len4D.cl000066400000000000000000000031711415221260100203410ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// Double-precision length of the 4-vector (x, y, z, w):
// sqrt(x^2 + y^2 + z^2 + w^2).
//
// The magnitudes are ordered with a min/max network, scaled down by the
// exponent of the largest one before squaring, and the square root is scaled
// back up afterwards (presumably to keep the squares in range).
// Unless FINITE_ONLY_OPT() is set, a NaN input yields a quiet NaN and any
// infinite input yields +Inf; the infinity check is applied last, so it wins.
CONSTATTR double
MATH_MANGLE(len4)(double x, double y, double z, double w)
{
    double mx = BUILTIN_ABS_F64(x);
    double my = BUILTIN_ABS_F64(y);
    double mz = BUILTIN_ABS_F64(z);
    double mw = BUILTIN_ABS_F64(w);

    // First round: order within the pairs (x, y) and (z, w).
    double hi_xy = BUILTIN_MAX_F64(mx, my);
    double lo_xy = BUILTIN_MIN_F64(mx, my);
    double hi_zw = BUILTIN_MAX_F64(mz, mw);
    double lo_zw = BUILTIN_MIN_F64(mz, mw);

    // Second round: overall largest (v0) and smallest (v3).
    double v0       = BUILTIN_MAX_F64(hi_xy, hi_zw);
    double hi_loser = BUILTIN_MIN_F64(hi_xy, hi_zw);
    double lo_win   = BUILTIN_MAX_F64(lo_xy, lo_zw);
    double v3       = BUILTIN_MIN_F64(lo_xy, lo_zw);

    // Final round: order the two middle values.
    double v1 = BUILTIN_MAX_F64(lo_win, hi_loser);
    double v2 = BUILTIN_MIN_F64(lo_win, hi_loser);

    // Scale everything by 2^-e, where e is the exponent of the largest value.
    int e = BUILTIN_FREXP_EXP_F64(v0);
    int down = -e;
    v0 = BUILTIN_FLDEXP_F64(v0, down);
    v1 = BUILTIN_FLDEXP_F64(v1, down);
    v2 = BUILTIN_FLDEXP_F64(v2, down);
    v3 = BUILTIN_FLDEXP_F64(v3, down);

    double t2    = MATH_MAD(v2, v2, v3*v3);
    double t1    = MATH_MAD(v1, v1, t2);
    double sumsq = MATH_MAD(v0, v0, t1);
    double ret   = BUILTIN_FLDEXP_F64(MATH_FAST_SQRT(sumsq), e);

    if (!FINITE_ONLY_OPT()) {
        bool any_nan = BUILTIN_ISNAN_F64(x) |
                       BUILTIN_ISNAN_F64(y) |
                       BUILTIN_ISNAN_F64(z) |
                       BUILTIN_ISNAN_F64(w);
        bool any_inf = BUILTIN_ISINF_F64(x) |
                       BUILTIN_ISINF_F64(y) |
                       BUILTIN_ISINF_F64(z) |
                       BUILTIN_ISINF_F64(w);

        ret = any_nan ? AS_DOUBLE(QNANBITPATT_DP64) : ret;
        ret = any_inf ? AS_DOUBLE(PINFBITPATT_DP64) : ret;
    }

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/len4F.cl000066400000000000000000000033361415221260100203460ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// Single-precision length of the 4-vector (x, y, z, w):
// sqrt(x^2 + y^2 + z^2 + w^2).
//
// The absolute values are ordered by comparing their bit patterns as unsigned
// integers, scaled down by the exponent of the largest one before squaring,
// and the square root is scaled back up afterwards.
// Unless FINITE_ONLY_OPT() is set, any infinite input forces +Inf.
CONSTATTR float
MATH_MANGLE(len4)(float x, float y, float z, float w)
{
    // Bit patterns of |x|, |y|, |z|, |w|.
    uint ux = AS_UINT(BUILTIN_ABS_F32(x));
    uint uy = AS_UINT(BUILTIN_ABS_F32(y));
    uint uz = AS_UINT(BUILTIN_ABS_F32(z));
    uint uw = AS_UINT(BUILTIN_ABS_F32(w));

    // First round: order within the pairs (x, y) and (z, w).
    uint hi_xy = BUILTIN_MAX_U32(ux, uy);
    uint lo_xy = BUILTIN_MIN_U32(ux, uy);
    uint hi_zw = BUILTIN_MAX_U32(uz, uw);
    uint lo_zw = BUILTIN_MIN_U32(uz, uw);

    // Second round: overall largest (u0) and smallest (u3).
    uint u0       = BUILTIN_MAX_U32(hi_xy, hi_zw);
    uint hi_loser = BUILTIN_MIN_U32(hi_xy, hi_zw);
    uint lo_win   = BUILTIN_MAX_U32(lo_xy, lo_zw);
    uint u3       = BUILTIN_MIN_U32(lo_xy, lo_zw);

    // Final round: order the two middle values.
    uint u1 = BUILTIN_MAX_U32(lo_win, hi_loser);
    uint u2 = BUILTIN_MIN_U32(lo_win, hi_loser);

    float v0 = AS_FLOAT(u0);
    float v1 = AS_FLOAT(u1);
    float v2 = AS_FLOAT(u2);
    float v3 = AS_FLOAT(u3);

    // Scale everything by 2^-e, where e is the exponent of the largest value.
    int e = BUILTIN_FREXP_EXP_F32(v0);
    int down = -e;
    v0 = BUILTIN_FLDEXP_F32(v0, down);
    v1 = BUILTIN_FLDEXP_F32(v1, down);
    v2 = BUILTIN_FLDEXP_F32(v2, down);
    v3 = BUILTIN_FLDEXP_F32(v3, down);

    float t2    = MATH_MAD(v2, v2, v3*v3);
    float t1    = MATH_MAD(v1, v1, t2);
    float sumsq = MATH_MAD(v0, v0, t1);
    float ret   = BUILTIN_FLDEXP_F32(MATH_FAST_SQRT(sumsq), e);

    if (!FINITE_ONLY_OPT()) {
        bool any_inf = BUILTIN_ISINF_F32(x) |
                       BUILTIN_ISINF_F32(y) |
                       BUILTIN_ISINF_F32(z) |
                       BUILTIN_ISINF_F32(w);
        ret = any_inf ? AS_FLOAT(PINFBITPATT_SP32) : ret;
    }

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/len4H.cl000066400000000000000000000016051415221260100203450ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

// Half-precision length of the 4-vector (x, y, z, w).
//
// The sum of squares and the square root are evaluated in float and the
// result is converted back to half. Unless FINITE_ONLY_OPT() is set, any
// infinite input forces +Inf.
REQUIRES_16BIT_INSTS CONSTATTR half
MATH_MANGLE(len4)(half x, half y, half z, half w)
{
    float wx = (float)x;
    float wy = (float)y;
    float wz = (float)z;
    float ww = (float)w;

    float t2    = MATH_MAD(wz, wz, ww*ww);
    float t1    = MATH_MAD(wy, wy, t2);
    float sumsq = MATH_MAD(wx, wx, t1);
    half ret = (half)BUILTIN_SQRT_F32(sumsq);

    if (!FINITE_ONLY_OPT()) {
        bool any_inf = BUILTIN_ISINF_F16(x) |
                       BUILTIN_ISINF_F16(y) |
                       BUILTIN_ISINF_F16(z) |
                       BUILTIN_ISINF_F16(w);
        ret = any_inf ? AS_HALF((ushort)PINFBITPATT_HP16) : ret;
    }

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/lgammaD.cl000066400000000000000000000006631415221260100207400ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// Double-precision lgamma: calls lgamma_r and discards the sign it reports.
double
MATH_MANGLE(lgamma)(double x)
{
    int ignored_sign;
    double value = MATH_MANGLE(lgamma_r)(x, &ignored_sign);
    return value;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/lgammaF.cl000066400000000000000000000006611415221260100207400ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathF.h"

// Single-precision lgamma: calls lgamma_r and discards the sign it reports.
float
MATH_MANGLE(lgamma)(float x)
{
    int ignored_sign;
    float value = MATH_MANGLE(lgamma_r)(x, &ignored_sign);
    return value;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/lgammaH.cl000066400000000000000000000006751415221260100207470ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

UGEN(lgamma)

// Half-precision lgamma: calls the half lgamma_r and discards the sign it
// reports.
half
MATH_MANGLE(lgamma)(half x)
{
    int ignored_sign;
    half value = MATH_MANGLE(lgamma_r)(x, &ignored_sign);
    return value;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/lgamma_rD.cl000066400000000000000000000271411415221260100212610ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// This lgamma routine began with Sun's lgamma code from netlib.
// Their original copyright notice follows.

/* @(#)e_lgamma_r.c 1.3 95/01/18 */
/*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunSoft, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 *
 */

/* __ieee754_lgamma_r(x, signgamp)
 * Reentrant version of the logarithm of the Gamma function
 * with user-provided pointer for the sign of Gamma(x).
 *
 * Method:
 *   1. 
Argument Reduction for 0 < x <= 8
 *      Since gamma(1+s)=s*gamma(s), for x in [0,8], we may
 *      reduce x to a number in [1.5,2.5] by
 *              lgamma(1+s) = log(s) + lgamma(s)
 *      for example,
 *              lgamma(7.3) = log(6.3) + lgamma(6.3)
 *                          = log(6.3*5.3) + lgamma(5.3)
 *                          = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3)
 *   2. Polynomial approximation of lgamma around its
 *      minimum ymin=1.461632144968362245 to maintain monotonicity.
 *      On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use
 *              Let z = x-ymin;
 *              lgamma(x) = -0.1214862905358496078218 + z^2*poly(z)
 *      where
 *              poly(z) is a 14 degree polynomial.
 *   2. Rational approximation in the primary interval [2,3]
 *      We use the following approximation:
 *              s = x-2.0;
 *              lgamma(x) = 0.5*s + s*P(s)/Q(s)
 *      with accuracy
 *              |P/Q - (lgamma(x)-0.5s)| < 2**-61.71
 *      Our algorithms are based on the following observation
 *
 *              lgamma(2+s) = s*(1-Euler) + (zeta(2)-1)/2 * s^2
 *                                        - (zeta(3)-1)/3 * s^3 + ...
 *
 *      where Euler = 0.5771... is the Euler constant, which is very
 *      close to 0.5.
 *
 *   3. For x>=8, we have
 *      lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+....
 *      (better formula:
 *         lgamma(x)~(x-0.5)*(log(x)-1)-.5*(log(2pi)-1) + ...)
 *      Let z = 1/x, then we approximate
 *              f(z) = lgamma(x) - (x-0.5)(log(x)-1)
 *      by
 *              w = w0 + w1*z + w2*z^3 + w3*z^5 + ... + w6*z^11
 *      where
 *              |w - f(z)| < 2**-58.74
 *
 *   4. For negative x, since (G is gamma function)
 *              -x*G(-x)*G(x) = pi/sin(pi*x),
 *      we have
 *              G(x) = pi/(sin(pi*x)*(-x)*G(-x))
 *      since G(-x) is positive, sign(G(x)) = sign(sin(pi*x)) for x<0
 *      Hence, for x<0, signgam = sign(sin(pi*x)) and
 *              lgamma(x) = log(|Gamma(x)|)
 *                        = log(pi/(|x*sin(pi*x)|)) - lgamma(-x);
 *      Note: one should avoid computing pi*(-x) directly in the
 *            computation of sin(pi*(-x)).
 *
 *   5. 
Special Cases
 *              lgamma(2+s) ~ s*(1-Euler) for tiny s
 *              lgamma(1)=lgamma(2)=0
 *              lgamma(x) ~ -log(x) for tiny x
 *              lgamma(0) = lgamma(inf) = inf
 *              lgamma(-integer) = +-inf
 *
 */

// Pair returned by lgamma_r_impl: the function value and the sign indicator
// that lgamma_r stores through its pointer argument.
struct ret_t {
    double result;
    int signp;
};

// Double-precision core of lgamma_r.
//
// Works on ax = |x|, picking one approximation per range of ax (selected by
// comparing the high 32 bits of ax), then applies the adjustment for negative
// x and, unless FINITE_ONLY_OPT() is set, the special-case overrides.
//
// Returns the value in .result and the sign indicator in .signp:
//   +1 for x > 0, 0 for x == 0, sign of sinpi(x) for negative non-integers
//   in (-2^52, -2^-50), 0 for negative integers in that range, -1 for
//   negative x with |x| <= 2^-50, and 0 otherwise.
static struct ret_t
MATH_MANGLE(lgamma_r_impl)(double x)
{
    const double pi  =  3.14159265358979311600e+00;

    // Coefficients for the region near 2 (switch case 0 below).
    const double a0  =  7.72156649015328655494e-02;
    const double a1  =  3.22467033424113591611e-01;
    const double a2  =  6.73523010531292681824e-02;
    const double a3  =  2.05808084325167332806e-02;
    const double a4  =  7.38555086081402883957e-03;
    const double a5  =  2.89051383673415629091e-03;
    const double a6  =  1.19270763183362067845e-03;
    const double a7  =  5.10069792153511336608e-04;
    const double a8  =  2.20862790713908385557e-04;
    const double a9  =  1.08011567247583939954e-04;
    const double a10 =  2.52144565451257326939e-05;
    const double a11 =  4.48640949618915160150e-05;

    // Coefficients for the region around tc (switch case 1 below).
    const double tc  =  1.46163214496836224576e+00;
    const double tf  = -1.21486290535849611461e-01;
    const double tt  = -3.63867699703950536541e-18;
    const double t0  =  4.83836122723810047042e-01;
    const double t1  = -1.47587722994593911752e-01;
    const double t2  =  6.46249402391333854778e-02;
    const double t3  = -3.27885410759859649565e-02;
    const double t4  =  1.79706750811820387126e-02;
    const double t5  = -1.03142241298341437450e-02;
    const double t6  =  6.10053870246291332635e-03;
    const double t7  = -3.68452016781138256760e-03;
    const double t8  =  2.25964780900612472250e-03;
    const double t9  = -1.40346469989232843813e-03;
    const double t10 =  8.81081882437654011382e-04;
    const double t11 = -5.38595305356740546715e-04;
    const double t12 =  3.15632070903625950361e-04;
    const double t13 = -3.12754168375120860518e-04;
    const double t14 =  3.35529192635519073543e-04;

    // Rational approximation u/v (switch case 2 below).
    const double u0  = -7.72156649015328655494e-02;
    const double u1  =  6.32827064025093366517e-01;
    const double u2  =  1.45492250137234768737e+00;
    const double u3  =  9.77717527963372745603e-01;
    const double u4  =  2.28963728064692451092e-01;
    const double u5  =  1.33810918536787660377e-02;
    const double v1  =  2.45597793713041134822e+00;
    const double v2  =  2.12848976379893395361e+00;
    const double v3  =  7.69285150456672783825e-01;
    const double v4  =  1.04222645593369134254e-01;
    const double v5  =  3.21709242282423911810e-03;

    // Rational approximation s/r used for 2 <= ax < 8.
    const double s0  = -7.72156649015328655494e-02;
    const double s1  =  2.14982415960608852501e-01;
    const double s2  =  3.25778796408930981787e-01;
    const double s3  =  1.46350472652464452805e-01;
    const double s4  =  2.66422703033638609560e-02;
    const double s5  =  1.84028451407337715652e-03;
    const double s6  =  3.19475326584100867617e-05;
    const double r1  =  1.39200533467621045958e+00;
    const double r2  =  7.21935547567138069525e-01;
    const double r3  =  1.71933865632803078993e-01;
    const double r4  =  1.86459191715652901344e-02;
    const double r5  =  7.77942496381893596434e-04;
    const double r6  =  7.32668430744625636189e-06;

    // Asymptotic series in 1/ax used for 8 <= ax < 2^58.
    const double w0  =  4.18938533204672725052e-01;
    const double w1  =  8.33333333333329678849e-02;
    const double w2  = -2.77777777728775536470e-03;
    const double w3  =  7.93650558643019558500e-04;
    const double w4  = -5.95187557450339963135e-04;
    const double w5  =  8.36339918996282139126e-04;
    const double w6  = -1.63092934096575273989e-03;

    // Polynomial added to -log(ax) for ax < 2^-8.
    const double z1  = -0x1.2788cfc6fb619p-1;
    const double z2  =  0x1.a51a6625307d3p-1;
    const double z3  = -0x1.9a4d55beab2d7p-2;
    const double z4  =  0x1.151322ac7d848p-2;
    const double z5  = -0x1.a8b9c17aa6149p-3;

    double ax = BUILTIN_ABS_F64(x);
    uint hax = AS_UINT2(ax).hi;  // high word of |x|, used for range tests
    double ret;

    if (hax < 0x3f700000) { // ax < 0x1.0p-8
        ret = MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax,
                  MATH_MAD(ax, z5, z4), z3), z2), z1),
                  -MATH_MANGLE(log)(ax));
    } else if (hax < 0x40000000) { // ax < 2.0
        // i selects the approximation, y is its reduced argument.
        int i;
        bool c;
        double y, t;

        if (hax <= 0x3feccccc) { // |x| < 0.9 : lgamma(x) = lgamma(x+1)-log(x)
            ret = -MATH_MANGLE(log)(ax);

            y = 1.0 - ax;
            i = 0;

            c = hax < 0x3FE76944; // x < 0.7316
            t = ax - (tc - 1.0);
            y = c ? t : y;
            i = c ? 1 : i;

            c = hax < 0x3FCDA661; // x < .2316
            y = c ? ax : y;
            i = c ? 2 : i;
        } else {
            ret = 0.0;

            y = 2.0 - ax;
            i = 0;

            c = hax < 0x3FFBB4C3; // x < 1.7316
            t = ax - tc;
            y = c ? t : y;
            i = c ? 1 : i;

            c = hax < 0x3FF3B4C4; // x < 1.2316
            t = ax - 1.0;
            y = c ? t : y;
            i = c ? 2 : i;
        }

        double w, z, p, p1, p2, p3;
        switch(i) {
        case 0:
            z = y*y;
            p1 = MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z,
                     MATH_MAD(z, a10, a8), a6), a4), a2), a0);
            p2 = z * MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z,
                     MATH_MAD(z, a11, a9), a7), a5), a3), a1);
            p = MATH_MAD(y, p1, p2);
            ret += MATH_MAD(y, -0.5, p);
            break;
        case 1:
            z = y*y;
            w = z*y;
            p1 = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, t12, t9), t6), t3), t0);
            p2 = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, t13, t10), t7), t4), t1);
            p3 = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, t14, t11), t8), t5), t2);
            p = MATH_MAD(z, p1, -MATH_MAD(w, -MATH_MAD(y, p3, p2), tt));
            ret += tf + p;
            break;
        case 2:
            p1 = y * MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
                     MATH_MAD(y, u5, u4), u3), u2), u1), u0);
            p2 = MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
                     MATH_MAD(y, v5, v4), v3), v2), v1), 1.0);
            ret += MATH_MAD(y, -0.5, MATH_DIV(p1, p2));
            break;
        }
    } else if (hax < 0x40200000) { // 2 < ax < 8
        int i = (int)ax;
        double y = ax - (double)i;
        double p = y * MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
                       MATH_MAD(y, MATH_MAD(y, s6, s5), s4), s3), s2), s1), s0);
        double q = MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
                       MATH_MAD(y, MATH_MAD(y, r6, r5), r4), r3), r2), r1), 1.0);
        ret = MATH_MAD(y, 0.5, MATH_DIV(p, q));

        double y2 = y + 2.0;
        double y3 = y + 3.0;
        double y4 = y + 4.0;
        double y5 = y + 5.0;
        double y6 = y + 6.0;

        // Product (y+2)(y+3)...(y+i-1), built without branching on i.
        double z = 1.0;
        z *= i > 2 ? y2 : 1.0;
        z *= i > 3 ? y3 : 1.0;
        z *= i > 4 ? y4 : 1.0;
        z *= i > 5 ? y5 : 1.0;
        z *= i > 6 ? y6 : 1.0;

        ret += MATH_MANGLE(log)(z);
    } else if (hax < 0x43900000) { // 8 <= ax < 2^58
        double z = MATH_RCP(ax);
        double y = z*z;
        double w = MATH_MAD(z, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
                       MATH_MAD(y, MATH_MAD(y, w6, w5), w4), w3), w2), w1), w0);
        ret = MATH_MAD(ax - 0.5, MATH_MANGLE(log)(ax) - 1.0, w);
    } else { // 2^58 <= ax <= Inf
        ret = MATH_MAD(ax, MATH_MANGLE(log)(ax), -ax);
    }

    int s = 0;
    if (x >= 0.0) {
        ret = ((x == 1.0) | (x == 2.0)) ? 0.0 : ret;
        s = x == 0.0 ? 0 : 1;
    } else if (hax < 0x43300000) { // x > -0x1.0p+52
        if (hax > 0x3cd00000) { // x < -0x1.0p-50
            double t = MATH_MANGLE(sinpi)(x);
            double negadj = MATH_MANGLE(log)(MATH_DIV(pi, BUILTIN_ABS_F64(t * x)));
            ret = negadj - ret;
            bool z = BUILTIN_FRACTION_F64(x) == 0.0;  // x is a negative integer
            ret = z ? AS_DOUBLE(PINFBITPATT_DP64) : ret;
            s = t < 0.0 ? -1 : 1;
            s = z ? 0 : s;
        } else {
            s = -1;
        }
    }

    if (!FINITE_ONLY_OPT()) {
        // Handle negative integer, Inf, NaN
        ret = (BUILTIN_CLASS_F64(ax, CLASS_NZER|CLASS_PZER|CLASS_PINF) |
               ((x < 0.0) & (hax >= 0x43300000))) ?
              AS_DOUBLE(PINFBITPATT_DP64) : ret;
        ret = BUILTIN_ISNAN_F64(x) ? x : ret;
    }

    struct ret_t result;
    result.result = ret;
    result.signp = s;
    return result;
}

// Double-precision lgamma_r: returns the value computed by lgamma_r_impl and
// stores its sign indicator through signp.
double
MATH_MANGLE(lgamma_r)(double x, __private int *signp)
{
    struct ret_t ret = MATH_MANGLE(lgamma_r_impl)(x);
    *signp = ret.signp;
    return ret.result;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/lgamma_rF.cl000066400000000000000000000263631415221260100212700ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// This lgamma routine began with Sun's lgamma code from netlib.
// Their original copyright notice follows.

/*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
 * Developed at SunSoft, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 *
 */

/* Reentrant version of the logarithm of the Gamma function
 * with user-provided pointer for the sign of Gamma(x).
 *
 * Method:
 *   1. Argument Reduction for 0 < x <= 8
 *      Since gamma(1+s)=s*gamma(s), for x in [0,8], we may
 *      reduce x to a number in [1.5,2.5] by
 *              lgamma(1+s) = log(s) + lgamma(s)
 *      for example,
 *              lgamma(7.3) = log(6.3) + lgamma(6.3)
 *                          = log(6.3*5.3) + lgamma(5.3)
 *                          = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3)
 *   2. Polynomial approximation of lgamma around its
 *      minimum ymin=1.461632144968362245 to maintain monotonicity.
 *      On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use
 *              Let z = x-ymin;
 *              lgamma(x) = -0.1214862905358496078218 + z^2*poly(z)
 *      where
 *              poly(z) is a 14 degree polynomial.
 *   2. Rational approximation in the primary interval [2,3]
 *      We use the following approximation:
 *              s = x-2.0;
 *              lgamma(x) = 0.5*s + s*P(s)/Q(s)
 *      with accuracy
 *              |P/Q - (lgamma(x)-0.5s)| < 2**-61.71
 *      Our algorithms are based on the following observation
 *
 *              lgamma(2+s) = s*(1-Euler) + (zeta(2)-1)/2 * s^2
 *                                        - (zeta(3)-1)/3 * s^3 + ...
 *
 *      where Euler = 0.5771... is the Euler constant, which is very
 *      close to 0.5.
 *
 *   3. For x>=8, we have
 *      lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+....
 *      (better formula:
 *         lgamma(x)~(x-0.5)*(log(x)-1)-.5*(log(2pi)-1) + ...)
 *      Let z = 1/x, then we approximate
 *              f(z) = lgamma(x) - (x-0.5)(log(x)-1)
 *      by
 *              w = w0 + w1*z + w2*z^3 + w3*z^5 + ... + w6*z^11
 *      where
 *              |w - f(z)| < 2**-58.74
 *
 *   4. 
For negative x, since (G is gamma function)
 *              -x*G(-x)*G(x) = pi/sin(pi*x),
 *      we have
 *              G(x) = pi/(sin(pi*x)*(-x)*G(-x))
 *      since G(-x) is positive, sign(G(x)) = sign(sin(pi*x)) for x<0
 *      Hence, for x<0, signgam = sign(sin(pi*x)) and
 *              lgamma(x) = log(|Gamma(x)|)
 *                        = log(pi/(|x*sin(pi*x)|)) - lgamma(-x);
 *      Note: one should avoid computing pi*(-x) directly in the
 *            computation of sin(pi*(-x)).
 *
 *   5. Special Cases
 *              lgamma(2+s) ~ s*(1-Euler) for tiny s
 *              lgamma(1)=lgamma(2)=0
 *              lgamma(x) ~ -log(x) for tiny x
 *              lgamma(0) = lgamma(inf) = inf
 *              lgamma(-integer) = +-inf
 *
 */

// Pair returned by lgamma_r_impl: the function value and the sign indicator
// that lgamma_r stores through its pointer argument.
struct ret_t {
    float result;
    int signp;
};

// Single-precision core of lgamma_r.
//
// Works on ax = |x|, picking one approximation per range of ax, then applies
// the adjustment for negative x and, unless FINITE_ONLY_OPT() is set, the
// special-case overrides.
//
// Returns the value in .result and the sign indicator in .signp:
//   +1 for x > 0, 0 for x == 0, sign of sinpi(x) for negative non-integers
//   in (-2^23, -2^-21), 0 for negative integers in that range, -1 for
//   negative x with |x| <= 2^-21, and 0 otherwise.
static struct ret_t
MATH_MANGLE(lgamma_r_impl)(float x)
{
    const float pi  =  3.14159265358979311600e+00f;

    // Coefficients for the region near 2 (switch case 0 below).
    const float a0  =  7.72156649015328655494e-02f;
    const float a1  =  3.22467033424113591611e-01f;
    const float a2  =  6.73523010531292681824e-02f;
    const float a3  =  2.05808084325167332806e-02f;
    const float a4  =  7.38555086081402883957e-03f;
    const float a5  =  2.89051383673415629091e-03f;
    const float a6  =  1.19270763183362067845e-03f;
    const float a7  =  5.10069792153511336608e-04f;
    const float a8  =  2.20862790713908385557e-04f;
    const float a9  =  1.08011567247583939954e-04f;
    const float a10 =  2.52144565451257326939e-05f;
    const float a11 =  4.48640949618915160150e-05f;

    // Coefficients for the region around tc (switch case 1 below).
    const float tc  =  1.46163214496836224576e+00f;
    const float tf  = -1.21486290535849611461e-01f;
    const float tt  = -3.63867699703950536541e-18f;
    const float t0  =  4.83836122723810047042e-01f;
    const float t1  = -1.47587722994593911752e-01f;
    const float t2  =  6.46249402391333854778e-02f;
    const float t3  = -3.27885410759859649565e-02f;
    const float t4  =  1.79706750811820387126e-02f;
    const float t5  = -1.03142241298341437450e-02f;
    const float t6  =  6.10053870246291332635e-03f;
    const float t7  = -3.68452016781138256760e-03f;
    const float t8  =  2.25964780900612472250e-03f;
    const float t9  = -1.40346469989232843813e-03f;
    const float t10 =  8.81081882437654011382e-04f;
    const float t11 = -5.38595305356740546715e-04f;
    const float t12 =  3.15632070903625950361e-04f;
    const float t13 = -3.12754168375120860518e-04f;
    const float t14 =  3.35529192635519073543e-04f;

    // Rational approximation u/v (switch case 2 below).
    const float u0  = -7.72156649015328655494e-02f;
    const float u1  =  6.32827064025093366517e-01f;
    const float u2  =  1.45492250137234768737e+00f;
    const float u3  =  9.77717527963372745603e-01f;
    const float u4  =  2.28963728064692451092e-01f;
    const float u5  =  1.33810918536787660377e-02f;
    const float v1  =  2.45597793713041134822e+00f;
    const float v2  =  2.12848976379893395361e+00f;
    const float v3  =  7.69285150456672783825e-01f;
    const float v4  =  1.04222645593369134254e-01f;
    const float v5  =  3.21709242282423911810e-03f;

    // Rational approximation s/r used for 2 <= ax < 8.
    const float s0  = -7.72156649015328655494e-02f;
    const float s1  =  2.14982415960608852501e-01f;
    const float s2  =  3.25778796408930981787e-01f;
    const float s3  =  1.46350472652464452805e-01f;
    const float s4  =  2.66422703033638609560e-02f;
    const float s5  =  1.84028451407337715652e-03f;
    const float s6  =  3.19475326584100867617e-05f;
    const float r1  =  1.39200533467621045958e+00f;
    const float r2  =  7.21935547567138069525e-01f;
    const float r3  =  1.71933865632803078993e-01f;
    const float r4  =  1.86459191715652901344e-02f;
    const float r5  =  7.77942496381893596434e-04f;
    const float r6  =  7.32668430744625636189e-06f;

    // Asymptotic series in 1/ax used for 8 <= ax < 2^58.
    const float w0  =  4.18938533204672725052e-01f;
    const float w1  =  8.33333333333329678849e-02f;
    const float w2  = -2.77777777728775536470e-03f;
    const float w3  =  7.93650558643019558500e-04f;
    const float w4  = -5.95187557450339963135e-04f;
    const float w5  =  8.36339918996282139126e-04f;
    const float w6  = -1.63092934096575273989e-03f;

    // Polynomial added to -log(ax) for ax < 2^-6.
    const float z1  = -0x1.2788d0p-1f;
    const float z2  =  0x1.a51a66p-1f;
    const float z3  = -0x1.9a4d56p-2f;
    const float z4  =  0x1.151322p-2f;

    float ax = BUILTIN_ABS_F32(x);
    float ret;

    if (ax < 0x1.0p-6f) {
        ret = MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, MATH_MAD(ax, z4, z3), z2), z1),
                       -MATH_MANGLE(log)(ax));
    } else if (ax < 2.0f) {
        // i selects the approximation, y is its reduced argument.
        int i;
        bool c;
        float y, t;

        if (ax <= 0.9f) { // lgamma(x) = lgamma(x+1)-log(x)
            ret = -MATH_MANGLE(log)(ax);

            y = 1.0f - ax;
            i = 0;

            c = ax < 0.7316f;
            t = ax - (tc - 1.0f);
            y = c ? t : y;
            i = c ? 1 : i;

            c = ax < 0.23164f;
            y = c ? ax : y;
            i = c ? 2 : i;
        } else {
            ret = 0.0f;

            y = 2.0f - ax;
            i = 0;

            c = ax < 1.7316f;
            t = ax - tc;
            y = c ? t : y;
            // Keep the previously selected index when c is false (the
            // fallback must be the int selector i, not the float y).
            i = c ? 1 : i;

            c = ax < 1.23f;
            t = ax - 1.0f;
            y = c ? t : y;
            i = c ? 2 : i;
        }

        float z, w, p1, p2, p3, p;
        switch(i) {
        case 0:
            z = y * y;
            p1 = MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z,
                     MATH_MAD(z, a10, a8), a6), a4), a2), a0);
            p2 = z * MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z,
                     MATH_MAD(z, a11, a9), a7), a5), a3), a1);
            p = MATH_MAD(y, p1, p2);
            ret += MATH_MAD(y, -0.5f, p);
            break;
        case 1:
            z = y * y;
            w = z * y;
            p1 = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, t12, t9), t6), t3), t0);
            p2 = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, t13, t10), t7), t4), t1);
            p3 = MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, MATH_MAD(w, t14, t11), t8), t5), t2);
            p = MATH_MAD(z, p1, -MATH_MAD(w, -MATH_MAD(y, p3, p2), tt));
            ret += tf + p;
            break;
        case 2:
            p1 = y * MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
                     MATH_MAD(y, u5, u4), u3), u2), u1), u0);
            p2 = MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
                     MATH_MAD(y, v5, v4), v3), v2), v1), 1.0f);
            ret += MATH_MAD(y, -0.5f, MATH_FAST_DIV(p1, p2));
            break;
        }
    } else if (ax < 8.0f) { // 2 < |x| < 8
        int i = (int)ax;
        float y = ax - (float) i;
        float p = y * MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
                      MATH_MAD(y, MATH_MAD(y, s6, s5), s4), s3), s2), s1), s0);
        float q = MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
                      MATH_MAD(y, MATH_MAD(y, r6, r5), r4), r3), r2), r1), 1.0f);
        ret = MATH_MAD(y, 0.5f, MATH_FAST_DIV(p, q));

        float y2 = y + 2.0f;
        float y3 = y + 3.0f;
        float y4 = y + 4.0f;
        float y5 = y + 5.0f;
        float y6 = y + 6.0f;

        // Product (y+2)(y+3)...(y+i-1), built without branching on i.
        float z = 1.0f;
        z *= i > 2 ? y2 : 1.0f;
        z *= i > 3 ? y3 : 1.0f;
        z *= i > 4 ? y4 : 1.0f;
        z *= i > 5 ? y5 : 1.0f;
        z *= i > 6 ? y6 : 1.0f;

        ret += MATH_MANGLE(log)(z);
    } else if (ax < 0x1.0p+58f) { // 8 <= |x| < 2^58
        float z = MATH_FAST_RCP(ax);
        float y = z * z;
        float w = MATH_MAD(z, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
                      MATH_MAD(y, MATH_MAD(y, w6, w5), w4), w3), w2), w1), w0);
        ret = MATH_MAD(ax - 0.5f, MATH_MANGLE(log)(ax) - 1.0f, w);
    } else { // 2^58 <= |x| <= Inf
        ret = MATH_MAD(ax, MATH_MANGLE(log)(ax), -ax);
    }

    int s = 0;
    if (x >= 0.0f) {
        ret = ((x == 1.0f) | (x == 2.0f)) ? 0.0f : ret;
        s = x == 0.0f ? 0 : 1;
    } else if (ax < 0x1.0p+23f) { // x > -0x1.0p+23
        if (ax > 0x1.0p-21f) {
            float t = MATH_MANGLE(sinpi)(x);
            float negadj = MATH_MANGLE(log)(MATH_DIV(pi, BUILTIN_ABS_F32(t * x)));
            ret = negadj - ret;
            bool z = BUILTIN_FRACTION_F32(x) == 0.0f;  // x is a negative integer
            ret = z ? AS_FLOAT(PINFBITPATT_SP32) : ret;
            s = t < 0.0f ? -1 : 1;
            s = z ? 0 : s;
        } else {
            s = -1;
        }
    }

    if (!FINITE_ONLY_OPT()) {
        // +Inf for zero, infinite x, and negative x with |x| >= 2^23;
        // NaN inputs are passed through.
        ret = ((ax != 0.0f) && !BUILTIN_ISINF_F32(ax) &&
               ((x >= 0.0f) || (ax < 0x1.0p+23f))) ? ret : AS_FLOAT(PINFBITPATT_SP32);
        ret = BUILTIN_ISNAN_F32(x) ? x : ret;
    }

    struct ret_t result;
    result.result = ret;
    result.signp = s;
    return result;
}

// Single-precision lgamma_r: returns the value computed by lgamma_r_impl and
// stores its sign indicator through signp.
float
MATH_MANGLE(lgamma_r)(float x, __private int *signp)
{
    struct ret_t ret = MATH_MANGLE(lgamma_r_impl)(x);
    *signp = ret.signp;
    return ret.result;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/lgamma_rH.cl000066400000000000000000000012701415221260100212600ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathH.h"

// half2 lgamma_r: runs the scalar half lgamma_r on the .lo and .hi lanes and
// stores both sign indicators through signp as an int2.
half2
MATH_MANGLE2(lgamma_r)(half2 x, __private int2 *signp)
{
    int sign_lo, sign_hi;
    half2 packed;

    packed.lo = MATH_MANGLE(lgamma_r)(x.lo, &sign_lo);
    packed.hi = MATH_MANGLE(lgamma_r)(x.hi, &sign_hi);

    *signp = (int2)(sign_lo, sign_hi);
    return packed;
}

// Scalar half lgamma_r: converts the argument to float, calls the routine
// named by MATH_UPMANGLE(lgamma_r) with the caller's signp, and converts the
// result back to half.
half
MATH_MANGLE(lgamma_r)(half x, __private int *signp)
{
    float wide = (float)x;
    return (half)MATH_UPMANGLE(lgamma_r)(wide, signp);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/lnepD.cl000066400000000000000000000021321415221260100204310ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

#define DOUBLE_SPECIALIZATION
#include "ep.h"

// Double-precision lnep: takes a double2 value `a` (the exponent and mantissa
// are read from a.hi) and an extra integer exponent `ea`, and returns the .hi
// part of the double2 result built with the ep.h helpers.
//
// Steps visible here: pick an exponent e (lowered by one when the mantissa of
// a.hi is below 2/3), scale a by 2^-e to get m, form x = (m-1)/(m+1), evaluate
// a polynomial in s = x.hi^2, and combine (e + ea) times a two-part constant
// with 2*x + s*x.hi*p.
// NOTE(review): the two-part constant looks like ln(2) split hi/lo -- confirm.
CONSTATTR double
MATH_PRIVATE(lnep)(double2 a, int ea)
{
    int below = BUILTIN_FREXP_MANT_F64(a.hi) < (2.0/3.0);
    int e = BUILTIN_FREXP_EXP_F64(a.hi) - below;

    double2 m = ldx(a, -e);
    double2 x = div(fadd(-1.0, m), fadd(1.0, m));
    double s = x.hi * x.hi;

    // Horner evaluation in s, highest-order coefficient first.
    double p = 0x1.3ab76bf559e2bp-3;
    p = MATH_MAD(s, p, 0x1.385386b47b09ap-3);
    p = MATH_MAD(s, p, 0x1.7474dd7f4df2ep-3);
    p = MATH_MAD(s, p, 0x1.c71c016291751p-3);
    p = MATH_MAD(s, p, 0x1.249249b27acf1p-2);
    p = MATH_MAD(s, p, 0x1.99999998ef7b6p-2);
    p = MATH_MAD(s, p, 0x1.5555555555780p-1);

    double2 r = add(mul(con(0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56), (double)(e + ea)),
                    fadd(ldx(x,1), s * x.hi * p));

    return r.hi;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/lnepF.cl000066400000000000000000000015661415221260100204450ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathF.h"

#define FLOAT_SPECIALIZATION
#include "ep.h"

// Single-precision lnep: takes a float2 value `a` (the exponent and mantissa
// are read from a.hi) and an extra integer exponent `ea`, and returns the .hi
// part of the float2 result built with the ep.h helpers.
//
// Steps visible here: pick an exponent e (lowered by one when the mantissa of
// a.hi is below 2/3), scale a by 2^-e to get m, form x = (m-1)/(m+1), evaluate
// a polynomial in s = x.hi^2, and combine (e + ea) times a two-part constant
// with 2*x + s*x.hi*p.
// NOTE(review): the two-part constant looks like ln(2) split hi/lo -- confirm.
CONSTATTR float
MATH_PRIVATE(lnep)(float2 a, int ea)
{
    int below = BUILTIN_FREXP_MANT_F32(a.hi) < (2.0f/3.0f);
    int e = BUILTIN_FREXP_EXP_F32(a.hi) - below;

    float2 m = ldx(a, -e);
    float2 x = div(fadd(-1.0f, m), fadd(1.0f, m));
    float s = x.hi * x.hi;

    // Horner evaluation in s, highest-order coefficient first.
    float p = 0x1.36db58p-2f;
    p = MATH_MAD(s, p, 0x1.992b46p-2f);
    p = MATH_MAD(s, p, 0x1.5555b4p-1f);

    float2 r = add(mul(con(0x1.62e430p-1f, -0x1.05c610p-29f), (float)(e + ea)),
                   fadd(ldx(x,1), s * x.hi * p));

    return r.hi;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/log10D.cl000066400000000000000000000005611415221260100204210ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

// Selects the log10 variant of the shared double-precision log template.
#define COMPILING_LOG10
#include "logD_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/log10F.cl000066400000000000000000000005611415221260100204230ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

// Selects the log10 variant of the shared single-precision log template.
#define COMPILING_LOG10
#include "logF_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/log10H.cl000066400000000000000000000007331415221260100204260ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "mathH.h"

CONSTATTR UGEN(log10)

// Half-precision log10: evaluates log2 of the argument in float and multiplies
// by 0x1.344136p-2f (about 0.30103, i.e. log10(2)) before converting to half.
CONSTATTR half
MATH_MANGLE(log10)(half x)
{
    float lg2 = BUILTIN_LOG2_F32((float)x);
    return (half)(lg2 * 0x1.344136p-2f);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/log1pD.cl000066400000000000000000000014051415221260100205170ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea);

#define DOUBLE_SPECIALIZATION
#include "ep.h"

// Double-precision log1p: passes the double2 sum add(1.0, x) to lnep with a
// zero extra exponent.
//
// Unless FINITE_ONLY_OPT() is set, the result is then overridden, in this
// order: +Inf input returns x, x < -1 returns a quiet NaN, and x == -1
// returns -Inf.
CONSTATTR double
MATH_MANGLE(log1p)(double x)
{
    double2 one_plus_x = add(1.0, x);
    double z = MATH_PRIVATE(lnep)(one_plus_x, 0);

    if (!FINITE_ONLY_OPT()) {
        z = BUILTIN_CLASS_F64(x, CLASS_PINF) ? x : z;
        z = (x < -1.0) ? AS_DOUBLE(QNANBITPATT_DP64) : z;
        z = (x == -1.0) ? AS_DOUBLE(NINFBITPATT_DP64) : z;
    }

    return z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/log1pF.cl000066400000000000000000000014461415221260100205260ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

#define FLOAT_SPECIALIZATION
#include "ep.h"

extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea);

// Single-precision log1p: passes the float2 sum add(1.0f, x) to lnep with a
// zero extra exponent.
//
// Unless FINITE_ONLY_OPT() is set, the result is then overridden, in this
// order: +Inf input returns x, x < -1 returns a quiet NaN, and x == -1
// returns -Inf. Finally, for |x| < 2^-24 the argument itself is returned.
CONSTATTR float
MATH_MANGLE(log1p)(float x)
{
    float2 one_plus_x = add(1.0f, x);
    float z = MATH_PRIVATE(lnep)(one_plus_x, 0);

    if (!FINITE_ONLY_OPT()) {
        z = BUILTIN_CLASS_F32(x, CLASS_PINF) ? x : z;
        z = (x < -1.0f) ? AS_FLOAT(QNANBITPATT_SP32) : z;
        z = (x == -1.0f) ? AS_FLOAT(NINFBITPATT_SP32) : z;
    }

    bool tiny = BUILTIN_ABS_F32(x) < 0x1.0p-24f;
    return tiny ? x : z;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/log1pH.cl000066400000000000000000000011671415221260100205300ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

CONSTATTR UGEN(log1p)

// Half-precision log1p.
//
// The general path evaluates log2(x + 1) in float and multiplies by
// 0x1.62e430p-1f (about 0.693147, i.e. ln 2). For |x| < 2^-6 the result is
// instead the polynomial x - x^2/2 + x^3/3, evaluated in half.
CONSTATTR half
MATH_MANGLE(log1p)(half x)
{
    half via_log2 = (half)(BUILTIN_LOG2_F32((float)x + 1.0f) * 0x1.62e430p-1f);
    half series = MATH_MAD(x, x*MATH_MAD(x, 0x1.555556p-2h, -0.5h), x);

    return BUILTIN_ABS_F16(x) < 0x1.0p-6h ? series : via_log2;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/log2D.cl000066400000000000000000000005601415221260100203410ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

// Selects the log2 variant of the shared double-precision log template.
#define COMPILING_LOG2
#include "logD_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/log2F.cl000066400000000000000000000005601415221260100203430ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

// Selects the log2 variant of the shared single-precision log template.
#define COMPILING_LOG2
#include "logF_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/log2H.cl000066400000000000000000000006711415221260100203500ustar00rootroot00000000000000
/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(log2) CONSTATTR half MATH_MANGLE(log2)(half x) { return BUILTIN_LOG2_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/logD.cl000066400000000000000000000005571415221260100202650ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define COMPILING_LOG #include "logD_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/logD_base.h000066400000000000000000000033311415221260100211010ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" CONSTATTR double #if defined(COMPILING_LOG2) MATH_MANGLE(log2)(double a) #elif defined(COMPILING_LOG10) MATH_MANGLE(log10)(double a) #else MATH_MANGLE(log)(double a) #endif { double m = BUILTIN_FREXP_MANT_F64(a); int b = m < (2.0/3.0); m = BUILTIN_FLDEXP_F64(m, b); int e = BUILTIN_FREXP_EXP_F64(a) - b; double2 x = div(m - 1.0, fadd(1.0, m)); double s = x.hi * x.hi; double p = MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, 0x1.3ab76bf559e2bp-3, 0x1.385386b47b09ap-3), 0x1.7474dd7f4df2ep-3), 0x1.c71c016291751p-3), 0x1.249249b27acf1p-2), 0x1.99999998ef7b6p-2), 0x1.5555555555780p-1); double2 r = fadd(ldx(x,1), s*x.hi*p); #if defined COMPILING_LOG2 r = add((double)e, mul(con(0x1.71547652b82fep+0,0x1.777d0ffda0d24p-56), r)); #elif defined COMPILING_LOG10 r = add(mul(con(0x1.34413509f79ffp-2, -0x1.9dc1da994fd21p-59), (double)e), mul(con(0x1.bcb7b1526e50ep-2, 0x1.95355baaafad3p-57), r)); #else r = add(mul(con(0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56), (double)e), r); #endif double ret = r.hi; if (!FINITE_ONLY_OPT()) { ret = BUILTIN_ISINF_F64(a) ? a : ret; ret = a < 0.0 ? AS_DOUBLE(QNANBITPATT_DP64) : ret; ret = a == 0.0 ? AS_DOUBLE(NINFBITPATT_DP64) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/logF.cl000066400000000000000000000005571415221260100202670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define COMPILING_LOG #include "logF_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/logF_base.h000066400000000000000000000101741415221260100211060ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float #if defined COMPILING_LOG2 MATH_MANGLE(log2)(float x) #elif defined COMPILING_LOG10 MATH_MANGLE(log10)(float x) #else MATH_MANGLE(log)(float x) #endif { if (DAZ_OPT()) { if (UNSAFE_MATH_OPT()) { #if defined COMPILING_LOG2 return BUILTIN_LOG2_F32(x); #elif defined COMPILING_LOG10 return BUILTIN_LOG2_F32(x) * 0x1.344136p-2f; #else return BUILTIN_LOG2_F32(x) * 0x1.62e430p-1f; #endif } else { #if defined COMPILING_LOG2 return BUILTIN_LOG2_F32(x); #else float y = BUILTIN_LOG2_F32(x); float r; if (HAVE_FAST_FMA32()) { #if defined COMPILING_LOG10 const float c = 0x1.344134p-2f; const float cc = 0x1.09f79ep-26f; // c+cc are ln(2)/ln(10) to more than 49 bits #else const float c = 0x1.62e42ep-1f; const float cc = 0x1.efa39ep-25f; // c + cc is ln(2) to more than 49 bits #endif r = y * c; r = r + BUILTIN_FMA_F32(y, cc, BUILTIN_FMA_F32(y, c, -r)); } else { #if defined COMPILING_LOG10 const float ch = 0x1.344000p-2f; const float ct = 0x1.3509f6p-18f; // ch+ct is ln(2)/ln(10) to more than 36 bits #else const float ch = 0x1.62e000p-1f; const float ct = 0x1.0bfbe8p-15f; // ch + ct is ln(2) to more than 36 bits #endif float yh = AS_FLOAT(AS_UINT(y) & 0xfffff000); float yt = y - yh; r = MATH_MAD(yh, ch, MATH_MAD(yt, ch, MATH_MAD(yh, ct, yt*ct))); } r = BUILTIN_CLASS_F32(y, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) != 0 ? 
y : r; return r; #endif } } else { // not DAZ if (UNSAFE_MATH_OPT()) { bool s = BUILTIN_CLASS_F32(x, CLASS_NSUB|CLASS_PSUB); x *= s ? 0x1.0p+32f : 1.0f; #if defined COMPILING_LOG2 return BUILTIN_LOG2_F32(x) - (s ? 32.0f : 0.0f); #elif defined COMPILING_LOG10 return MATH_MAD(BUILTIN_LOG2_F32(x), 0x1.344136p-2f, s ? -0x1.344136p+3f : 0.0f); #else return MATH_MAD(BUILTIN_LOG2_F32(x), 0x1.62e430p-1f, s ? -0x1.62e430p+4f : 0.0f); #endif } else { bool s = BUILTIN_CLASS_F32(x, CLASS_NSUB|CLASS_PSUB); x *= s ? 0x1.0p+32f : 1.0f; #if defined COMPILING_LOG2 return BUILTIN_LOG2_F32(x) - (s ? 32.0f : 0.0f); #else float y = BUILTIN_LOG2_F32(x); float r; if (HAVE_FAST_FMA32()) { #if defined COMPILING_LOG10 const float c = 0x1.344134p-2f; const float cc = 0x1.09f79ep-26f; // c+cc are ln(2)/ln(10) to more than 49 bits #else const float c = 0x1.62e42ep-1f; const float cc = 0x1.efa39ep-25f; // c + cc is ln(2) to more than 49 bits #endif r = y * c; r = r + BUILTIN_FMA_F32(y, cc, BUILTIN_FMA_F32(y, c, -r)); } else { #if defined COMPILING_LOG10 const float ch = 0x1.344000p-2f; const float ct = 0x1.3509f6p-18f; // ch+ct is ln(2)/ln(10) to more than 36 bits #else const float ch = 0x1.62e000p-1f; const float ct = 0x1.0bfbe8p-15f; // ch + ct is ln(2) to more than 36 bits #endif float yh = AS_FLOAT(AS_UINT(y) & 0xfffff000); float yt = y - yh; r = MATH_MAD(yh, ch, MATH_MAD(yt, ch, MATH_MAD(yh, ct, yt*ct))); } r = BUILTIN_CLASS_F32(y, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) != 0 ? y : r; #if defined COMPILING_LOG10 r = r - (s ? 0x1.344136p+3f : 0.0f); #else r = r - (s ? 0x1.62e430p+4f : 0.0f); #endif // r = BUILTIN_CLASS_F32(y, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) != 0 ? 
y : r; return r; #endif } } } ROCm-Device-Libs-rocm-5.0.0/ocml/src/logH.cl000066400000000000000000000007271415221260100202700ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(log) CONSTATTR half MATH_MANGLE(log)(half x) { return (half)(BUILTIN_LOG2_F32((float)x) * 0x1.62e430p-1f); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/logbD.cl000066400000000000000000000012151415221260100204170ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(logb)(double x) { double ret = (double)(BUILTIN_FREXP_EXP_F64(x) - 1); if (!FINITE_ONLY_OPT()) { double ax = BUILTIN_ABS_F64(x); ret = BUILTIN_ISFINITE_F64(ax) ? ret : ax; ret = x == 0.0 ? AS_DOUBLE(NINFBITPATT_DP64) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/logbF.cl000066400000000000000000000012101415221260100204140ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(logb)(float x) { float ret = (float)(BUILTIN_FREXP_EXP_F32(x) - 1); if (!FINITE_ONLY_OPT()) { float ax = BUILTIN_ABS_F32(x); ret = BUILTIN_ISFINITE_F32(ax) ? ret : ax; ret = x == 0.0f ? 
AS_FLOAT(NINFBITPATT_SP32) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/logbH.cl000066400000000000000000000012731415221260100204270ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(logb) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(logb)(half x) { half ret = (half)(BUILTIN_FREXP_EXP_F16(x) - (short)1); if (!FINITE_ONLY_OPT()) { half ax = BUILTIN_ABS_F16(x); ret = BUILTIN_ISFINITE_F16(ax) ? ret : ax; ret = x == 0.0h ? AS_HALF((short)NINFBITPATT_HP16) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/madD.cl000066400000000000000000000006701415221260100202410ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(mad)(double a, double b, double c) { return MATH_MAD(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/madF.cl000066400000000000000000000010311415221260100202330ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float2 MATH_MANGLE2(mad)(float2 a, float2 b, float2 c) { return MATH_MAD2(a, b, c); } CONSTATTR float MATH_MANGLE(mad)(float a, float b, float c) { return MATH_MAD(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/madH.cl000066400000000000000000000010211415221260100202340ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(mad)(half2 a, half2 b, half2 c) { return MATH_MAD2(a, b, c); } CONSTATTR half MATH_MANGLE(mad)(half a, half b, half c) { return MATH_MAD(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/mathD.h000066400000000000000000000031621415221260100202610ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ // OCML prototypes #include "ocml.h" // Tables #include "tables.h" // Builtins #include "builtins.h" // Mangling #define MATH_MANGLE(N) OCML_MANGLE_F64(N) #define MATH_PRIVATE(N) MANGLE3(__ocmlpriv,N,f64) // Optimization Controls #include "opts.h" // Attributes #define ALIGNEDATTR(X) __attribute__((aligned(X))) #define INLINEATTR __attribute__((always_inline)) #define PUREATTR __attribute__((pure)) #define CONSTATTR __attribute__((const)) // Math controls #include "privD.h" // Bit patterns #define SIGNBIT_DP64 0x8000000000000000L #define EXSIGNBIT_DP64 0x7fffffffffffffffL #define EXPBITS_DP64 0x7ff0000000000000L #define MANTBITS_DP64 0x000fffffffffffffL #define ONEEXPBITS_DP64 0x3ff0000000000000L #define TWOEXPBITS_DP64 0x4000000000000000L #define HALFEXPBITS_DP64 0x3fe0000000000000L #define IMPBIT_DP64 0x0010000000000000L #define QNANBITPATT_DP64 0x7ff8000000000000L #define INDEFBITPATT_DP64 0xfff8000000000000L #define PINFBITPATT_DP64 0x7ff0000000000000L #define NINFBITPATT_DP64 0xfff0000000000000L #define EXPBIAS_DP64 1023 #define EXPSHIFTBITS_DP64 52 #define BIASEDEMIN_DP64 1 #define EMIN_DP64 -1022 #define BIASEDEMAX_DP64 2046 #define EMAX_DP64 1023 #define LAMBDA_DP64 1.0e300 #define MANTLENGTH_DP64 53 #define BASEDIGITS_DP64 15 ROCm-Device-Libs-rocm-5.0.0/ocml/src/mathF.h000066400000000000000000000027731415221260100202720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ // OCML prototypes #include "ocml.h" // Tables #include "tables.h" // Builtins #include "builtins.h" // Mangling #define MATH_MANGLE(N) OCML_MANGLE_F32(N) #define MATH_MANGLE2(N) OCML_MANGLE_2F32(N) #define MATH_PRIVATE(N) MANGLE3(__ocmlpriv,N,f32) // Optimization Controls #include "opts.h" // Attributes #define ALIGNEDATTR(X) __attribute__((aligned(X))) #define INLINEATTR __attribute__((always_inline)) #define PUREATTR __attribute__((pure)) #define CONSTATTR __attribute__((const)) // Math controls #include "privF.h" // Floating point patterns #define SIGNBIT_SP32 (int)0x80000000 #define EXSIGNBIT_SP32 0x7fffffff #define EXPBITS_SP32 0x7f800000 #define MANTBITS_SP32 0x007fffff #define ONEEXPBITS_SP32 0x3f800000 #define TWOEXPBITS_SP32 0x40000000 #define HALFEXPBITS_SP32 0x3f000000 #define IMPBIT_SP32 0x00800000 #define QNANBITPATT_SP32 0x7fc00000 #define PINFBITPATT_SP32 0x7f800000 #define NINFBITPATT_SP32 (int)0xff800000 #define EXPBIAS_SP32 127 #define EXPSHIFTBITS_SP32 23 #define BIASEDEMIN_SP32 1 #define EMIN_SP32 -126 #define BIASEDEMAX_SP32 254 #define EMAX_SP32 127 #define MANTLENGTH_SP32 24 #define BASEDIGITS_SP32 7 ROCm-Device-Libs-rocm-5.0.0/ocml/src/mathH.h000066400000000000000000000034511415221260100202660ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ // OCML prototypes #include "ocml.h" // Tables #include "tables.h" // Builtins #include "builtins.h" // Mangling #define MATH_MANGLE(N) OCML_MANGLE_F16(N) #define MATH_MANGLE2(N) OCML_MANGLE_2F16(N) #define MATH_PRIVATE(N) MANGLE3(__ocmlpriv,N,f16) #define MATH_UPMANGLE(N) OCML_MANGLE_F32(N) // Optimization Controls #include "opts.h" // Attributes #define ALIGNEDATTR(X) __attribute__((aligned(X))) #define INLINEATTR __attribute__((always_inline)) #define PUREATTR __attribute__((pure)) #define CONSTATTR __attribute__((const)) // Math controls #include "privH.h" // Floating point patterns #define SIGNBIT_HP16 0x8000 #define EXSIGNBIT_HP16 0x7fff #define EXPBITS_HP16 0x7c00 #define MANTBITS_HP16 0x03ff #define ONEEXPBITS_HP16 0x3c00 #define TWOEXPBITS_HP16 0x4000 #define HALFEXPBITS_HP16 0x3800 #define IMPBIT_HP16 0x0400 #define QNANBITPATT_HP16 0x7e00 #define PINFBITPATT_HP16 0x7c00 #define NINFBITPATT_HP16 0xfc00 #define EXPBIAS_HP16 15 #define EXPSHIFTBITS_HP16 10 #define BIASEDEMIN_HP16 1 #define EMIN_HP16 -14 #define BIASEDEMAX_HP16 30 #define EMAX_HP16 15 #define MANTLENGTH_HP16 11 #define BASEDIGITS_HP16 5 #define UGEN(N) \ half2 MATH_MANGLE2(N)(half2 x) \ { \ return (half2)(MATH_MANGLE(N)(x.lo), MATH_MANGLE(N)(x.hi)); \ } #define BGEN(N) \ half2 MATH_MANGLE2(N)(half2 x, half2 y) \ { \ return (half2)(MATH_MANGLE(N)(x.lo, y.lo), MATH_MANGLE(N)(x.hi, y.hi)); \ } #pragma OPENCL EXTENSION cl_khr_fp16 : enable ROCm-Device-Libs-rocm-5.0.0/ocml/src/maxD.cl000066400000000000000000000006631415221260100202670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(max)(double x, double y) { return BUILTIN_CMAX_F64(x, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/maxF.cl000066400000000000000000000006601415221260100202660ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(max)(float x, float y) { return BUILTIN_CMAX_F32(x, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/maxH.cl000066400000000000000000000010121415221260100202600ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(max)(half2 x, half2 y) { return BUILTIN_CMAX_2F16(x, y); } CONSTATTR half MATH_MANGLE(max)(half x, half y) { return BUILTIN_CMAX_F16(x, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/maxmagD.cl000066400000000000000000000011151415221260100207450ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(maxmag)(double x, double y) { double ret = BUILTIN_MAX_F64(x, y); double ax = BUILTIN_ABS_F64(x); double ay = BUILTIN_ABS_F64(y); ret = ax > ay ? x : ret; ret = ay > ax ? 
y : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/maxmagF.cl000066400000000000000000000011071415221260100207500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(maxmag)(float x, float y) { float ret = BUILTIN_MAX_F32(x, y); float ax = BUILTIN_ABS_F32(x); float ay = BUILTIN_ABS_F32(y); ret = ax > ay ? x : ret; ret = ay > ax ? y : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/maxmagH.cl000066400000000000000000000011311415221260100207470ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR BGEN(maxmag) CONSTATTR half MATH_MANGLE(maxmag)(half x, half y) { half ret = BUILTIN_MAX_F16(x, y); half ax = BUILTIN_ABS_F16(x); half ay = BUILTIN_ABS_F16(y); ret = ax > ay ? x : ret; ret = ay > ax ? y : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/minD.cl000066400000000000000000000006631415221260100202650ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(min)(double x, double y) { return BUILTIN_CMIN_F64(x, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/minF.cl000066400000000000000000000006601415221260100202640ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(min)(float x, float y) { return BUILTIN_CMIN_F32(x, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/minH.cl000066400000000000000000000010121415221260100202560ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(min)(half2 x, half2 y) { return BUILTIN_CMIN_2F16(x, y); } CONSTATTR half MATH_MANGLE(min)(half x, half y) { return BUILTIN_CMIN_F16(x, y); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/minmagD.cl000066400000000000000000000011151415221260100207430ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(minmag)(double x, double y) { double ret = BUILTIN_MIN_F64(x, y); double ax = BUILTIN_ABS_F64(x); double ay = BUILTIN_ABS_F64(y); ret = ax < ay ? x : ret; ret = ay < ax ? 
y : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/minmagF.cl000066400000000000000000000011071415221260100207460ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(minmag)(float x, float y) { float ret = BUILTIN_MIN_F32(x, y); float ax = BUILTIN_ABS_F32(x); float ay = BUILTIN_ABS_F32(y); ret = ax < ay ? x : ret; ret = ay < ax ? y : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/minmagH.cl000066400000000000000000000011311415221260100207450ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR BGEN(minmag) CONSTATTR half MATH_MANGLE(minmag)(half x, half y) { half ret = BUILTIN_MIN_F16(x, y); half ax = BUILTIN_ABS_F16(x); half ay = BUILTIN_ABS_F16(y); ret = ax < ay ? x : ret; ret = ay < ax ? y : ret; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/modfD.cl000066400000000000000000000010711415221260100204210ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" double MATH_MANGLE(modf)(double x, __private double *iptr) { double tx = BUILTIN_TRUNC_F64(x); double ret = x - tx; ret = BUILTIN_ISINF_F64(x) ? 
0.0 : ret; *iptr = tx; return BUILTIN_COPYSIGN_F64(ret, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/modfF.cl000066400000000000000000000010651415221260100204260ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" float MATH_MANGLE(modf)(float x, __private float *iptr) { float tx = BUILTIN_TRUNC_F32(x); float ret = x - tx; ret = BUILTIN_ISINF_F32(x) ? 0.0f : ret; *iptr = tx; return BUILTIN_COPYSIGN_F32(ret, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/modfH.cl000066400000000000000000000015741415221260100204350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" REQUIRES_16BIT_INSTS half2 MATH_MANGLE2(modf)(half2 x, __private half2 *iptr) { half2 tx = BUILTIN_TRUNC_2F16(x); half2 ret = x - tx; ret.lo = BUILTIN_ISINF_F16(x.lo) ? 0.0h : ret.lo; ret.hi = BUILTIN_ISINF_F16(x.hi) ? 0.0h : ret.hi; *iptr = tx; return BUILTIN_COPYSIGN_2F16(ret, x); } REQUIRES_16BIT_INSTS half MATH_MANGLE(modf)(half x, __private half *iptr) { half tx = BUILTIN_TRUNC_F16(x); half ret = x - tx; ret = BUILTIN_ISINF_F16(x) ? 0.0h : ret; *iptr = tx; return BUILTIN_COPYSIGN_F16(ret, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/mulD.cl000066400000000000000000000010621415221260100202710ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #define GEN(LN,UN) \ CONSTATTR double \ MATH_MANGLE(LN)(double x, double y) \ { \ return BUILTIN_##UN##_F64(x, y); \ } // GEN(mul_rte,MUL_RTE) // GEN(mul_rtn,MUL_RTN) // GEN(mul_rtp,MUL_RTP) // GEN(mul_rtz,MUL_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/mulF.cl000066400000000000000000000010571415221260100202770ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #define GEN(LN,UN) \ CONSTATTR float \ MATH_MANGLE(LN)(float x, float y) \ { \ return BUILTIN_##UN##_F32(x, y); \ } // GEN(mul_rte,MUL_RTE) // GEN(mul_rtn,MUL_RTN) // GEN(mul_rtp,MUL_RTP) // GEN(mul_rtz,MUL_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/mulH.cl000066400000000000000000000010541415221260100202760ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #define GEN(LN,UN) \ CONSTATTR half \ MATH_MANGLE(LN)(half x, half y) \ { \ return BUILTIN_##UN##_F16(x, y); \ } // GEN(mul_rte,MUL_RTE) // GEN(mul_rtn,MUL_RTN) // GEN(mul_rtp,MUL_RTP) // GEN(mul_rtz,MUL_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/nanD.cl000066400000000000000000000007171415221260100202560ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(nan)(ulong nancode) { return AS_DOUBLE((nancode & MANTBITS_DP64) | QNANBITPATT_DP64); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nanF.cl000066400000000000000000000007061415221260100202560ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(nan)(uint nancode) { return AS_FLOAT(QNANBITPATT_SP32 | (nancode & 0xfffff)); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nanH.cl000066400000000000000000000012001415221260100202460ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(nan)(ushort2 nancode) { ushort2 h = (ushort2)QNANBITPATT_HP16 | (nancode & (ushort2)0x01ff); return AS_HALF2(h); } CONSTATTR half MATH_MANGLE(nan)(ushort nancode) { ushort h = (ushort)QNANBITPATT_HP16 | (nancode & (ushort)0x01ff); return AS_HALF(h); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nativeD.cl000066400000000000000000000023631415221260100207670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(native_recip)(double x) { // FIXME: Should use IR fdiv with arcp set. return __builtin_amdgcn_rcp(x); } CONSTATTR double MATH_MANGLE(native_sqrt)(double x) { return __builtin_sqrt(x); } CONSTATTR double MATH_MANGLE(native_rsqrt)(double x) { return __builtin_amdgcn_rsq(x); } CONSTATTR double MATH_MANGLE(native_sin)(double x) { return __builtin_sin(x); } CONSTATTR double MATH_MANGLE(native_cos)(double x) { return __builtin_cos(x); } CONSTATTR double MATH_MANGLE(native_exp)(double x) { return __builtin_exp(x); } CONSTATTR double MATH_MANGLE(native_exp2)(double x) { return __builtin_exp2(x); } CONSTATTR double MATH_MANGLE(native_log)(double x) { return __builtin_log(x); } CONSTATTR double MATH_MANGLE(native_log2)(double x) { return __builtin_log2(x); } CONSTATTR double MATH_MANGLE(native_log10)(double x) { return __builtin_log10(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nativeF.cl000066400000000000000000000026101415221260100207640ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" // Value of log2(10) #define M_LOG2_10_F 0x1.a934f0p+1f CONSTATTR float MATH_MANGLE(native_recip)(float x) { // FIXME: Should use IR fdiv with arcp set. 
return __builtin_amdgcn_rcpf(x); } CONSTATTR float MATH_MANGLE(native_sqrt)(float x) { return __builtin_sqrtf(x); } CONSTATTR float MATH_MANGLE(native_rsqrt)(float x) { return __builtin_amdgcn_rsqf(x); } CONSTATTR float MATH_MANGLE(native_sin)(float x) { return __builtin_sinf(x); } CONSTATTR float MATH_MANGLE(native_cos)(float x) { return __builtin_cosf(x); } CONSTATTR float MATH_MANGLE(native_exp)(float x) { return __builtin_expf(x); } CONSTATTR float MATH_MANGLE(native_exp2)(float x) { return __builtin_exp2f(x); } CONSTATTR float MATH_MANGLE(native_exp10)(float x) { return __builtin_exp2f(M_LOG2_10_F * x); } CONSTATTR float MATH_MANGLE(native_log)(float x) { return __builtin_logf(x); } CONSTATTR float MATH_MANGLE(native_log2)(float x) { return __builtin_log2f(x); } CONSTATTR float MATH_MANGLE(native_log10)(float x) { return __builtin_log10f(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nativeH.cl000066400000000000000000000024161415221260100207720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(native_rcp)(half x) { // FIXME: Should use IR fdiv with arcp set. 
return __builtin_amdgcn_rcph(x); } CONSTATTR half MATH_MANGLE(native_sqrt)(half x) { return __builtin_sqrtf16(x); } REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(native_rsqrt)(half x) { return __builtin_amdgcn_rsqh(x); } CONSTATTR half MATH_MANGLE(native_sin)(half x) { return __builtin_sinf16(x); } CONSTATTR half MATH_MANGLE(native_cos)(half x) { return __builtin_cosf16(x); } CONSTATTR half MATH_MANGLE(native_exp)(half x) { return __builtin_expf16(x); } CONSTATTR half MATH_MANGLE(native_exp2)(half x) { return __builtin_exp2f16(x); } CONSTATTR half MATH_MANGLE(native_log)(half x) { return __builtin_logf16(x); } CONSTATTR half MATH_MANGLE(native_log2)(half x) { return __builtin_log2f16(x); } CONSTATTR half MATH_MANGLE(native_log10)(half x) { return __builtin_log10f16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ncdfD.cl000066400000000000000000000204671415221260100204200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #if !defined EXTRA_ACCURACY CONSTATTR double MATH_MANGLE(ncdf)(double x) { const double chi = -0x1.6a09e667f3bcdp-1; const double clo = 0x1.bdd3413b26456p-55; const double b = 0x1.34d4edce2b7d6p+5; x = BUILTIN_ABS_F64(x) > b ? BUILTIN_COPYSIGN_F64(b, x) : x; double thi = chi * x; double tlo = MATH_MAD(clo, x, MATH_MAD(chi, x, -thi)); double yhi = thi + tlo; double ylo = tlo - (yhi - thi); double r = MATH_MANGLE(erfc)(yhi); double dr = -2.0 * yhi * r; dr = x >= -1.0 ? 
0.0f : dr; r = MATH_MAD(ylo, dr, r); return 0.5 * r; } #else CONSTATTR double MATH_MANGLE(ncdf)(double x) { double ret; if (x > -0x1.5956b87528a49p-1) { if (x < 1.0) { double t = x * x; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.8cb754014e0b3p-34, 0x1.320d075b1fdefp-29), -0x1.61ab7dd43f8c3p-25), 0x1.6584e2ae1c515p-21), -0x1.3ce8d5eca373fp-17), 0x1.e42b0c16331c9p-14), -0x1.37403f689501bp-10), 0x1.46d0429761749p-7), -0x1.1058377e2ce69p-4), 0x1.9884533d43650p-2); ret = MATH_MAD(x, ret, 0.5); } else if (x < 2.5) { double t = x - 1.0; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.060edab4a19d2p-29, -0x1.53a0eb739ccefp-25), 0x1.4c8f542ea757fp-22), -0x1.1c15387d5063ap-20), 0x1.fadb9735a0803p-22), 0x1.a2bae693176d3p-18), -0x1.cd9e9b6a563dbp-21), -0x1.73fccf7f7f32cp-14), 0x1.f8d0e4a86cde5p-14), 0x1.92ac8d4045877p-11), -0x1.084ad98cd25bfp-9), -0x1.084c041e359abp-8), 0x1.4a5ee6ad39afcp-6), -0x1.c16ac04dad985p-35), -0x1.ef8e58e30ef67p-4), 0x1.ef8e58e331308p-3), 0x1.aec4bd120d37ep-1); } else if (x < 4.0) { double t = x - 2.5; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.5f0f31da8eb78p-33, -0x1.51820cdbd28e7p-32), 0x1.af16a4a50d960p-26), -0x1.b5b829c3676fep-23), 0x1.6a839ce113434p-21), -0x1.efa0b32917d76p-24), -0x1.c2eaad7a58467p-18), 0x1.2c1fa77adea62p-16), 0x1.c789d533e599bp-16), -0x1.13874be6da82dp-12), 0x1.0d3cf7e102cccp-11), 0x1.5d67fa3a182e7p-11), -0x1.84e50141ef284p-8), 0x1.f6924953c9cbbp-7), -0x1.66fac6add3b42p-6), 0x1.1f2f0557f4ab9p-6), 0x1.fcd21635036c6p-1); } else if (x < 8.2109375) { double t = x - 4.0; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 
MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.49dae5934aa9ep-37, 0x1.a0a9b27e4276cp-33), -0x1.40ae395c9950bp-32), -0x1.6d7df112c9529p-26), 0x1.f76261921be9dp-25), 0x1.a70ffb3533144p-19), -0x1.9e462dbfa92d9p-16), -0x1.5db0c27784edap-13), 0x1.3c5a964f22d79p-9), 0x1.5cadd35757947p-9), -0x1.1b11634e869afp-3), 0x1.0bf46d4a7c1dap-1); ret = ret * ret; ret = ret * ret; ret = ret * ret; ret = MATH_MAD(-ret, ret, 1.0); } else { ret = 1.0; } } else { if (x > -1.5) { double t = -1.5 - x; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.87f6d8bacfe4dp-24, -0x1.48dcea6d816e1p-23), 0x1.a32c40a47a30ep-20), 0x1.bd22f42e45845p-21), -0x1.40839ec0fb6a8p-16), 0x1.a659159d48d42p-16), 0x1.6f322a8af7fa6p-13), -0x1.2466b5cb3347ep-11), -0x1.58d37df0dc6c4p-11), 0x1.809d8fed7b759p-8), -0x1.8de0c7fed2ce4p-8), -0x1.ba1633b5691dfp-6), 0x1.8de0c823b3adcp-4), -0x1.0940856d21e73p-3), 0x1.11a46d89647efp-4); } else if (x > -2.25) { double t = -2.25 - x; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.34778becb8778p-25, -0x1.48b485e383089p-24), -0x1.bd48bc73889cap-21), 0x1.b73b6859639c8p-20), 0x1.3582af30190aap-18), -0x1.1ac5d5e34ec1bp-15), 0x1.0cc99e25a5373p-15), 0x1.14835909e7060p-12), -0x1.03e8ee71d051cp-10), 0x1.e44553637b8cap-12), 0x1.9234723301c22p-8), -0x1.601939c453937p-6), 0x1.24833bce57500p-5), -0x1.0402dfd3dc1adp-5), 0x1.90924f21d3612p-7); } else if (x > -2.75) { double t = -2.75 - x; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.b9337a6a3734cp-24, -0x1.6590be46da1cep-23), -0x1.267a1aba29190p-20), 0x1.5254da7def6c3p-18), -0x1.502fd581f8723p-19), -0x1.9d5f911317093p-15), 0x1.7a91271378f92p-13), 
-0x1.f4331ea1149bdp-14), -0x1.2654aaf562b70p-10), 0x1.378ebd4d4cb5bp-8), -0x1.45e9ccb8cbc85p-7), 0x1.99b83490879c6p-7), -0x1.29fa54c6341e5p-7), 0x1.86904349ec803p-9); } else if (x > -38.46875) { double t = MATH_RCP(x * x); if (x > -4.0) ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.088bebb0c7bfcp+25, -0x1.964e1d51045b9p+25), 0x1.255cf223ca4ddp+25), -0x1.093e30bdaaf0ap+24), 0x1.51dabf56ccafap+22), -0x1.440d8ce218330p+20), 0x1.eaab175120c83p+17), -0x1.31cd405f6ece6p+15), 0x1.4949b45c18bffp+12), -0x1.476ca2d47ed6dp+9), 0x1.4b5c83b73de92p+6), -0x1.86317d1686e59p+3), 0x1.3fab4df0327b3p+1), -0x1.fffc093fa2eedp-1), -0x1.3f9112da61104p-8); else ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.668af6ed742f7p+59, -0x1.e8a3ea3ebba9fp+58), 0x1.39149210574c4p+57), -0x1.f6e7aed1dc814p+54), 0x1.1d2c1545c3a31p+52), -0x1.e8eb69ce384f2p+48), 0x1.4c8445a6d688bp+45), -0x1.7638c79bb1508p+41), 0x1.6c05288dd5cfbp+37), -0x1.41fe50b8d5f0fp+33), 0x1.12af999e7acfap+29), -0x1.e02f34f68433ep+24), 0x1.c4864e8ef2105p+20), -0x1.dc7852ceec4e8p+16), 0x1.1f83f2164bb6fp+13), -0x1.9819642b134dbp+9), 0x1.60fffe9105243p+6), -0x1.8aaaaaa42b3fdp+3), 0x1.3ffffffff70fdp+1), -0x1.fffffffffff98p-1), -0x1.3f8e4325f5a57p-8); double xh = AS_DOUBLE(AS_LONG(x) & 0xffffffff00000000L); ret = MATH_DIV(MATH_MANGLE(exp)(MATH_MAD(x - xh, -0.5*(x + xh), ret)), -x) * MATH_MANGLE(exp)(MATH_MAD(xh, -0.5*xh, -0.9140625)); } else { ret = BUILTIN_ISNAN_F64(x) ? 
x : 0.0; } } return ret; } #endif ROCm-Device-Libs-rocm-5.0.0/ocml/src/ncdfF.cl000066400000000000000000000113771415221260100204220ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #if !defined EXTRA_ACCURACY CONSTATTR float MATH_MANGLE(ncdf)(float x) { const float chi = -0x1.6a09e6p-1f; const float clo = -0x1.9fcef4p-27f; const float b = 0x1.c57228p+3f; x = BUILTIN_ABS_F32(x) > b ? BUILTIN_COPYSIGN_F32(b, x) : x; float thi = chi * x; float tlo = BUILTIN_FMA_F32(clo, x, BUILTIN_FMA_F32(chi, x, -thi)); float yhi = thi + tlo; float ylo = tlo - (yhi - thi); float r = MATH_MANGLE(erfc)(yhi); float dr = -2.0f * yhi * r; dr = x >= -1.0f ? 0.0f : dr; r = BUILTIN_FMA_F32(ylo, dr, r); return 0.5f * r; } #else CONSTATTR float MATH_MANGLE(ncdf)(float x) { float ret; // cut at -0x1.5956b8p-1f if (x > -0x1.5956b8p-1f) { if (x < 1.0f) { float t = x*x; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.20379ep-21f, -0x1.3727aep-17f), 0x1.e3af2ep-14f), -0x1.373d8cp-10f), 0x1.46d034p-7f), -0x1.105838p-4f), 0x1.988454p-2f); ret = MATH_MAD(x, ret, 0.5f); } else if (x < 2.5f) { float t = x - 1.0f; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.53eaecp-13f, 0x1.3458b4p-10f), -0x1.306adcp-9f), -0x1.01ae44p-8f), 0x1.4a7e5ep-6f), -0x1.fe4012p-17f), -0x1.ef8a62p-4f), 0x1.ef8e32p-3f), 0x1.aec4bep-1f); } else if (x < 4.0f) { float t = x - 2.5f; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.4ca664p-13f, 0x1.990fd2p-10f), -0x1.b0d706p-8f), 0x1.ffa500p-7f), -0x1.67e84cp-6f), 0x1.1f419cp-6f), 0x1.fcd214p-1f); } else if (x < 5.296875f) { float t = x - 4.0f; ret = MATH_MAD(t, 
MATH_MAD(t, MATH_MAD(t, 0x1.eae60ap-10f, 0x1.9b6438p-9f), -0x1.1b57a8p-3f), 0x1.0bf538p-1f); ret = ret * ret; ret = ret * ret; ret = ret * ret; ret = MATH_MAD(-ret, ret, 1.0f); } else { ret = 1.0f; } } else { if (x > -1.5f) { float t = -1.5f - x; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.a29ef2p-11f, -0x1.a25e42p-11f), 0x1.7eaaaap-8f), -0x1.8d95e2p-8f), -0x1.ba093ap-6f), 0x1.8de146p-4f), -0x1.094082p-3f), 0x1.11a46ep-4f); } else if (x > -2.5f) { float t = -2.5f - x; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.aef5d0p-14f, 0x1.0b8148p-11f), -0x1.232788p-12f), -0x1.1afa4cp-11f), 0x1.877322p-8f), -0x1.f65b2ep-7f), 0x1.66fd08p-6f), -0x1.1f2ef4p-6f), 0x1.96f4e6p-8f); } else if (x > -3.25f) { float t = -3.25f - x; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.8963dep-15f, -0x1.2e81a4p-17f), 0x1.7477b2p-13f), -0x1.c8841ap-11f), 0x1.1036c6p-9f), -0x1.a7e084p-9f), 0x1.b02b86p-9f), -0x1.09f390p-9f), 0x1.2e86fep-11f); } else if (x > -14.125f) { float t = MATH_FAST_RCP(x * x); if (x > -5.0f) ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.f9b114p+7f, -0x1.32f4b4p+7f), 0x1.723550p+5f), -0x1.4b98dcp+3f), 0x1.3821cep+1f), -0x1.ff6d7cp-1f), -0x1.4023a6p-8f); else ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.f31adep+10f, -0x1.030fd6p+9f), 0x1.41d2c6p+6f), -0x1.86b97ap+3f), 0x1.3fdb64p+1f), -0x1.ffff50p-1f), -0x1.3f8e6cp-8f); float xh = AS_FLOAT(AS_INT(x) & 0xffffe000); ret = MATH_FAST_DIV(MATH_MANGLE(exp)(MATH_MAD(x - xh, -0.5f*(x + xh), ret)), -x) * MATH_MANGLE(exp)(MATH_MAD(xh, -0.5f*xh, -0.9140625f)); } else { ret = BUILTIN_ISNAN_F32(x) ? 
x : 0.0f; } } return ret; } #endif ROCm-Device-Libs-rocm-5.0.0/ocml/src/ncdfH.cl000066400000000000000000000007111415221260100204120ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(ncdf) CONSTATTR half MATH_MANGLE(ncdf)(half x) { return (half)MATH_UPMANGLE(ncdf)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ncdfinvD.cl000066400000000000000000000007121415221260100211240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(ncdfinv)(double x) { return -0x1.6a09e667f3bcdp+0 * MATH_MANGLE(erfcinv)(x + x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ncdfinvF.cl000066400000000000000000000007021415221260100211250ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(ncdfinv)(float x) { return -0x1.6a09e6p+0f * MATH_MANGLE(erfcinv)(x + x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/ncdfinvH.cl000066400000000000000000000007221415221260100211310ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(ncdfinv) CONSTATTR half MATH_MANGLE(ncdfinv)(half x) { return (half)MATH_UPMANGLE(ncdfinv)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nearbyintD.cl000066400000000000000000000006541415221260100214750ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(nearbyint)(double x) { return BUILTIN_RINT_F64(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nearbyintF.cl000066400000000000000000000006521415221260100214750ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(nearbyint)(float x) { return BUILTIN_RINT_F32(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nearbyintH.cl000066400000000000000000000007771415221260100215070ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(nearbyint)(half2 x) { return BUILTIN_RINT_2F16(x); } CONSTATTR half MATH_MANGLE(nearbyint)(half x) { return BUILTIN_RINT_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nextafterD.cl000066400000000000000000000016711415221260100215020ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(nextafter)(double x, double y) { long ix = AS_LONG(x); long mx = SIGNBIT_DP64 - ix; mx = ix < 0 ? mx : ix; long iy = AS_LONG(y); long my = SIGNBIT_DP64 - iy; my = iy < 0 ? my : iy; long t = mx + (mx < my ? 1 : -1); long r = SIGNBIT_DP64 - t; r = t < 0 ? r : t; if (!FINITE_ONLY_OPT()) { r = BUILTIN_ISNAN_F64(x) ? ix : r; r = BUILTIN_ISNAN_F64(y) ? iy : r; } double ax = BUILTIN_ABS_F64(x); double ay = BUILTIN_ABS_F64(y); r = ((AS_LONG(ax)|AS_LONG(ay)) == 0L | ix == iy) ? 
iy : r; return AS_DOUBLE(r); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nextafterF.cl000066400000000000000000000016521415221260100215030ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(nextafter)(float x, float y) { int ix = AS_INT(x); int mx = SIGNBIT_SP32 - ix; mx = ix < 0 ? mx : ix; int iy = AS_INT(y); int my = SIGNBIT_SP32 - iy; my = iy < 0 ? my : iy; int t = mx + (mx < my ? 1 : -1); int r = SIGNBIT_SP32 - t; r = t < 0 ? r : t; if (!FINITE_ONLY_OPT()) { r = BUILTIN_ISNAN_F32(x) ? ix : r; r = BUILTIN_ISNAN_F32(y) ? iy : r; } float ax = BUILTIN_ABS_F32(x); float ay = BUILTIN_ABS_F32(y); r = ((AS_INT(ax) | AS_INT(ay)) == 0 | ix == iy) ? iy : r; return AS_FLOAT(r); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/nextafterH.cl000066400000000000000000000020221415221260100214750ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR BGEN(nextafter) CONSTATTR half MATH_MANGLE(nextafter)(half x, half y) { short ix = AS_SHORT(x); short mx = (short)SIGNBIT_HP16 - ix; mx = ix < (short)0 ? mx : ix; short iy = AS_SHORT(y); short my = (short)SIGNBIT_HP16 - iy; my = iy < (short)0 ? my : iy; short t = mx + (mx < my ? (short)1 : (short)-1); short r = (short)SIGNBIT_HP16 - t; r = t < (short)0 ? r : t; if (!FINITE_ONLY_OPT()) { r = BUILTIN_ISNAN_F16(x) ? ix : r; r = BUILTIN_ISNAN_F16(y) ? 
iy : r; } half ax = BUILTIN_ABS_F16(x); half ay = BUILTIN_ABS_F16(y); r = ((AS_SHORT(ax) | AS_SHORT(ay)) == (short)0 | ix == iy) ? iy : r; return AS_HALF(r); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/opts.h000066400000000000000000000012161415221260100202070ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" #define HAVE_FAST_FMA32() (__oclc_ISA_version == 7001 || __oclc_ISA_version == 8001 || __oclc_ISA_version >= 9000) #define FINITE_ONLY_OPT() __oclc_finite_only_opt #define UNSAFE_MATH_OPT() __oclc_unsafe_math_opt #define DAZ_OPT() __oclc_daz_opt #define CORRECTLY_ROUNDED_SQRT32() __oclc_correctly_rounded_sqrt32 ROCm-Device-Libs-rocm-5.0.0/ocml/src/powD.cl000066400000000000000000000005571415221260100203110ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define COMPILING_POW #include "powD_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/powD_base.h000066400000000000000000000063501415221260100211310ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" extern CONSTATTR double2 MATH_PRIVATE(epln)(double); extern CONSTATTR double MATH_PRIVATE(expep)(double2); #define DOUBLE_SPECIALIZATION #include "ep.h" static bool samesign(double x, double y) { uint xh = AS_UINT2(x).hi; uint yh = AS_UINT2(y).hi; return ((xh ^ yh) & 0x80000000U) == 0; } CONSTATTR double #if defined(COMPILING_POWR) MATH_MANGLE(powr)(double x, double y) #elif defined(COMPILING_POWN) MATH_MANGLE(pown)(double x, int ny) #elif defined(COMPILING_ROOTN) MATH_MANGLE(rootn)(double x, int ny) #else MATH_MANGLE(pow)(double x, double y) #endif { #if defined(COMPILING_POWN) double y = (double) ny; #elif defined(COMPILING_ROOTN) double2 y = rcp((double)ny); #endif double ax = BUILTIN_ABS_F64(x); double expylnx = MATH_PRIVATE(expep)(omul(y, MATH_PRIVATE(epln)(ax))); // y status: 0=not integer, 1=odd, 2=even #if defined(COMPILING_POWN) | defined(COMPILING_ROOTN) int inty = 2 - (ny & 1); #else double ay = BUILTIN_ABS_F64(y); int inty; { double tay = BUILTIN_TRUNC_F64(ay); inty = ay == tay; inty += inty & (BUILTIN_FRACTION_F64(tay*0.5) == 0.0); } #endif double ret = BUILTIN_COPYSIGN_F64(expylnx, ((inty == 1) & (x < 0.0)) ? -0.0 : 0.0); // Now all the edge cases #if defined COMPILING_POWR double iz = y < 0.0 ? AS_DOUBLE(PINFBITPATT_DP64) : 0.0; double zi = y < 0.0 ? 0.0 : AS_DOUBLE(PINFBITPATT_DP64); if (x == 0.0) ret = iz; if (BUILTIN_ISINF_F64(x)) ret = zi; if (BUILTIN_ISINF_F64(y)) ret = ax < 1.0 ? iz : zi; if (y == 0.0) ret = x == 0.0 || BUILTIN_ISINF_F64(x) ? AS_DOUBLE(QNANBITPATT_DP64) : 1.0; if (x == 1.0) ret = BUILTIN_ISINF_F64(y) ? AS_DOUBLE(QNANBITPATT_DP64) : 1.0; if (x < 0.0 || BUILTIN_ISNAN_F64(x) || BUILTIN_ISNAN_F64(y)) ret = AS_DOUBLE(QNANBITPATT_DP64); #elif defined COMPILING_POWN if (BUILTIN_ISINF_F64(ax) || x == 0.0) ret = BUILTIN_COPYSIGN_F64((x == 0.0) ^ (ny < 0) ? 0.0 : AS_DOUBLE(PINFBITPATT_DP64), inty == 1 ? 
x : 0.0); if (BUILTIN_ISNAN_F64(x)) ret = AS_DOUBLE(QNANBITPATT_DP64); if (ny == 0) ret = 1.0; #elif defined COMPILING_ROOTN if (BUILTIN_ISINF_F64(ax) || x == 0.0) ret = BUILTIN_COPYSIGN_F64((x == 0.0) ^ (ny < 0) ? 0.0 : AS_DOUBLE(PINFBITPATT_DP64), inty == 1 ? x : 0.0); if ((x < 0.0 && inty != 1) || ny == 0) ret = AS_DOUBLE(QNANBITPATT_DP64); #else if (x < 0.0 && !inty) ret = AS_DOUBLE(QNANBITPATT_DP64); if (BUILTIN_ISINF_F64(ay)) ret = ax == 1.0 ? ax : (samesign(y, ax - 1.0) ? ay : 0.0); if (BUILTIN_ISINF_F64(ax) || x == 0.0) ret = BUILTIN_COPYSIGN_F64((x == 0.0) ^ (y < 0.0) ? 0.0 : AS_DOUBLE(PINFBITPATT_DP64), inty == 1 ? x : 0.0); if (BUILTIN_ISNAN_F64(x) || BUILTIN_ISNAN_F64(y)) ret = AS_DOUBLE(QNANBITPATT_DP64); if (x == 1.0 || y == 0.0) ret = 1.0; #endif return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/powF.cl000066400000000000000000000005571415221260100203130ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define COMPILING_POW #include "powF_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/powF_base.h000066400000000000000000000075351415221260100211410ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #define FLOAT_SPECIALIZATION #include "ep.h" extern CONSTATTR float2 MATH_PRIVATE(epln)(float); extern CONSTATTR float MATH_PRIVATE(expep)(float2); static bool samesign(float x, float y) { return ((AS_UINT(x) ^ AS_UINT(y)) & 0x80000000) == 0; } CONSTATTR float #if defined(COMPILING_POWR) MATH_MANGLE(powr)(float x, float y) #elif defined(COMPILING_POWN) MATH_MANGLE(pown)(float x, int ny) #elif defined(COMPILING_ROOTN) MATH_MANGLE(rootn)(float x, int ny) #else MATH_MANGLE(pow)(float x, float y) #endif { float ax = BUILTIN_ABS_F32(x); float expylnx; if (UNSAFE_MATH_OPT()) { #if defined COMPILING_POWN float y = (float)ny; #elif defined COMPILING_ROOTN float y = MATH_FAST_RCP((float)ny); #endif if (DAZ_OPT()) { expylnx = BUILTIN_EXP2_F32(y * BUILTIN_LOG2_F32(ax)); } else { bool b = ax < 0x1.0p-126f; float ylnx = y * (BUILTIN_LOG2_F32(ax * (b ? 0x1.0p+24f : 1.0f)) - (b ? 24.0f : 0.0f)); b = ylnx < -126.0f; expylnx = BUILTIN_EXP2_F32(ylnx + (b ? 24.0f : 0.0f)) * (b ? 0x1.0p-24f : 1.0f); } } else { #if defined COMPILING_POWN || defined COMPILING_ROOTN int nyh = ny & 0xffff0000; float2 y = fadd((float)nyh, (float)(ny - nyh)); #if defined(COMPILING_ROOTN) y = rcp(y); #endif #endif expylnx = MATH_PRIVATE(expep)(omul(y, MATH_PRIVATE(epln)(ax))); } // y status: 0=not integer, 1=odd, 2=even #if defined(COMPILING_POWN) || defined(COMPILING_ROOTN) int inty = 2 - (ny & 1); #else float ay = BUILTIN_ABS_F32(y); int inty; { float tay = BUILTIN_TRUNC_F32(ay); inty = ay == tay; inty += inty & (BUILTIN_FRACTION_F32(tay*0.5f) == 0.0f); } #endif float ret = BUILTIN_COPYSIGN_F32(expylnx, ((inty == 1) & (x < 0.0f)) ? -0.0f : 0.0f); // Now all the edge cases #if defined COMPILING_POWR float iz = y < 0.0f ? AS_FLOAT(PINFBITPATT_SP32) : 0.0f; float zi = y < 0.0f ? 
0.0f : AS_FLOAT(PINFBITPATT_SP32); if (x == 0.0f) ret = iz; if (BUILTIN_ISINF_F32(x)) ret = zi; if (BUILTIN_ISINF_F32(y)) ret = ax < 1.0f ? iz : zi; if (y == 0.0f) ret = x == 0.0f || BUILTIN_ISINF_F32(x) ? AS_FLOAT(QNANBITPATT_SP32) : 1.0f; if (x == 1.0f) ret = BUILTIN_ISINF_F32(y) ? AS_FLOAT(QNANBITPATT_SP32) : 1.0f; if (x < 0.0f || BUILTIN_ISNAN_F32(x) || BUILTIN_ISNAN_F32(y)) ret = AS_FLOAT(QNANBITPATT_SP32); #elif defined COMPILING_POWN if (BUILTIN_ISINF_F32(ax) || x == 0.0f) ret = BUILTIN_COPYSIGN_F32((x == 0.0f) ^ (ny < 0) ? 0.0f : AS_FLOAT(PINFBITPATT_SP32), inty == 1 ? x : 0.0f); if (BUILTIN_ISNAN_F32(x)) ret = AS_FLOAT(QNANBITPATT_SP32); if (ny == 0) ret = 1.0f; #elif defined COMPILING_ROOTN if (BUILTIN_ISINF_F32(ax) || x == 0.0f) ret = BUILTIN_COPYSIGN_F32((x == 0.0f) ^ (ny < 0) ? 0.0f : AS_FLOAT(PINFBITPATT_SP32), inty == 1 ? x : 0.0f); if ((x < 0.0f && inty != 1) || ny == 0) ret = AS_FLOAT(QNANBITPATT_SP32); #else if (x < 0.0f && !inty) ret = AS_FLOAT(QNANBITPATT_SP32); if (BUILTIN_ISINF_F32(ay)) ret = ax == 1.0f ? ax : (samesign(y, ax - 1.0f) ? ay : 0.0f); if (BUILTIN_ISINF_F32(ax) || x == 0.0f) ret = BUILTIN_COPYSIGN_F32((x == 0.0f) ^ (y < 0.0f) ? 0.0f : AS_FLOAT(PINFBITPATT_SP32), inty == 1 ? x : 0.0f); if (BUILTIN_ISNAN_F32(x) || BUILTIN_ISNAN_F32(y)) ret = AS_FLOAT(QNANBITPATT_SP32); if (x == 1.0f || y == 0.0f) ret = 1.0f; #endif return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/powH.cl000066400000000000000000000006301415221260100203050ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR BGEN(pow) #define COMPILING_POW #include "powH_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/powH_base.h000066400000000000000000000224441415221260100211370ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ REQUIRES_16BIT_INSTS CONSTATTR half #if defined(COMPILING_POWR) MATH_MANGLE(powr)(half x, half y) #elif defined(COMPILING_POWN) MATH_MANGLE(pown)(half x, int ny) #elif defined(COMPILING_ROOTN) MATH_MANGLE(rootn)(half x, int ny) #else MATH_MANGLE(pow)(half x, half y) #endif { half ax = BUILTIN_ABS_F16(x); #if defined(COMPILING_POWN) float fy = (float)ny; #elif defined(COMPILING_ROOTN) float fy = BUILTIN_RCP_F32((float)ny); #else float fy = (float)y; #endif float p = BUILTIN_EXP2_F32(fy * BUILTIN_LOG2_F32((float)ax)); // Classify y: // inty = 0 means not an integer. // inty = 1 means odd integer. // inty = 2 means even integer. #if defined(COMPILING_POWN) || defined(COMPILING_ROOTN) int inty = 2 - (ny & 1); #else half ay = BUILTIN_ABS_F16(y); int inty; { half tay = BUILTIN_TRUNC_F16(ay); inty = ay == tay; inty += inty & (BUILTIN_FRACTION_F16(tay*0.5h) == 0.0h); } #endif half ret = BUILTIN_COPYSIGN_F16((half)p, ((inty == 1) & (x < 0.0h)) ? 
-0.0f : 0.0f); // Now all the edge cases #if defined COMPILING_POWR bool ax_eq_0 = ax == 0.0h; bool ax_ne_0 = ax != 0.0h; bool ax_lt_1 = ax < 1.0h; bool ax_eq_1 = ax == 1.0h; bool ax_gt_1 = ax > 1.0h; bool ax_lt_pinf = BUILTIN_CLASS_F16(x, CLASS_PNOR|CLASS_PSUB); bool ax_eq_pinf = BUILTIN_CLASS_F16(x, CLASS_PINF); bool ax_eq_nan = BUILTIN_ISNAN_F16(x); bool x_pos = BUILTIN_CLASS_F16(x, CLASS_PZER|CLASS_PSUB|CLASS_PNOR|CLASS_PINF); bool ay_eq_0 = ay == 0.0h; bool ay_eq_pinf = BUILTIN_CLASS_F16(ay, CLASS_PINF); bool ay_eq_nan = BUILTIN_ISNAN_F16(ay); bool y_eq_ninf = BUILTIN_CLASS_F16(y, CLASS_NINF); bool y_eq_pinf = BUILTIN_CLASS_F16(y, CLASS_PINF); bool ay_lt_inf = BUILTIN_CLASS_F16(y, CLASS_PNOR|CLASS_PSUB); bool y_pos = BUILTIN_CLASS_F16(y, CLASS_PZER|CLASS_PSUB|CLASS_PNOR|CLASS_PINF); if (!FINITE_ONLY_OPT()) { ret = (ax_lt_1 & y_eq_ninf) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = (ax_lt_1 & y_eq_pinf) ? 0.0h : ret; ret = (ax_eq_1 & ay_lt_inf) ? 1.0h : ret; ret = (ax_eq_1 & ay_eq_pinf) ? AS_HALF((ushort)QNANBITPATT_HP16) : ret; ret = (ax_gt_1 & y_eq_ninf) ? 0.0h : ret; ret = (ax_gt_1 & y_eq_pinf) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = (ax_lt_pinf & ay_eq_0) ? 1.0h : ret; ret = (ax_eq_pinf & !y_pos) ? 0.0h : ret; ret = (ax_eq_pinf & y_pos) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = (ax_eq_pinf & y_eq_pinf) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = (ax_eq_pinf & ay_eq_0) ? AS_HALF((ushort)QNANBITPATT_HP16) : ret; ret = (ax_eq_0 & !y_pos) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = (ax_eq_0 & y_pos) ? 0.0h : ret; ret = (ax_eq_0 & ay_eq_0) ? AS_HALF((ushort)QNANBITPATT_HP16) : ret; ret = (ax_ne_0 & !x_pos) ? AS_HALF((ushort)QNANBITPATT_HP16) : ret; ret = ax_eq_nan ? x : ret; ret = ay_eq_nan ? y : ret; } else { ret = ax_eq_1 ? 1.0h : ret; ret = ay_eq_0 ? 1.0h : ret; ret = (ax_eq_0 & y_pos) ? 
0.0h : ret; } #elif defined COMPILING_POWN bool ax_eq_0 = ax == 0.0h; bool x_eq_ninf = BUILTIN_CLASS_F16(x, CLASS_NINF); bool x_eq_pinf = BUILTIN_CLASS_F16(x, CLASS_PINF); bool ax_lt_pinf = BUILTIN_CLASS_F16(x, CLASS_PNOR|CLASS_PSUB); bool ax_eq_pinf = BUILTIN_CLASS_F16(x, CLASS_PINF); bool ax_eq_nan = BUILTIN_ISNAN_F16(x); bool x_pos = BUILTIN_CLASS_F16(x, CLASS_PZER|CLASS_PSUB|CLASS_PNOR|CLASS_PINF); bool y_pos = ny >= 0; if (!FINITE_ONLY_OPT()) { half xinf = BUILTIN_COPYSIGN_F16(AS_HALF((ushort)PINFBITPATT_HP16), x); ret = (ax_eq_0 & !y_pos & (inty == 1)) ? xinf : ret; ret = (ax_eq_0 & !y_pos & (inty == 2)) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = (ax_eq_0 & y_pos & (inty == 2)) ? 0.0h : ret; half xzero = BUILTIN_COPYSIGN_F16(0.0h, x); ret = (ax_eq_0 & y_pos & (inty == 1)) ? xzero : ret; ret = (x_eq_ninf & !y_pos & (inty == 1)) ? -0.0h : ret; ret = (x_eq_ninf & !y_pos & (inty != 1)) ? 0.0h : ret; ret = (x_eq_ninf & y_pos & (inty == 1)) ? AS_HALF((ushort)NINFBITPATT_HP16) : ret; ret = (x_eq_ninf & y_pos & (inty != 1)) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = (x_eq_pinf & !y_pos) ? 0.0h : ret; ret = (x_eq_pinf & y_pos) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = ax_eq_nan ? x : ret; } else { half xzero = BUILTIN_COPYSIGN_F16(0.0h, x); ret = (ax_eq_0 & y_pos & (inty == 1)) ? xzero : ret; ret = (ax_eq_0 & y_pos & (inty == 2)) ? 0.0h : ret; } ret = ny == 0 ? 1.0h : ret; #elif defined COMPILING_ROOTN bool ax_eq_0 = ax == 0.0h; bool x_eq_ninf = BUILTIN_CLASS_F16(x, CLASS_NINF); bool x_eq_pinf = BUILTIN_CLASS_F16(x, CLASS_PINF); bool ax_lt_pinf = BUILTIN_CLASS_F16(x, CLASS_PNOR|CLASS_PSUB); bool ax_eq_pinf = BUILTIN_CLASS_F16(x, CLASS_PINF); bool ax_eq_nan = BUILTIN_ISNAN_F16(x); bool x_pos = BUILTIN_CLASS_F16(x, CLASS_PZER|CLASS_PSUB|CLASS_PNOR|CLASS_PINF); bool y_pos = ny >= 0; if (!FINITE_ONLY_OPT()) { ret = (!x_pos & (inty == 2)) ? 
AS_HALF((ushort)QNANBITPATT_HP16) : ret; half xinf = BUILTIN_COPYSIGN_F16(AS_HALF((ushort)PINFBITPATT_HP16), x); ret = (ax_eq_0 & !y_pos & (inty == 1)) ? xinf : ret; ret = (ax_eq_0 & !y_pos & (inty == 2)) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = (ax_eq_0 & y_pos & (inty == 2)) ? 0.0h : ret; half xzero = BUILTIN_COPYSIGN_F16(0.0h, x); ret = (ax_eq_0 & y_pos & (inty == 1)) ? xzero : ret; ret = (x_eq_ninf & y_pos & (inty == 1)) ? AS_HALF((ushort)NINFBITPATT_HP16) : ret; ret = (x_eq_ninf & !y_pos & (inty == 1)) ? -0.0h : ret; ret = (x_eq_pinf & !y_pos) ? 0.0h : ret; ret = (x_eq_pinf & y_pos) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = ax_eq_nan ? x : ret; ret = ny == 0 ? AS_HALF((ushort)QNANBITPATT_HP16) : ret; } else { half xzero = BUILTIN_COPYSIGN_F16(0.0h, x); ret = (ax_eq_0 & y_pos & (inty == 1)) ? xzero : ret; ret = (ax_eq_0 & y_pos & (inty == 2)) ? 0.0h : ret; } #else bool ax_eq_0 = ax == 0.0h; bool ax_ne_0 = ax != 0.0h; bool ax_lt_1 = ax < 1.0h; bool ax_eq_1 = ax == 1.0h; bool ax_gt_1 = ax > 1.0h; bool ax_lt_pinf = BUILTIN_CLASS_F16(x, CLASS_PNOR|CLASS_PSUB); bool ax_eq_pinf = BUILTIN_CLASS_F16(x, CLASS_PINF); bool ax_eq_nan = BUILTIN_ISNAN_F16(x); bool x_pos = BUILTIN_CLASS_F16(x, CLASS_PZER|CLASS_PSUB|CLASS_PNOR|CLASS_PINF); bool x_eq_ninf = BUILTIN_CLASS_F16(x, CLASS_NINF); bool x_eq_pinf = BUILTIN_CLASS_F16(x, CLASS_PINF); bool ay_eq_0 = ay == 0.0h; bool ay_eq_pinf = BUILTIN_CLASS_F16(ay, CLASS_PINF); bool ay_eq_nan = BUILTIN_ISNAN_F16(ay); bool y_eq_ninf = BUILTIN_CLASS_F16(y, CLASS_NINF); bool y_eq_pinf = BUILTIN_CLASS_F16(y, CLASS_PINF); bool ay_lt_inf = BUILTIN_CLASS_F16(y, CLASS_PNOR|CLASS_PSUB); bool y_pos = BUILTIN_CLASS_F16(y, CLASS_PZER|CLASS_PSUB|CLASS_PNOR|CLASS_PINF); if (!FINITE_ONLY_OPT()) { ret = (!x_pos & (inty == 0)) ? AS_HALF((ushort)QNANBITPATT_HP16) : ret; ret = (ax_lt_1 & y_eq_ninf) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = (ax_gt_1 & y_eq_ninf) ? 0.0h : ret; ret = (ax_lt_1 & y_eq_pinf) ? 
0.0h : ret; ret = (ax_gt_1 & y_eq_pinf) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; half xinf = BUILTIN_COPYSIGN_F16(AS_HALF((ushort)PINFBITPATT_HP16), x); ret = (ax_eq_0 & !y_pos & (inty == 1)) ? xinf : ret; ret = (ax_eq_0 & !y_pos & (inty != 1)) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; half xzero = BUILTIN_COPYSIGN_F16(0.0h, x); ret = (ax_eq_0 & y_pos & (inty == 1)) ? xzero : ret; ret = (ax_eq_0 & y_pos & (inty != 1)) ? 0.0h : ret; ret = (ax_eq_0 & y_eq_ninf) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = ((x == -1.0h) & ay_eq_pinf) ? 1.0h : ret; ret = (x_eq_ninf & !y_pos & (inty == 1)) ? -0.0h : ret; ret = (x_eq_ninf & !y_pos & (inty != 1)) ? 0.0h : ret; ret = (x_eq_ninf & y_pos & (inty == 1)) ? AS_HALF((ushort)NINFBITPATT_HP16) : ret; ret = (x_eq_ninf & y_pos & (inty != 1)) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = (x_eq_pinf & !y_pos) ? 0.0h : ret; ret = (x_eq_pinf & y_pos) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; ret = ax_eq_nan ? x : ret; ret = ay_eq_nan ? y : ret; } else { // XXX work around conformance test incorrectly checking these cases half xinf = BUILTIN_COPYSIGN_F16(AS_HALF((ushort)PINFBITPATT_HP16), x); ret = (ax_eq_0 & !y_pos & (inty == 1)) ? xinf : ret; ret = (ax_eq_0 & !y_pos & (inty != 1)) ? AS_HALF((ushort)PINFBITPATT_HP16) : ret; half xzero = BUILTIN_COPYSIGN_F16(0.0h, x); ret = (ax_eq_0 & y_pos & (inty == 1)) ? xzero : ret; ret = (ax_eq_0 & y_pos & (inty != 1)) ? 0.0h : ret; } ret = ay == 0.0h ? 1.0h : ret; ret = x == 1.0h ? 1.0h : ret; #endif return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/pownD.cl000066400000000000000000000005601415221260100204610ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/

#define COMPILING_POWN
#include "powD_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/pownF.cl000066400000000000000000000005601415221260100204630ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#define COMPILING_POWN
#include "powF_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/pownH.cl000066400000000000000000000010221415221260100204570ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

// Two-lane pown: applies the scalar half-precision pown to the low and
// high lanes independently and repacks the two results into a half2.
CONSTATTR half2
MATH_MANGLE2(pown)(half2 x, int2 ny)
{
    half lane_lo = MATH_MANGLE(pown)(x.lo, ny.lo);
    half lane_hi = MATH_MANGLE(pown)(x.hi, ny.hi);
    return (half2)(lane_lo, lane_hi);
}

// The scalar pown body comes from the shared pow template.
#define COMPILING_POWN
#include "powH_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/powrD.cl000066400000000000000000000005601415221260100204650ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#define COMPILING_POWR
#include "powD_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/powrF.cl000066400000000000000000000005601415221260100204670ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License.
See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#define COMPILING_POWR
#include "powF_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/powrH.cl000066400000000000000000000006321415221260100204710ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

CONSTATTR BGEN(powr)

#define COMPILING_POWR
#include "powH_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/privD.h000066400000000000000000000065351415221260100203170ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

// Private double-precision helper macros. The multi-statement macros are
// statement expressions: each argument is copied into a local exactly once,
// so arguments with side effects are evaluated a single time.

// Leading-zero count of a 32-bit value; yields 32 when the input is zero.
#define MATH_CLZI(U) ({ \
    uint _clzi_u = U; \
    uint _clzi_z = BUILTIN_FIRSTBIT_U32(_clzi_u); \
    uint _clzi_ret = _clzi_u == 0u ? 32u : _clzi_z; \
    _clzi_ret; \
})

// Leading-zero count of a 64-bit value built from the two 32-bit halves;
// the low-half count is offset by 32, so a zero input yields 64.
#define MATH_CLZL(U) ({ \
    ulong _clzl_u = U; \
    uint2 _clzl_u2 = AS_UINT2(_clzl_u); \
    uint _clzl_zlo = BUILTIN_FIRSTBIT_U32(_clzl_u2.lo); \
    uint _clzl_zhi = BUILTIN_FIRSTBIT_U32(_clzl_u2.hi); \
    uint _clzl_clo = (_clzl_u2.lo == 0 ? 32 : _clzl_zlo) + 32; \
    uint _clzl_ret = _clzl_u2.hi == 0 ? _clzl_clo : _clzl_zhi; \
    _clzl_ret; \
})

// Multiply-add; in double precision this is always a fused multiply-add.
#define MATH_MAD(A,B,C) BUILTIN_FMA_F64(A, B, C)

// Fast reciprocal: hardware estimate followed by two refinement steps of
// the form r += r * (1 - x*r).
#define MATH_FAST_RCP(X) ({ \
    double _frcp_x = X; \
    double _frcp_ret; \
    _frcp_ret = BUILTIN_RCP_F64(_frcp_x); \
    _frcp_ret = BUILTIN_FMA_F64(BUILTIN_FMA_F64(-_frcp_x, _frcp_ret, 1.0), _frcp_ret, _frcp_ret); \
    _frcp_ret = BUILTIN_FMA_F64(BUILTIN_FMA_F64(-_frcp_x, _frcp_ret, 1.0), _frcp_ret, _frcp_ret); \
    _frcp_ret; \
})

// Reciprocal via the full division builtin.
#define MATH_RCP(X) BUILTIN_DIV_F64(1.0, X)

// Fast division: refine 1/y as in MATH_FAST_RCP, form x*(1/y), then apply
// one residual correction q += (x - y*q) * (1/y).
#define MATH_FAST_DIV(X, Y) ({ \
    double _fdiv_x = X; \
    double _fdiv_y = Y; \
    double _fdiv_ret; \
    double _fdiv_r = BUILTIN_RCP_F64(_fdiv_y); \
    _fdiv_r = BUILTIN_FMA_F64(BUILTIN_FMA_F64(-_fdiv_y, _fdiv_r, 1.0), _fdiv_r, _fdiv_r); \
    _fdiv_r = BUILTIN_FMA_F64(BUILTIN_FMA_F64(-_fdiv_y, _fdiv_r, 1.0), _fdiv_r, _fdiv_r); \
    _fdiv_ret = _fdiv_x * _fdiv_r; \
    _fdiv_ret = BUILTIN_FMA_F64(BUILTIN_FMA_F64(-_fdiv_y, _fdiv_ret, _fdiv_x), _fdiv_r, _fdiv_ret); \
    _fdiv_ret; \
})

// Division via the full division builtin.
#define MATH_DIV(X,Y) BUILTIN_DIV_F64(X, Y)

// Fast square root: starts from the reciprocal-square-root estimate y and
// refines s ~= sqrt(x) and h ~= 1/(2*sqrt(x)) together, then applies one
// residual correction. A zero input is returned unchanged. There is no
// rescaling here, unlike MATH_SQRT below.
#define MATH_FAST_SQRT(X) ({ \
    double _fsqrt_x = X; \
    double _fsqrt_y = BUILTIN_RSQRT_F64(_fsqrt_x); \
    double _fsqrt_s0 = _fsqrt_x * _fsqrt_y; \
    double _fsqrt_h0 = 0.5 * _fsqrt_y; \
    double _fsqrt_r0 = BUILTIN_FMA_F64(-_fsqrt_h0, _fsqrt_s0, 0.5); \
    double _fsqrt_h1 = BUILTIN_FMA_F64(_fsqrt_h0, _fsqrt_r0, _fsqrt_h0); \
    double _fsqrt_s1 = BUILTIN_FMA_F64(_fsqrt_s0, _fsqrt_r0, _fsqrt_s0); \
    double _fsqrt_d0 = BUILTIN_FMA_F64(-_fsqrt_s1, _fsqrt_s1, _fsqrt_x); \
    double _fsqrt_ret = BUILTIN_FMA_F64(_fsqrt_d0, _fsqrt_h1, _fsqrt_s1); \
    _fsqrt_ret = _fsqrt_x == 0.0 ? _fsqrt_x : _fsqrt_ret; \
    _fsqrt_ret; \
})

// Square root with range handling: inputs below 2^-767 are scaled up by
// 2^256 before the iteration and the result is scaled back by 2^-128.
// Two residual corrections are applied. Zero and +infinity inputs are
// returned unchanged.
#define MATH_SQRT(X) ({ \
    double _sqrt_x = X; \
    bool _sqrt_b = _sqrt_x < 0x1.0p-767; \
    _sqrt_x *= _sqrt_b ? 0x1.0p+256 : 1.0; \
    double _sqrt_y = BUILTIN_RSQRT_F64(_sqrt_x); \
    double _sqrt_s0 = _sqrt_x * _sqrt_y; \
    double _sqrt_h0 = 0.5 * _sqrt_y; \
    double _sqrt_r0 = BUILTIN_FMA_F64(-_sqrt_h0, _sqrt_s0, 0.5); \
    double _sqrt_h1 = BUILTIN_FMA_F64(_sqrt_h0, _sqrt_r0, _sqrt_h0); \
    double _sqrt_s1 = BUILTIN_FMA_F64(_sqrt_s0, _sqrt_r0, _sqrt_s0); \
    double _sqrt_d0 = BUILTIN_FMA_F64(-_sqrt_s1, _sqrt_s1, _sqrt_x); \
    double _sqrt_s2 = BUILTIN_FMA_F64(_sqrt_d0, _sqrt_h1, _sqrt_s1); \
    double _sqrt_d1 = BUILTIN_FMA_F64(-_sqrt_s2, _sqrt_s2, _sqrt_x); \
    double _sqrt_ret = BUILTIN_FMA_F64(_sqrt_d1, _sqrt_h1, _sqrt_s2); \
    _sqrt_ret *= _sqrt_b ? 0x1.0p-128 : 1.0; \
    _sqrt_ret = ((_sqrt_x == 0.0) | (_sqrt_x == (double)INFINITY)) ? _sqrt_x : _sqrt_ret; \
    _sqrt_ret; \
})
ROCm-Device-Libs-rocm-5.0.0/ocml/src/privF.h000066400000000000000000000042401415221260100203100ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

// Private single-precision helper macros (same names as the double and
// half variants; the including math header selects one set).

// Leading-zero count of a 32-bit value; yields 32 when the input is zero.
#define MATH_CLZI(U) ({ \
    uint _clzi_u = U; \
    uint _clzi_z = BUILTIN_FIRSTBIT_U32(_clzi_u); \
    uint _clzi_ret = _clzi_u == 0u ? 32u : _clzi_z; \
    _clzi_ret; \
})

// Scalar and two-lane multiply-add, mapped to the MAD builtins.
#define MATH_MAD(A,B,C) BUILTIN_MAD_F32(A, B, C)
#define MATH_MAD2(A,B,C) BUILTIN_MAD_2F32(A, B, C)

// Fast reciprocal: the bare hardware estimate.
#define MATH_FAST_RCP(X) BUILTIN_RCP_F32(X)
// Reciprocal via the full division builtin.
#define MATH_RCP(X) BUILTIN_DIV_F32(1.0f, X)

// Fast division: x times the reciprocal estimate of y, no refinement.
#define MATH_FAST_DIV(X, Y) ({ \
    float _fdiv_x = X; \
    float _fdiv_y = Y; \
    float _fdiv_ret = _fdiv_x * BUILTIN_RCP_F32(_fdiv_y); \
    _fdiv_ret; \
})

// Division via the full division builtin.
#define MATH_DIV(X,Y) BUILTIN_DIV_F32(X, Y)

// Fast square root: the bare hardware square root.
#define MATH_FAST_SQRT(X) BUILTIN_SQRT_F32(X)

// Square root with range handling: inputs below 2^-96 are scaled up by
// 2^32 and the result scaled back by 2^-16.
//  - When DAZ_OPT() is false: take the hardware sqrt, then test the two
//    neighbouring bit patterns (s-1ulp, s+1ulp) through FMA residuals and
//    step to the neighbour when its residual sign says so.
//  - When DAZ_OPT() is true: start from the reciprocal-square-root
//    estimate and refine with FMA steps.
// +/-0 and +infinity inputs are returned unchanged.
#define MATH_SQRT(X) ({ \
    float _sqrt_x = X; \
    bool _sqrt_b = _sqrt_x < 0x1.0p-96f; \
    _sqrt_x *= _sqrt_b ? 0x1.0p+32f : 1.0f; \
    float _sqrt_s; \
    if (!DAZ_OPT()) { \
        _sqrt_s = BUILTIN_SQRT_F32(_sqrt_x); \
        float _sqrt_sp = AS_FLOAT(AS_INT(_sqrt_s) - 1); \
        float _sqrt_ss = AS_FLOAT(AS_INT(_sqrt_s) + 1); \
        float _sqrt_vp = BUILTIN_FMA_F32(-_sqrt_sp, _sqrt_s, _sqrt_x); \
        float _sqrt_vs = BUILTIN_FMA_F32(-_sqrt_ss, _sqrt_s, _sqrt_x); \
        _sqrt_s = _sqrt_vp <= 0.0f ? _sqrt_sp : _sqrt_s; \
        _sqrt_s = _sqrt_vs > 0.0f ? _sqrt_ss : _sqrt_s; \
    } else { \
        float _sqrt_r = BUILTIN_RSQRT_F32(_sqrt_x); \
        _sqrt_s = _sqrt_x * _sqrt_r; \
        float _sqrt_h = 0.5f * _sqrt_r; \
        float _sqrt_e = BUILTIN_FMA_F32(-_sqrt_h, _sqrt_s, 0.5f); \
        _sqrt_h = BUILTIN_FMA_F32(_sqrt_h, _sqrt_e, _sqrt_h); \
        _sqrt_s = BUILTIN_FMA_F32(_sqrt_s, _sqrt_e, _sqrt_s); \
        float _sqrt_d = BUILTIN_FMA_F32(-_sqrt_s, _sqrt_s, _sqrt_x); \
        _sqrt_s = BUILTIN_FMA_F32(_sqrt_d, _sqrt_h, _sqrt_s); \
    } \
    _sqrt_s *= _sqrt_b ? 0x1.0p-16f : 1.0f; \
    _sqrt_s = BUILTIN_CLASS_F32(_sqrt_x, CLASS_PZER|CLASS_NZER|CLASS_PINF) ? _sqrt_x : _sqrt_s; \
    _sqrt_s; \
})
ROCm-Device-Libs-rocm-5.0.0/ocml/src/privH.h000066400000000000000000000017471415221260100203230ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

// Private half-precision helper macros.

// Leading-zero count of a 32-bit value; yields 32 when the input is zero.
#define MATH_CLZI(U) ({ \
    uint _clzi_u = U; \
    uint _clzi_z = BUILTIN_FIRSTBIT_U32(_clzi_u); \
    uint _clzi_ret = _clzi_u == 0u ? 32u : _clzi_z; \
    _clzi_ret; \
})

// Scalar and two-lane multiply-add, mapped to the FMA builtins.
#define MATH_MAD(A,B,C) BUILTIN_FMA_F16(A, B, C)
#define MATH_MAD2(A,B,C) BUILTIN_FMA_2F16(A, B, C)

// Fast reciprocal: the bare hardware estimate.
#define MATH_FAST_RCP(X) BUILTIN_RCP_F16(X)
// Reciprocal via the full division builtin.
#define MATH_RCP(X) BUILTIN_DIV_F16(1.0h, X)

// Fast division: x times the reciprocal estimate of y, no refinement.
#define MATH_FAST_DIV(X, Y) ({ \
    half _fdiv_x = X; \
    half _fdiv_y = Y; \
    half _fdiv_ret = _fdiv_x * BUILTIN_RCP_F16(_fdiv_y); \
    _fdiv_ret; \
})

// Division via the full division builtin.
#define MATH_DIV(X,Y) BUILTIN_DIV_F16(X, Y)

// Fast square root: the half-precision hardware square root.
#define MATH_FAST_SQRT(X) BUILTIN_SQRT_F16(X)
// Square root computed in single precision and rounded back to half.
#define MATH_SQRT(X) ((half)BUILTIN_SQRT_F32((float)(X)))
ROCm-Device-Libs-rocm-5.0.0/ocml/src/rcbrtD.cl000066400000000000000000000023331415221260100206120ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// Reciprocal cube root, double precision.
// The magnitude is rescaled by a power of 8 (exponent e ~= e3/3), a
// single-precision exp2/log2 estimate of a^(-1/3) is refined with one
// polynomial correction step, and the result is scaled back by 2^-e.
// The sign of x is copied onto the result.
CONSTATTR double
MATH_MANGLE(rcbrt)(double x)
{
    double a = BUILTIN_ABS_F64(x);
    int e3 = BUILTIN_FREXP_EXP_F64(a);
    // e = round(e3 / 3); 0x1.555556p-2f is 1/3 rounded to float
    int e = (int)BUILTIN_RINT_F32(0x1.555556p-2f * (float)e3);
    a = BUILTIN_FLDEXP_F64(a, -3*e);

    // Initial estimate c ~= a^(-1/3) computed in single precision
    double c = (double)BUILTIN_EXP2_F32(-0x1.555556p-2f * BUILTIN_LOG2_F32((float)a));

    // Correction is c + c*(1 - a c^3)/(1 + 2 a c^3)
    //  = c + c*t/(3 - 2t) where t = 1 - a c^3
    //  use t/(3 - 2t) ~ t/3 + 2 t^2 / 9 + 4 t^3 / 27 ...
    // compute t with extra precision for better accuracy
    double c3 = c * c * c;
    double t = MATH_MAD(-a, c3, 1.0);
    c = MATH_MAD(c, t*MATH_MAD(t, 0x1.c71c71c71c8b2p-3, 0x1.5555555555685p-2), c);

    // Undo the power-of-8 rescale: rcbrt(a * 8^e) * 2^-e
    c = BUILTIN_FLDEXP_F64(c, -e);

    if (!FINITE_ONLY_OPT()) {
        // infinity -> 0, zero -> +infinity (sign is applied below)
        c = BUILTIN_CLASS_F64(a, CLASS_PINF) ? 0.0 : c;
        c = x == 0.0 ? AS_DOUBLE(PINFBITPATT_DP64) : c;
    }

    return BUILTIN_COPYSIGN_F64(c, x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/rcbrtF.cl000066400000000000000000000020711415221260100206130ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// Reciprocal cube root, single precision.
// Uses an exp2/log2 estimate of ax^(-1/3) plus one correction step
// z += (z/3) * (1 - ax*z^3). NaN, zero and infinity inputs take the
// value of the fast reciprocal of x instead.
CONSTATTR float
MATH_MANGLE(rcbrt)(float x)
{
    if (DAZ_OPT()) {
        x = BUILTIN_CANONICALIZE_F32(x);
    }

    float ax = BUILTIN_ABS_F32(x);

    if (!DAZ_OPT()) {
        // Scale subnormal inputs up by 2^24 ...
        ax = BUILTIN_CLASS_F32(x, CLASS_NSUB|CLASS_PSUB) ?
             BUILTIN_FLDEXP_F32(ax, 24) : ax;
    }

    float z = BUILTIN_EXP2_F32(-0x1.555556p-2f * BUILTIN_LOG2_F32(ax));
    z = MATH_MAD(MATH_MAD(z*z, -z*ax, 1.0f), 0x1.555556p-2f*z, z);

    if (!DAZ_OPT()) {
        // ... and compensate the result by 2^8 = (2^24)^(1/3)
        z = BUILTIN_CLASS_F32(x, CLASS_NSUB|CLASS_PSUB) ?
            BUILTIN_FLDEXP_F32(z, 8) : z;
    }

    float xi = MATH_FAST_RCP(x);
    z = BUILTIN_CLASS_F32(x, CLASS_SNAN|CLASS_QNAN|CLASS_PZER|CLASS_NZER|CLASS_PINF|CLASS_NINF) ? xi : z;

    return BUILTIN_COPYSIGN_F32(z, x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/rcbrtH.cl000066400000000000000000000013251415221260100206160ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

CONSTATTR UGEN(rcbrt)

// Reciprocal cube root, half precision.
// The estimate |x|^(-1/3) is computed in single precision through
// exp2/log2 and rounded to half. NaN, zero and infinity inputs take the
// value of the fast reciprocal of x. The sign of x is copied onto the result.
REQUIRES_16BIT_INSTS CONSTATTR half
MATH_MANGLE(rcbrt)(half x)
{
    half ret = (half)BUILTIN_EXP2_F32(-0x1.555556p-2f * BUILTIN_LOG2_F32((float)BUILTIN_ABS_F16(x)));
    half xi = MATH_FAST_RCP(x);
    ret = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_PZER|CLASS_NZER|CLASS_PINF|CLASS_NINF) ? xi : ret;
    // Return the value directly; no need to store it back into ret first.
    return BUILTIN_COPYSIGN_F16(ret, x);
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/remainderD.cl000066400000000000000000000005731415221260100214500ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#define COMPILING_REMAINDER
#include "remainderD_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/remainderD_base.h000066400000000000000000000072231415221260100222720ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// Fused negative multiply-add: c - a*b with a single rounding.
CONSTATTR INLINEATTR static double
fnma(double a, double b, double c)
{
    return BUILTIN_FMA_F64(-a, b, c);
}

// Shared body for the double-precision fmod, remquo and remainder.
// Exactly one of COMPILING_FMOD / COMPILING_REMQUO / (neither) is expected
// to be in effect when this header is included:
//   fmod      - result has the sign of x, magnitude less than |y|
//   remainder - result is reduced to the nearest multiple of y, ties
//               resolved to the even quotient
//   remquo    - as remainder, and additionally stores the signed low
//               7 bits of the quotient through q7p
// Unless FINITE_ONLY_OPT() is set, y == 0, NaN y, or non-finite x produce
// a quiet NaN (and a zero quotient for remquo).
#if defined(COMPILING_FMOD)
CONSTATTR double
MATH_MANGLE(fmod)(double x, double y)
#elif defined(COMPILING_REMQUO)
double
MATH_MANGLE(remquo)(double x, double y, __private int *q7p)
#else
CONSTATTR double
MATH_MANGLE(remainder)(double x, double y)
#endif
{
    // How many bits of the quotient per iteration
    const int bits = 26;
    double ax = BUILTIN_ABS_F64(x);
    double ay = BUILTIN_ABS_F64(y);
    double ret;
#if defined(COMPILING_REMQUO)
    int q7;
#endif

    if (ax > ay) {
        // Split both magnitudes into mantissa and exponent. ax carries
        // `bits` extra bits so each step yields that many quotient bits.
        int ex, ey;

        ex = BUILTIN_FREXP_EXP_F64(ax) - 1;
        ax = BUILTIN_FLDEXP_F64(BUILTIN_FREXP_MANT_F64(ax), bits);
        ey = BUILTIN_FREXP_EXP_F64(ay) - 1;
        ay = BUILTIN_FLDEXP_F64(BUILTIN_FREXP_MANT_F64(ay), 1);

        // Number of quotient bits still to be produced
        int nb = ex - ey;
        double ayinv = MATH_RCP(ay);

#if !defined(COMPILING_FMOD)
        int qacc = 0;
#endif

        while (nb > bits) {
            double q = BUILTIN_RINT_F64(ax * ayinv);
            ax = fnma(q, ay, ax);
            // If the rounded quotient overshot, step back by one divisor
            int clt = ax < 0.0;
            double axp = ax + ay;
            ax = clt ? axp : ax;
#if defined(COMPILING_REMQUO)
            int iq = (int)q;
            iq -= clt;
            qacc = (qacc << bits) | iq;
#endif
            ax = BUILTIN_FLDEXP_F64(ax, bits);
            nb -= bits;
        }

        ax = BUILTIN_FLDEXP_F64(ax, nb - bits + 1);

        // Final iteration
        {
            double q = BUILTIN_RINT_F64(ax * ayinv);
            ax = fnma(q, ay, ax);
            int clt = ax < 0.0;
            double axp = ax + ay;
            ax = clt ? axp : ax;
#if !defined(COMPILING_FMOD)
            int iq = (int)q;
            iq -= clt;
#if defined(COMPILING_REMQUO)
            qacc = (qacc << (nb+1)) | iq;
#else
            qacc = iq;
#endif
#endif
        }

#if !defined(COMPILING_FMOD)
        // Adjust ax so that it is the range (-y/2, y/2]
        // We need to choose the even integer when x/y is midway between two integers
        // (double literals: this is the double-precision body)
        int aq = (2.0*ax > ay) | ((qacc & 0x1) & (2.0*ax == ay));
        ax = ax - (aq ? ay : 0.0);
#if defined(COMPILING_REMQUO)
        qacc += aq;
        // qneg is 0 when x and y share a sign, -1 otherwise; the xor/sub
        // pair conditionally negates the 7-bit quotient.
        int qneg = (AS_INT2(x).hi ^ AS_INT2(y).hi) >> 31;
        q7 = ((qacc & 0x7f) ^ qneg) - qneg;
#endif
#endif

        // Restore the scale of y and apply the sign of x
        ax = BUILTIN_FLDEXP_F64(ax, ey);
        ret = AS_DOUBLE((AS_ULONG(x) & SIGNBIT_DP64) ^ AS_ULONG(ax));
    } else {
        // |x| <= |y|: the result is x itself unless rounding to nearest
        // moves the quotient to +/-1.
        ret = x;
#if defined(COMPILING_REMQUO)
        q7 = 0;
#endif

#if !defined(COMPILING_FMOD)
        // |x| > |y|/2, tested in two ways; presumably so that neither
        // 2*ax (for huge y) nor 0.5*ay (for tiny y) has to be exact.
        int c = ((ay < 0x1.0p+1023) & (2.0*ax > ay)) | (ax > 0.5*ay);

        // qsgn is +1 when x and y share a sign, -1 otherwise
        int qsgn = 1 + (((AS_INT2(x).hi ^ AS_INT2(y).hi) >> 31) << 1);
        double t = MATH_MAD(y, -(double)qsgn, x);
        ret = c ? t : ret;
#if defined(COMPILING_REMQUO)
        q7 = c ? qsgn : q7;
#endif
#endif

        // Equal magnitudes: zero carrying the sign of x
        ret = ax == ay ? BUILTIN_COPYSIGN_F64(0.0, x) : ret;
#if defined(COMPILING_REMQUO)
        q7 = ax == ay ? qsgn : q7;
#endif
    }

    if (!FINITE_ONLY_OPT()) {
        ret = y == 0.0 ? AS_DOUBLE(QNANBITPATT_DP64) : ret;
#if defined(COMPILING_REMQUO)
        q7 = y == 0.0 ? 0 : q7;
#endif

        bool c = !BUILTIN_ISNAN_F64(y) && BUILTIN_ISFINITE_F64(x);
        ret = c ? ret : AS_DOUBLE(QNANBITPATT_DP64);
#if defined(COMPILING_REMQUO)
        q7 = c ? q7 : 0;
#endif
    }

#if defined(COMPILING_REMQUO)
    *q7p = q7;
#endif

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/remainderF.cl000066400000000000000000000005731415221260100214520ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#define COMPILING_REMAINDER
#include "remainderF_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/remainderF_base.h000066400000000000000000000104651415221260100222760ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// Product A*B as a high part CHI and a low-order correction CLO, built by
// splitting each operand into a head (low 12 bits of the bit pattern
// cleared) and a tail.
// The arguments must only be variable names
#define FULL_MUL(A, B, CHI, CLO) \
    do { \
        float __ha = AS_FLOAT(AS_UINT(A) & 0xfffff000U); \
        float __ta = A - __ha; \
        float __hb = AS_FLOAT(AS_UINT(B) & 0xfffff000U); \
        float __tb = B - __hb; \
        CHI = A * B; \
        CLO = MATH_MAD(__ta, __tb, MATH_MAD(__ta, __hb, MATH_MAD(__ha, __tb, MATH_MAD(__ha, __hb, -CHI)))); \
    } while (0)

// c - a*b. Uses a fused multiply-add when HAVE_FAST_FMA32() reports one,
// otherwise emulates it with the split product from FULL_MUL.
CONSTATTR static float
fnma(float a, float b, float c)
{
    float d;
    if (HAVE_FAST_FMA32()) {
        d = BUILTIN_FMA_F32(-a, b, c);
    } else {
        float h, t;
        FULL_MUL(a, b, h, t);
        d = c - h;
        d = (((c - d) - h) - t) + d;
    }
    return d;
}

// Shared body for the single-precision fmod, remquo and remainder.
// Exactly one of COMPILING_FMOD / COMPILING_REMQUO / (neither) is expected
// to be in effect when this header is included:
//   fmod      - result has the sign of x, magnitude less than |y|
//   remainder - result is reduced to the nearest multiple of y, ties
//               resolved to the even quotient
//   remquo    - as remainder, and additionally stores the signed low
//               7 bits of the quotient through q7p
// Unless FINITE_ONLY_OPT() is set, y == 0, NaN y, or non-finite x produce
// a quiet NaN (and a zero quotient for remquo).
#if defined(COMPILING_FMOD)
CONSTATTR float
MATH_MANGLE(fmod)(float x, float y)
#elif defined(COMPILING_REMQUO)
float
MATH_MANGLE(remquo)(float x, float y, __private int *q7p)
#else
CONSTATTR float
MATH_MANGLE(remainder)(float x, float y)
#endif
{
    if (DAZ_OPT()) {
        x = BUILTIN_CANONICALIZE_F32(x);
        y = BUILTIN_CANONICALIZE_F32(y);
    }

    // How many bits of the quotient per iteration
    const int bits = 12;
    float ax = BUILTIN_ABS_F32(x);
    float ay = BUILTIN_ABS_F32(y);
    float ret;
#if defined(COMPILING_REMQUO)
    int q7;
#endif

    if (ax > ay) {
        // Split both magnitudes into mantissa and exponent. ax carries
        // `bits` extra bits so each step yields that many quotient bits.
        int ex, ey;

        ex = BUILTIN_FREXP_EXP_F32(ax) - 1;
        ax = BUILTIN_FLDEXP_F32(BUILTIN_FREXP_MANT_F32(ax), bits);
        ey = BUILTIN_FREXP_EXP_F32(ay) - 1;
        ay = BUILTIN_FLDEXP_F32(BUILTIN_FREXP_MANT_F32(ay), 1);

        // Number of quotient bits still to be produced
        int nb = ex - ey;
        float ayinv = MATH_FAST_RCP(ay);

#if !defined(COMPILING_FMOD)
        int qacc = 0;
#endif

        while (nb > bits) {
            float q = BUILTIN_RINT_F32(ax * ayinv);
            ax = fnma(q, ay, ax);
            // If the rounded quotient overshot, step back by one divisor
            int clt = ax < 0.0f;
            float axp = ax + ay;
            ax = clt ? axp : ax;
#if defined(COMPILING_REMQUO)
            int iq = (int)q;
            iq -= clt;
            qacc = (qacc << bits) | iq;
#endif
            ax = BUILTIN_FLDEXP_F32(ax, bits);
            nb -= bits;
        }

        ax = BUILTIN_FLDEXP_F32(ax, nb - bits + 1);

        // Final iteration
        {
            float q = BUILTIN_RINT_F32(ax * ayinv);
            ax = fnma(q, ay, ax);
            int clt = ax < 0.0f;
            float axp = ax + ay;
            ax = clt ? axp : ax;
#if !defined(COMPILING_FMOD)
            int iq = (int)q;
            iq -= clt;
#if defined(COMPILING_REMQUO)
            qacc = (qacc << (nb+1)) | iq;
#else
            qacc = iq;
#endif
#endif
        }

#if !defined(COMPILING_FMOD)
        // Adjust ax so that it is the range (-y/2, y/2]
        // We need to choose the even integer when x/y is midway between two integers
        int aq = (2.0f*ax > ay) | ((qacc & 0x1) & (2.0f*ax == ay));
        ax = ax - (aq ? ay : 0.0f);
#if defined(COMPILING_REMQUO)
        qacc += aq;
        // qneg is 0 when x and y share a sign, -1 otherwise; the xor/sub
        // pair conditionally negates the 7-bit quotient.
        int qneg = (AS_INT(x) ^ AS_INT(y)) >> 31;
        q7 = ((qacc & 0x7f) ^ qneg) - qneg;
#endif
#endif

        // Restore the scale of y and apply the sign of x
        ax = BUILTIN_FLDEXP_F32(ax, ey);
        ret = AS_FLOAT((AS_INT(x) & SIGNBIT_SP32) ^ AS_INT(ax));
    } else {
        // |x| <= |y|: the result is x itself unless rounding to nearest
        // moves the quotient to +/-1.
        ret = x;
#if defined(COMPILING_REMQUO)
        q7 = 0;
#endif

#if !defined(COMPILING_FMOD)
        // |x| > |y|/2, tested in two ways; presumably so that neither
        // 2*ax (for huge y) nor 0.5*ay (for tiny y) has to be exact.
        bool c = ((ay < 0x1.0p+127f) & (2.0f*ax > ay)) | (ax > 0.5f*ay);

        // qsgn is +1 when x and y share a sign, -1 otherwise
        int qsgn = 1 + (((AS_INT(x) ^ AS_INT(y)) >> 31) << 1);
        float t = MATH_MAD(y, -(float)qsgn, x);
        ret = c ? t : ret;
#if defined(COMPILING_REMQUO)
        q7 = c ? qsgn : q7;
#endif
#endif

        // Equal magnitudes: zero carrying the sign of x
        ret = ax == ay ? BUILTIN_COPYSIGN_F32(0.0f, x) : ret;
#if defined(COMPILING_REMQUO)
        q7 = ax == ay ? qsgn : q7;
#endif
    }

    if (!FINITE_ONLY_OPT()) {
        ret = y == 0.0f ? AS_FLOAT(QNANBITPATT_SP32) : ret;
#if defined(COMPILING_REMQUO)
        q7 = y == 0.0f ? 0 : q7;
#endif

        bool c = !BUILTIN_ISNAN_F32(y) && BUILTIN_ISFINITE_F32(x);
        ret = c ? ret : AS_FLOAT(QNANBITPATT_SP32);
#if defined(COMPILING_REMQUO)
        q7 = c ? q7 : 0;
#endif
    }

#if defined(COMPILING_REMQUO)
    *q7p = q7;
#endif

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/remainderH.cl000066400000000000000000000006521415221260100214520ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

CONSTATTR BGEN(remainder)

#define COMPILING_REMAINDER
#include "remainderH_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/remainderH_base.h000066400000000000000000000073471415221260100223050ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

// True when x and y carry the same sign bit (zeros and NaNs included).
CONSTATTR INLINEATTR static bool
samesign(half x, half y)
{
    ushort sign_x = AS_USHORT(x) & (ushort)SIGNBIT_HP16;
    ushort sign_y = AS_USHORT(y) & (ushort)SIGNBIT_HP16;
    return sign_x == sign_y;
}

// Shared body for the half-precision fmod, remquo and remainder. The
// reduction itself runs in single precision on the widened magnitudes.
//   fmod      - result has the sign of x, magnitude less than |y|
//   remainder - result reduced to the nearest multiple of y, ties to the
//               even quotient
//   remquo    - as remainder, plus the signed low 7 quotient bits via q7p
// Unless FINITE_ONLY_OPT() is set, y == 0, NaN y, or non-finite x give a
// quiet NaN (and a zero quotient for remquo).
#if defined(COMPILING_FMOD)
REQUIRES_16BIT_INSTS CONSTATTR half
MATH_MANGLE(fmod)(half x, half y)
#elif defined(COMPILING_REMQUO)
REQUIRES_16BIT_INSTS half
MATH_MANGLE(remquo)(half x, half y, __private int *q7p)
#else
REQUIRES_16BIT_INSTS CONSTATTR half
MATH_MANGLE(remainder)(half x, half y)
#endif
{
    // Quotient bits produced per reduction step
    const int bits = 11;

    float ax = (float)BUILTIN_ABS_F16(x);
    float ay = (float)BUILTIN_ABS_F16(y);
    float ret;
#if defined(COMPILING_REMQUO)
    int q7;
#endif

    if (ax > ay) {
        // Separate exponents from mantissas; the dividend mantissa is
        // pre-scaled by 2^bits, the divisor mantissa by 2.
        int ex = BUILTIN_FREXP_EXP_F32(ax) - 1;
        ax = BUILTIN_FLDEXP_F32(BUILTIN_FREXP_MANT_F32(ax), bits);
        int ey = BUILTIN_FREXP_EXP_F32(ay) - 1;
        ay = BUILTIN_FLDEXP_F32(BUILTIN_FREXP_MANT_F32(ay), 1);

        int nb = ex - ey;
        float ayinv = BUILTIN_RCP_F32(ay);

#if !defined(COMPILING_FMOD)
        int qacc = 0;
#endif

        // Full-width steps while more than `bits` quotient bits remain
        while (nb > bits) {
            float q = BUILTIN_RINT_F32(ax * ayinv);
            ax = BUILTIN_MAD_F32(-q, ay, ax);
            int went_neg = ax < 0.0f;
            float ax_fixed = ax + ay;
            ax = went_neg ? ax_fixed : ax;
#if defined(COMPILING_REMQUO)
            int iq = (int)q;
            iq -= went_neg;
            qacc = (qacc << bits) | iq;
#endif
            ax = BUILTIN_FLDEXP_F32(ax, bits);
            nb -= bits;
        }

        ax = BUILTIN_FLDEXP_F32(ax, nb - bits + 1);

        // Last, partial-width step
        {
            float q = BUILTIN_RINT_F32(ax * ayinv);
            ax = BUILTIN_MAD_F32(-q, ay, ax);
            int went_neg = ax < 0.0f;
            float ax_fixed = ax + ay;
            ax = went_neg ? ax_fixed : ax;
#if !defined(COMPILING_FMOD)
            int iq = (int)q;
            iq -= went_neg;
#if defined(COMPILING_REMQUO)
            qacc = (qacc << (nb+1)) | iq;
#else
            qacc = iq;
#endif
#endif
        }

#if !defined(COMPILING_FMOD)
        // Bring ax into (-y/2, y/2]; an exact tie goes to the even quotient
        int round_up = (2.0f*ax > ay) | ((qacc & 0x1) & (2.0f*ax == ay));
        ax = ax - (round_up ? ay : 0.0f);
#if defined(COMPILING_REMQUO)
        qacc += round_up;
        int qneg = samesign(x, y) ? 0 : -1;
        q7 = ((qacc & 0x7f) ^ qneg) - qneg;
#endif
#endif

        // Rescale, narrow to half, and fold in the sign bit of x
        ax = BUILTIN_FLDEXP_F32(ax, ey);
        short rbits = AS_SHORT((half)ax);
        rbits ^= AS_SHORT(x) & (short)SIGNBIT_HP16;
        ret = AS_HALF(rbits);
    } else {
        ret = x;
#if defined(COMPILING_REMQUO)
        q7 = 0;
#endif

#if !defined(COMPILING_FMOD)
        // Past the half-way point the nearest quotient is +/-1
        bool past_half = ax > 0.5f*ay;
        int qsgn = samesign(x,y) ? 1 : -1;
        half wrapped = MATH_MAD(y, -(half)qsgn, x);
        ret = past_half ? wrapped : ret;
#if defined(COMPILING_REMQUO)
        q7 = past_half ? qsgn : q7;
#endif
#endif

        // Equal magnitudes give a zero with the sign of x
        ret = ax == ay ? BUILTIN_COPYSIGN_F16(0.0h, x) : ret;
#if defined(COMPILING_REMQUO)
        q7 = ax == ay ? qsgn : q7;
#endif
    }

    if (!FINITE_ONLY_OPT()) {
        ret = y == 0.0h ? AS_HALF((short)QNANBITPATT_HP16) : ret;
#if defined(COMPILING_REMQUO)
        q7 = y == 0.0h ? 0 : q7;
#endif

        bool valid = !BUILTIN_ISNAN_F16(y) && BUILTIN_ISFINITE_F16(x);
        ret = valid ? ret : AS_HALF((short)QNANBITPATT_HP16);
#if defined(COMPILING_REMQUO)
        q7 = valid ? q7 : 0;
#endif
    }

#if defined(COMPILING_REMQUO)
    *q7p = q7;
#endif

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/remquoD.cl000066400000000000000000000005701415221260100210070ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#define COMPILING_REMQUO
#include "remainderD_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/remquoF.cl000066400000000000000000000005701415221260100210110ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#define COMPILING_REMQUO
#include "remainderF_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/remquoH.cl000066400000000000000000000012011415221260100210030ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

// Two-lane remquo: runs the scalar half remquo on each lane (low lane
// first) and writes both quotients through q7p as an int2.
half2
MATH_MANGLE2(remquo)(half2 x, half2 y, __private int2 *q7p)
{
    int quo_lo, quo_hi;
    half rem_lo = MATH_MANGLE(remquo)(x.lo, y.lo, &quo_lo);
    half rem_hi = MATH_MANGLE(remquo)(x.hi, y.hi, &quo_hi);

    *q7p = (int2)(quo_lo, quo_hi);

    half2 rem;
    rem.lo = rem_lo;
    rem.hi = rem_hi;
    return rem;
}

// The scalar remquo body comes from the shared remainder template.
#define COMPILING_REMQUO
#include "remainderH_base.h"
ROCm-Device-Libs-rocm-5.0.0/ocml/src/rhypotD.cl000066400000000000000000000021511415221260100210210ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathD.h"

// Reciprocal hypotenuse 1/sqrt(x^2 + y^2), double precision.
// Both magnitudes are scaled by 2^-e, where e is the exponent of the larger
// one, before squaring; the rsqrt estimate gets one correction step and
// the result is scaled by 2^-e again.
CONSTATTR double
MATH_MANGLE(rhypot)(double x, double y)
{
    double mag_x = BUILTIN_ABS_F64(x);
    double mag_y = BUILTIN_ABS_F64(y);
    double mag_max = BUILTIN_MAX_F64(mag_x, mag_y);

    int e = BUILTIN_FREXP_EXP_F64(mag_max);
    mag_x = BUILTIN_FLDEXP_F64(mag_x, -e);
    mag_y = BUILTIN_FLDEXP_F64(mag_y, -e);

    double sumsq = MATH_MAD(mag_x, mag_x, mag_y*mag_y);
    double r = BUILTIN_RSQRT_F64(sumsq);
    // err = 1 - sumsq*r^2; r += r*err*(1/2 + 3/8*err)
    double err = MATH_MAD(-sumsq*r, r, 1.0);
    r = MATH_MAD(r*err, MATH_MAD(err, 0.375, 0.5), r);

    double ret = BUILTIN_FLDEXP_F64(r, -e);

    if (!FINITE_ONLY_OPT()) {
        // Later selections override earlier ones: infinity beats NaN,
        // NaN beats the all-zero case.
        if (mag_max == 0.0)
            ret = AS_DOUBLE(PINFBITPATT_DP64);
        if (BUILTIN_ISNAN_F64(x) | BUILTIN_ISNAN_F64(y))
            ret = AS_DOUBLE(QNANBITPATT_DP64);
        if (BUILTIN_ISINF_F64(x) | BUILTIN_ISINF_F64(y))
            ret = 0.0;
    }

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/rhypotF.cl000066400000000000000000000015561415221260100210330ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathF.h"

// Reciprocal hypotenuse 1/sqrt(x^2 + y^2), single precision.
// The larger magnitude is picked by comparing the absolute values' bit
// patterns as unsigned integers. Both magnitudes are scaled by 2^-e of
// that value, and the rsqrt result is scaled by 2^-e again.
CONSTATTR float
MATH_MANGLE(rhypot)(float x, float y)
{
    float mag_x = BUILTIN_ABS_F32(x);
    float mag_y = BUILTIN_ABS_F32(y);
    float mag_max = AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(mag_x), AS_UINT(mag_y)));

    int e = BUILTIN_FREXP_EXP_F32(mag_max);
    mag_x = BUILTIN_FLDEXP_F32(mag_x, -e);
    mag_y = BUILTIN_FLDEXP_F32(mag_y, -e);

    float sumsq = MATH_MAD(mag_x, mag_x, mag_y*mag_y);
    float ret = BUILTIN_FLDEXP_F32(BUILTIN_RSQRT_F32(sumsq), -e);

    if (!FINITE_ONLY_OPT()) {
        // An infinite argument forces the result to zero
        if (BUILTIN_ISINF_F32(x) | BUILTIN_ISINF_F32(y))
            ret = 0.0f;
    }

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/rhypotH.cl000066400000000000000000000013021415221260100210220ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "mathH.h"

CONSTATTR BGEN(rhypot)

// Reciprocal hypotenuse 1/sqrt(x^2 + y^2), half precision.
// The sum of squares and the rsqrt are evaluated in single precision and
// the result is rounded to half.
CONSTATTR half
MATH_MANGLE(rhypot)(half x, half y)
{
    float wide_x = (float)x;
    float wide_y = (float)y;
    float sumsq = BUILTIN_MAD_F32(wide_x, wide_x, wide_y*wide_y);

    half ret = (half)BUILTIN_RSQRT_F32(sumsq);

    if (!FINITE_ONLY_OPT()) {
        // An infinite argument forces the result to zero
        if (BUILTIN_ISINF_F16(x) | BUILTIN_ISINF_F16(y))
            ret = 0.0h;
    }

    return ret;
}
ROCm-Device-Libs-rocm-5.0.0/ocml/src/rintD.cl000066400000000000000000000006471415221260100204600ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License.
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(rint)(double x) { return BUILTIN_RINT_F64(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rintF.cl000066400000000000000000000006451415221260100204600ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(rint)(float x) { return BUILTIN_RINT_F32(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rintH.cl000066400000000000000000000007651415221260100204650ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(rint)(half2 x) { return BUILTIN_RINT_2F16(x); } CONSTATTR half MATH_MANGLE(rint)(half x) { return BUILTIN_RINT_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rlen3D.cl000066400000000000000000000027631415221260100205300ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(rlen3)(double x, double y, double z) { double a = BUILTIN_ABS_F64(x); double b = BUILTIN_ABS_F64(y); double c = BUILTIN_ABS_F64(z); double a1 = BUILTIN_MAX_F64(a, b); double b1 = BUILTIN_MIN_F64(a, b); a = BUILTIN_MAX_F64(a1, c); double c1 = BUILTIN_MIN_F64(a1, c); b = BUILTIN_MAX_F64(b1, c1); c = BUILTIN_MIN_F64(b1, c1); int e = BUILTIN_FREXP_EXP_F64(a); a = BUILTIN_FLDEXP_F64(a, -e); b = BUILTIN_FLDEXP_F64(b, -e); c = BUILTIN_FLDEXP_F64(c, -e); double d2 = MATH_MAD(a, a, MATH_MAD(b, b, c*c)); double v = BUILTIN_RSQRT_F64(d2); double u = MATH_MAD(-d2*v, v, 1.0); v = MATH_MAD(v*u, MATH_MAD(u, 0.375, 0.5), v); double ret = BUILTIN_FLDEXP_F64(v, -e); if (!FINITE_ONLY_OPT()) { ret = a == 0.0 ? AS_DOUBLE(PINFBITPATT_DP64) : ret; ret = (BUILTIN_ISNAN_F64(x) | BUILTIN_ISNAN_F64(y) | BUILTIN_ISNAN_F64(z)) ? AS_DOUBLE(QNANBITPATT_DP64) : ret; ret = (BUILTIN_ISINF_F64(x) | BUILTIN_ISINF_F64(y) | BUILTIN_ISINF_F64(z)) ? 0.0 : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rlen3F.cl000066400000000000000000000025101415221260100205200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(rlen3)(float x, float y, float z) { float a = BUILTIN_ABS_F32(x); float b = BUILTIN_ABS_F32(y); float c = BUILTIN_ABS_F32(z); float a1 = AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(a), AS_UINT(b))); float b1 = AS_FLOAT(BUILTIN_MIN_U32(AS_UINT(a), AS_UINT(b))); a = AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(a1), AS_UINT(c))); float c1 = AS_FLOAT(BUILTIN_MIN_U32(AS_UINT(a1), AS_UINT(c))); b = AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(b1), AS_UINT(c1))); c = AS_FLOAT(BUILTIN_MIN_U32(AS_UINT(b1), AS_UINT(c1))); int e = BUILTIN_FREXP_EXP_F32(a); a = BUILTIN_FLDEXP_F32(a, -e); b = BUILTIN_FLDEXP_F32(b, -e); c = BUILTIN_FLDEXP_F32(c, -e); float ret = BUILTIN_RSQRT_F32(MATH_MAD(a, a, MATH_MAD(b, b, c*c))); ret = BUILTIN_FLDEXP_F32(ret, -e); if (!FINITE_ONLY_OPT()) { ret = (BUILTIN_ISINF_F32(x) | BUILTIN_ISINF_F32(y) | BUILTIN_ISINF_F32(z)) ? 0.0f : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rlen3H.cl000066400000000000000000000014121415221260100205220ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half MATH_MANGLE(rlen3)(half x, half y, half z) { float fx = (float)x; float fy = (float)y; float fz = (float)z; float d2 = BUILTIN_MAD_F32(fx, fx, BUILTIN_MAD_F32(fy, fy, fz*fz)); half ret = (half)BUILTIN_RSQRT_F32(d2); if (!FINITE_ONLY_OPT()) { ret = (BUILTIN_ISINF_F16(x) | BUILTIN_ISINF_F16(y) | BUILTIN_ISINF_F16(z)) ? 
0.0h : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rlen4D.cl000066400000000000000000000035031415221260100205220ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(rlen4)(double x, double y, double z, double w) { double a = BUILTIN_ABS_F64(x); double b = BUILTIN_ABS_F64(y); double c = BUILTIN_ABS_F64(z); double d = BUILTIN_ABS_F64(w); double a1 = BUILTIN_MAX_F64(a, b); double b1 = BUILTIN_MIN_F64(a, b); double c1 = BUILTIN_MAX_F64(c, d); double d1 = BUILTIN_MIN_F64(c, d); a = BUILTIN_MAX_F64(a1, c1); double c2 = BUILTIN_MIN_F64(a1, c1); double b2 = BUILTIN_MAX_F64(b1, d1); d = BUILTIN_MIN_F64(b1, d1); b = BUILTIN_MAX_F64(b2, c2); c = BUILTIN_MIN_F64(b2, c2); int e = BUILTIN_FREXP_EXP_F64(a); a = BUILTIN_FLDEXP_F64(a, -e); b = BUILTIN_FLDEXP_F64(b, -e); c = BUILTIN_FLDEXP_F64(c, -e); d = BUILTIN_FLDEXP_F64(d, -e); double l2 = MATH_MAD(a, a, MATH_MAD(b, b, MATH_MAD(c, c, d*d))); double v = BUILTIN_RSQRT_F64(l2); double u = MATH_MAD(-l2*v, v, 1.0); v = MATH_MAD(v*u, MATH_MAD(u, 0.375, 0.5), v); double ret = BUILTIN_FLDEXP_F64(v, -e); if (!FINITE_ONLY_OPT()) { ret = a == 0.0 ? AS_DOUBLE(PINFBITPATT_DP64) : ret; ret = (BUILTIN_ISNAN_F64(x) | BUILTIN_ISNAN_F64(y) | BUILTIN_ISNAN_F64(z) | BUILTIN_ISNAN_F64(w)) ? AS_DOUBLE(QNANBITPATT_DP64) : ret; ret = (BUILTIN_ISINF_F64(x) | BUILTIN_ISINF_F64(y) | BUILTIN_ISINF_F64(z) | BUILTIN_ISINF_F64(w)) ? 
0.0 : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rlen4F.cl000066400000000000000000000033151415221260100205250ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(rlen4)(float x, float y, float z, float w) { float a = BUILTIN_ABS_F32(x); float b = BUILTIN_ABS_F32(y); float c = BUILTIN_ABS_F32(z); float d = BUILTIN_ABS_F32(w); float a1 = AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(a), AS_UINT(b))); float b1 = AS_FLOAT(BUILTIN_MIN_U32(AS_UINT(a), AS_UINT(b))); float c1 = AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(c), AS_UINT(d))); float d1 = AS_FLOAT(BUILTIN_MIN_U32(AS_UINT(c), AS_UINT(d))); a = AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(a1), AS_UINT(c1))); float c2 = AS_FLOAT(BUILTIN_MIN_U32(AS_UINT(a1), AS_UINT(c1))); float b2 = AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(b1), AS_UINT(d1))); d = AS_FLOAT(BUILTIN_MIN_U32(AS_UINT(b1), AS_UINT(d1))); b = AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(b2), AS_UINT(c2))); c = AS_FLOAT(BUILTIN_MIN_U32(AS_UINT(b2), AS_UINT(c2))); int e = BUILTIN_FREXP_EXP_F32(a); a = BUILTIN_FLDEXP_F32(a, -e); b = BUILTIN_FLDEXP_F32(b, -e); c = BUILTIN_FLDEXP_F32(c, -e); d = BUILTIN_FLDEXP_F32(d, -e); float ret = BUILTIN_FLDEXP_F32(BUILTIN_RSQRT_F32(MATH_MAD(a, a, MATH_MAD(b, b, MATH_MAD(c, c, d*d)))), -e); if (!FINITE_ONLY_OPT()) { ret = (BUILTIN_ISINF_F32(x) | BUILTIN_ISINF_F32(y) | BUILTIN_ISINF_F32(z) | BUILTIN_ISINF_F32(w)) ? 
0.0f : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rlen4H.cl000066400000000000000000000015521415221260100205300ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half MATH_MANGLE(rlen4)(half x, half y, half z, half w) { float fx = (float)x; float fy = (float)y; float fz = (float)z; float fw = (float)w; float d2 = BUILTIN_MAD_F32(fx, fx, BUILTIN_MAD_F32(fy, fy, BUILTIN_MAD_F32(fz, fz, fw*fw))); half ret = (half)BUILTIN_RSQRT_F32(d2); if (!FINITE_ONLY_OPT()) { ret = (BUILTIN_ISINF_F16(x) | BUILTIN_ISINF_F16(y) | BUILTIN_ISINF_F16(z) | BUILTIN_ISINF_F16(w)) ? 0.0h : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rootnD.cl000066400000000000000000000005611415221260100206400ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define COMPILING_ROOTN #include "powD_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/rootnF.cl000066400000000000000000000005611415221260100206420ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define COMPILING_ROOTN #include "powF_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/rootnH.cl000066400000000000000000000010261415221260100206410ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(rootn)(half2 x, int2 ny) { return (half2)(MATH_MANGLE(rootn)(x.lo, ny.lo), MATH_MANGLE(rootn)(x.hi, ny.hi)); } #define COMPILING_ROOTN #include "powH_base.h" ROCm-Device-Libs-rocm-5.0.0/ocml/src/roundD.cl000066400000000000000000000006511415221260100206260ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(round)(double x) { return BUILTIN_ROUND_F64(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/roundF.cl000066400000000000000000000006471415221260100206350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(round)(float x) { return BUILTIN_ROUND_F32(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/roundH.cl000066400000000000000000000007711415221260100206350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(round)(half2 x) { return BUILTIN_ROUND_2F16(x); } CONSTATTR half MATH_MANGLE(round)(half x) { return BUILTIN_ROUND_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rsqrtD.cl000066400000000000000000000011271415221260100206510ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(rsqrt)(double x) { double y0 = BUILTIN_RSQRT_F64(x); double e = MATH_MAD(-x*y0, y0, 1.0); double y1 = MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0); return BUILTIN_CLASS_F64(y0, CLASS_PSUB|CLASS_PNOR) ? y1 : y0; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rsqrtF.cl000066400000000000000000000011171415221260100206520ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(rsqrt)(float x) { if (DAZ_OPT()) { return BUILTIN_RSQRT_F32(x); } else { bool s = x < 0x1.0p-100f; return BUILTIN_RSQRT_F32(x * (s ? 0x1.0p+100f : 1.0f)) * (s ? 0x1.0p+50f : 1.0f); } } ROCm-Device-Libs-rocm-5.0.0/ocml/src/rsqrtH.cl000066400000000000000000000007211415221260100206540ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(rsqrt) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(rsqrt)(half x) { return BUILTIN_RSQRT_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/scalbD.cl000066400000000000000000000016501415221260100205630ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(scalb)(double x, double y) { double t = BUILTIN_MIN_F64(BUILTIN_MAX_F64(y, -0x1.0p+20), 0x1.0p+20); double ret = MATH_MANGLE(ldexp)(x, (int)BUILTIN_RINT_F64(t)); if (!FINITE_ONLY_OPT()) { ret = (BUILTIN_ISNAN_F64(x) | BUILTIN_ISNAN_F64(y)) ? AS_DOUBLE(QNANBITPATT_DP64) : ret; ret = (BUILTIN_CLASS_F64(x, CLASS_NZER|CLASS_PZER) & BUILTIN_CLASS_F64(y, CLASS_PINF)) ? AS_DOUBLE(QNANBITPATT_DP64) : ret; ret = (BUILTIN_ISINF_F64(x) & BUILTIN_CLASS_F64(y, CLASS_NINF)) ? 
AS_DOUBLE(QNANBITPATT_DP64) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/scalbF.cl000066400000000000000000000015741415221260100205720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(scalb)(float x, float y) { float t = BUILTIN_CLAMP_F32(y, -0x1.0p+20f, 0x1.0p+20f); float ret = MATH_MANGLE(ldexp)(x, (int)BUILTIN_RINT_F32(t)); if (!FINITE_ONLY_OPT()) { ret = (BUILTIN_ISNAN_F32(x) | BUILTIN_ISNAN_F32(y)) ? AS_FLOAT(QNANBITPATT_SP32) : ret; ret = (BUILTIN_ISINF_F32(x) & BUILTIN_CLASS_F32(y, CLASS_PINF)) ? AS_FLOAT(QNANBITPATT_SP32) : ret; ret = (BUILTIN_ISINF_F32(x) & BUILTIN_CLASS_F32(y, CLASS_NINF)) ? AS_FLOAT(QNANBITPATT_SP32) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/scalbH.cl000066400000000000000000000017301415221260100205660ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR BGEN(scalb) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(scalb)(half x, half y) { half t = BUILTIN_MIN_F16(BUILTIN_MAX_F16(y, -0x1.0p+6h), 0x1.0p+6h); half ret = MATH_MANGLE(ldexp)(x, (int)BUILTIN_RINT_F16(t)); if (!FINITE_ONLY_OPT()) { ret = (BUILTIN_ISNAN_F16(x) | BUILTIN_ISNAN_F16(y)) ? AS_HALF((short)QNANBITPATT_HP16) : ret; ret = (BUILTIN_CLASS_F16(x, CLASS_NZER|CLASS_PZER) & BUILTIN_CLASS_F16(y, CLASS_PINF)) ? AS_HALF((short)QNANBITPATT_HP16) : ret; ret = (BUILTIN_ISINF_F16(x) & BUILTIN_CLASS_F16(y, CLASS_NINF)) ? 
AS_HALF((short)QNANBITPATT_HP16) : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/scalbnD.cl000066400000000000000000000006651415221260100207460ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(scalbn)(double x, int n) { return MATH_MANGLE(ldexp)(x, n); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/scalbnF.cl000066400000000000000000000006631415221260100207460ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(scalbn)(float x, int n) { return MATH_MANGLE(ldexp)(x, n); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/scalbnH.cl000066400000000000000000000011521415221260100207420ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" REQUIRES_16BIT_INSTS CONSTATTR half2 MATH_MANGLE2(scalbn)(half2 x, int2 n) { return (half2)(MATH_MANGLE(ldexp)(x.lo, n.lo), MATH_MANGLE(ldexp)(x.hi, n.hi)); } REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(scalbn)(half x, int n) { return MATH_MANGLE(ldexp)(x, n); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/signbitD.cl000066400000000000000000000006451415221260100211410ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR int MATH_MANGLE(signbit)(double x) { return AS_INT2(x).hi < 0; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/signbitF.cl000066400000000000000000000006371415221260100211440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR int MATH_MANGLE(signbit)(float x) { return AS_INT(x) < 0; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/signbitH.cl000066400000000000000000000011221415221260100211340ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR short2 MATH_MANGLE2(signbit)(half2 x) { return (short2) (AS_SHORT(x.lo) < 0 ? (short)-1 : (short)0, AS_SHORT(x.hi) < 0 ? 
(short)-1 : (short)0); } CONSTATTR int MATH_MANGLE(signbit)(half x) { return AS_SHORT(x) < 0; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinD.cl000066400000000000000000000014561415221260100202740ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigredD.h" CONSTATTR double MATH_MANGLE(sin)(double x) { double ax = BUILTIN_ABS_F64(x); struct redret r = MATH_PRIVATE(trigred)(ax); struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo); int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c); s.hi ^= (r.i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000); if (!FINITE_ONLY_OPT()) { s = BUILTIN_ISFINITE_F64(ax) ? s : AS_INT2(QNANBITPATT_DP64); } return AS_DOUBLE(s); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinF.cl000066400000000000000000000016161415221260100202740ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" float MATH_MANGLE(sin)(float x) { float ax = BUILTIN_ABS_F32(x); struct redret r = MATH_PRIVATE(trigred)(ax); #if defined EXTRA_PRECISION struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo); #else struct scret sc = MATH_PRIVATE(sincosred)(r.hi); #endif float s = (r.i & 1) != 0 ? sc.c : sc.s; s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0) ^ (AS_INT(x) ^ AS_INT(ax))); if (!FINITE_ONLY_OPT()) { s = BUILTIN_ISFINITE_F32(ax) ? 
s : AS_FLOAT(QNANBITPATT_SP32); } return s; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinH.cl000066400000000000000000000015221415221260100202720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigredH.h" UGEN(sin) REQUIRES_16BIT_INSTS half MATH_MANGLE(sin)(half x) { half ax = BUILTIN_ABS_F16(x); struct redret r = MATH_PRIVATE(trigred)(ax); struct scret sc = MATH_PRIVATE(sincosred)(r.hi); short s = AS_SHORT((r.i & (short)1) == (short)0 ? sc.s : sc.c); s ^= (r.i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000); if (!FINITE_ONLY_OPT()) { s = BUILTIN_ISFINITE_F16(ax) ?(short)QNANBITPATT_HP16 : s; } return AS_HALF(s); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinbD.cl000066400000000000000000000025741415221260100204400ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigredD.h" #define FSUM2(A, B, H, L) \ do { \ double __s = A + B; \ double __t = B - (__s - A); \ H = __s; \ L = __t; \ } while (0) #define FDIF2(A, B, H, L) \ do { \ double __d = A - B; \ double __e = (A - __d) - B; \ H = __d; \ L = __e; \ } while (0) double MATH_PRIVATE(sinb)(double x, int n, double p) { struct redret r = MATH_PRIVATE(trigred)(x); bool b = r.hi < p; r.i = (r.i - b - n) & 3; // This is a properly signed extra precise pi/4 double ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0))); double pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 
0x80000000 : 0))); double sh, sl; FDIF2(ph, p, ph, sl); pl += sl; FSUM2(ph, pl, ph, pl); FSUM2(ph, r.hi, sh, sl); sl += pl + r.lo; FSUM2(sh, sl, sh, sl); struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl); int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c); s.hi ^= r.i > 1 ? 0x80000000 : 0; return AS_DOUBLE(s); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinbF.cl000066400000000000000000000026661415221260100204440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" #define FSUM2(A, B, H, L) \ do { \ float __s = A + B; \ float __t = B - (__s - A); \ H = __s; \ L = __t; \ } while (0) #define FDIF2(A, B, H, L) \ do { \ float __d = A - B; \ float __e = (A - __d) - B; \ H = __d; \ L = __e; \ } while (0) float MATH_PRIVATE(sinb)(float x, int n, float p) { struct redret r = MATH_PRIVATE(trigred)(x); bool b = r.hi < p; r.i = (r.i - b - n) & 3; #if defined EXTRA_PRECISION float ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0)); float pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0)); float sh, sl; FDIF2(ph, p, ph, sl); pl += sl; FSUM2(ph, pl, ph, pl); FSUM2(ph, r.hi, sh, sl); sl += pl + r.lo; FSUM2(sh, sl, sh, sl); struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl); #else r.hi = r.hi - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0)); struct scret sc = MATH_PRIVATE(sincosred)(r.hi); #endif float s = (r.i & 1) != 0 ? sc.c : sc.s; s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 
0x80000000 : 0)); return s; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincosD.cl000066400000000000000000000020451415221260100207740ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigredD.h" double MATH_MANGLE(sincos)(double x, __private double * cp) { double ax = BUILTIN_ABS_F64(x); struct redret r = MATH_PRIVATE(trigred)(ax); struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo); int flip = r.i > 1 ? (int)0x80000000 : 0; bool odd = (r.i & 1) != 0; int2 s = AS_INT2(odd ? sc.c : sc.s); s.hi ^= flip ^ (AS_INT2(x).hi &(int)0x80000000); sc.s = -sc.s; int2 c = AS_INT2(odd ? sc.s : sc.c); c.hi ^= flip; if (!FINITE_ONLY_OPT()) { bool finite = BUILTIN_ISFINITE_F64(x); s = finite ? s : AS_INT2(QNANBITPATT_DP64); c = finite ? c : AS_INT2(QNANBITPATT_DP64); } *cp = AS_DOUBLE(c); return AS_DOUBLE(s); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincosF.cl000066400000000000000000000021631415221260100207770ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" float MATH_MANGLE(sincos)(float x, __private float *cp) { float ax = BUILTIN_ABS_F32(x); struct redret r = MATH_PRIVATE(trigred)(ax); #if defined EXTRA_PRECISION struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo); #else struct scret sc = MATH_PRIVATE(sincosred)(r.hi); #endif int flip = r.i > 1 ? 0x80000000 : 0; bool odd = (r.i & 1) != 0; float s = odd ? 
sc.c : sc.s; s = AS_FLOAT(AS_INT(s) ^ flip ^ (AS_INT(ax) ^ AS_INT(x))); sc.s = -sc.s; float c = odd ? sc.s : sc.c; c = AS_FLOAT(AS_INT(c) ^ flip); if (!FINITE_ONLY_OPT()) { bool finite = BUILTIN_ISFINITE_F32(ax); c = finite ? c : AS_FLOAT(QNANBITPATT_SP32); s = finite ? s : AS_FLOAT(QNANBITPATT_SP32); } *cp = c; return s; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincosH.cl000066400000000000000000000024711415221260100210030ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigredH.h" REQUIRES_16BIT_INSTS half2 MATH_MANGLE2(sincos)(half2 x, __private half2 *cp) { half2 s; half clo, chi; s.lo = MATH_MANGLE(sincos)(x.lo, &clo); s.hi = MATH_MANGLE(sincos)(x.hi, &chi); *cp = (half2)(clo, chi); return s; } REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(sincos)(half x, __private half *cp) { half ax = BUILTIN_ABS_F16(x); struct redret r = MATH_PRIVATE(trigred)(ax); struct scret sc = MATH_PRIVATE(sincosred)(r.hi); short flip = r.i > (short)1 ? (short)0x8000 : (short)0; bool odd = (r.i & (short)1) != (short)0; short s = AS_SHORT(odd ? sc.c : sc.s); s ^= flip ^ (AS_SHORT(x) & (short)0x8000); sc.s = -sc.s; short c = AS_SHORT(odd ? sc.s : sc.c); c ^= flip; if (!FINITE_ONLY_OPT()) { bool finite = BUILTIN_ISFINITE_F16(ax); c = finite ? c : (short)QNANBITPATT_HP16; s = finite ? s : (short)QNANBITPATT_HP16; } *cp = AS_HALF(c); return AS_HALF(s); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincospiD.cl000066400000000000000000000020621415221260100213240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. 
See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigpiredD.h" double MATH_MANGLE(sincospi)(double x, __private double * cp) { struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x)); struct scret sc = MATH_PRIVATE(sincospired)(r.hi); int flip = r.i > 1 ? (int)0x80000000 : 0; bool odd = (r.i & 1) != 0; int2 s = AS_INT2(odd ? sc.c : sc.s); s.hi ^= flip ^ (AS_INT2(x).hi & 0x80000000); sc.s = -sc.s; int2 c = AS_INT2(odd ? sc.s : sc.c); c.hi ^= flip; if (!FINITE_ONLY_OPT()) { bool nori = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF); s = nori ? AS_INT2(QNANBITPATT_DP64) : s; c = nori ? AS_INT2(QNANBITPATT_DP64) : c; } *cp = AS_DOUBLE(c); return AS_DOUBLE(s); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincospiF.cl000066400000000000000000000020251415221260100213250ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigpiredF.h" float MATH_MANGLE(sincospi)(float x, __private float *cp) { float ax = BUILTIN_ABS_F32(x); struct redret r = MATH_PRIVATE(trigpired)(ax); struct scret sc = MATH_PRIVATE(sincospired)(r.hi); int flip = r.i > 1 ? 0x80000000 : 0; bool odd = (r.i & 1) != 0; float s = odd ? sc.c : sc.s; s = AS_FLOAT(AS_INT(s) ^ flip ^ (AS_INT(ax) ^ AS_INT(x))); sc.s = -sc.s; float c = odd ? sc.s : sc.c; c = AS_FLOAT(AS_INT(c) ^ flip); if (!FINITE_ONLY_OPT()) { bool finite = BUILTIN_ISFINITE_F32(ax); c = finite ? c : AS_FLOAT(QNANBITPATT_SP32); s = finite ? 
s : AS_FLOAT(QNANBITPATT_SP32); } *cp = c; return s; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincospiH.cl000066400000000000000000000025171415221260100213350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigpiredH.h" REQUIRES_16BIT_INSTS half2 MATH_MANGLE2(sincospi)(half2 x, __private half2 *cp) { half2 s; half clo, chi; s.lo = MATH_MANGLE(sincospi)(x.lo, &clo); s.hi = MATH_MANGLE(sincospi)(x.hi, &chi); *cp = (half2)(clo, chi); return s; } REQUIRES_16BIT_INSTS half MATH_MANGLE(sincospi)(half x, __private half *cp) { struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x)); struct scret sc = MATH_PRIVATE(sincospired)(r.hi); short flip = r.i > (short)1 ? (short)0x8000 : (short)0; bool odd = (r.i & (short)1) != (short)0; short s = AS_SHORT(odd ? sc.c : sc.s); s ^= flip ^ (AS_SHORT(x) & (short)0x8000); sc.s = -sc.s; short c = AS_SHORT(odd ? sc.s : sc.c); c ^= flip; if (!FINITE_ONLY_OPT()) { bool nori = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF); c = nori ? (short)QNANBITPATT_HP16 : c; s = nori ? (short)QNANBITPATT_HP16 : s; } *cp = AS_HALF(c); return AS_HALF(s); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincospiredD.cl000066400000000000000000000022721415221260100220220ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigpiredD.h" CONSTATTR struct scret MATH_PRIVATE(sincospired)(double x) { double t = x * x; double sx = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.e357ef99eb0bbp-12, -0x1.e2fe76fdffd2bp-8), 0x1.50782d5f14825p-4), -0x1.32d2ccdfe9424p-1), 0x1.466bc67754fffp+1), -0x1.4abbce625be09p+2); sx = x * t * sx; sx = MATH_MAD(x, 0x1.921fb54442d18p+1, sx); double cx = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, -0x1.b167302e21c33p-14, 0x1.f9c89ca1d4f33p-10), -0x1.a6d1e7294bff9p-6), 0x1.e1f5067b90b37p-3), -0x1.55d3c7e3c325bp+0), 0x1.03c1f081b5a67p+2), -0x1.3bd3cc9be45dep+2); cx = MATH_MAD(t, cx, 1.0); struct scret ret; ret.c = cx; ret.s = sx; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincospiredF.cl000066400000000000000000000016601415221260100220240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" CONSTATTR struct scret MATH_PRIVATE(sincospired)(float x) { float t = x * x; float sx = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.eb5482p-3f, -0x1.3e497cp-1f), 0x1.468e6cp+1f), -0x1.4abc1cp+2f); sx = x * t * sx; sx = MATH_MAD(x, 0x1.921fb6p+1f, sx); float cx = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.97ca88p-5f, 0x1.c85d3ap-3f), -0x1.55a3b4p+0f), 0x1.03c1a6p+2f), -0x1.3bd3ccp+2f); cx = MATH_MAD(t, cx, 1.0f); struct scret ret; ret.c = cx; ret.s = sx; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincospiredH.cl000066400000000000000000000013161415221260100220240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigpiredH.h" CONSTATTR struct scret MATH_PRIVATE(sincospired)(half x) { half t = x * x; half sx = MATH_MAD(t, 0x1.b84p+0h, -0x1.46cp+2h); sx = x * t * sx; sx = MATH_MAD(x, 0x1.92p+1h, sx); half cx = MATH_MAD(t, 0x1.fbp+1h, -0x1.3bcp+2h); cx = MATH_MAD(t, cx, 1.0h); struct scret ret; ret.c = cx; ret.s = sx; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincosred2D.cl000066400000000000000000000027641415221260100215610ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigredD.h" CONSTATTR struct scret MATH_PRIVATE(sincosred2)(double x, double y) { const double S0 = -0x1.5555555555555p-3; const double S1 = 0x1.1111111110bb3p-7; const double S2 = -0x1.a01a019e83e5cp-13; const double S3 = 0x1.71de3796cde01p-19; const double S4 = -0x1.ae600b42fdfa7p-26; const double S5 = 0x1.5e0b2f9a43bb8p-33; const double C0 = 0x1.5555555555555p-5; const double C1 = -0x1.6c16c16c16967p-10; const double C2 = 0x1.a01a019f4ec90p-16; const double C3 = -0x1.27e4fa17f65f6p-22; const double C4 = 0x1.1eeb69037ab78p-29; const double C5 = -0x1.907db46cc5e42p-37; double x2 = x*x; double x3 = x * x2; double r = 0.5 * x2; double t = 1.0 - r; double u = 1.0 - t; double v = u - r; double cxy = t + MATH_MAD(x2*x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, C5, C4), C3), C2), C1), C0), MATH_MAD(x, -y, v)); double sxy = MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, S5, S4), S3), S2), S1); sxy = x - MATH_MAD(-x3, S0, MATH_MAD(x2, MATH_MAD(-x3, sxy, 0.5*y), -y)); struct scret ret; ret.c = cxy; ret.s = sxy; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincosred2F.cl000066400000000000000000000022671415221260100215610ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" CONSTATTR struct scret MATH_PRIVATE(sincosred2)(float x, float y) { const float c0 = 0x1.555556p-5f; const float c1 = -0x1.6c16b2p-10f; const float c2 = 0x1.a00e98p-16f; const float c3 = -0x1.23c5e0p-22f; const float s0 = -0x1.555556p-3f; const float s1 = 0x1.11110ep-7f; const float s2 = -0x1.a0139ep-13f; const float s3 = 0x1.6dbc3ap-19f; float x2 = x*x; float x3 = x * x2; float r = 0.5f * x2; float t = 1.0f - r; float u = 1.0f - t; float v = u - r; float cxy = t + MATH_MAD(x2*x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, c3, c2), c1), c0), MATH_MAD(x, -y, v)); float sxy = MATH_MAD(x2, MATH_MAD(x2, s3, s2), s1); sxy = x - MATH_MAD(-x3, s0, MATH_MAD(x2, MATH_MAD(-x3, sxy, 0.5f*y), -y)); struct scret ret; ret.c = cxy; ret.s = sxy; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincosredD.cl000066400000000000000000000026231415221260100214710ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigredD.h" CONSTATTR struct scret MATH_PRIVATE(sincosred)(double x) { const double S0 = -0x1.5555555555555p-3; const double S1 = 0x1.1111111110bb3p-7; const double S2 = -0x1.a01a019e83e5cp-13; const double S3 = 0x1.71de3796cde01p-19; const double S4 = -0x1.ae600b42fdfa7p-26; const double S5 = 0x1.5e0b2f9a43bb8p-33; const double C0 = 0x1.5555555555555p-5; const double C1 = -0x1.6c16c16c16967p-10; const double C2 = 0x1.a01a019f4ec90p-16; const double C3 = -0x1.27e4fa17f65f6p-22; const double C4 = 0x1.1eeb69037ab78p-29; const double C5 = -0x1.907db46cc5e42p-37; double x2 = x*x; double r = 0.5 * x2; double t = 1.0 - r; double u = 1.0 - t; double v = u - r; double cx = t + MATH_MAD(x2*x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, C5, C4), C3), C2), C1), C0), v); double sx = MATH_MAD(x2*x, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, S5, S4), S3), S2), S1), S0), x); struct scret ret; ret.c = cx; ret.s = sx; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincosredF.cl000066400000000000000000000014161415221260100214720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" CONSTATTR struct scret MATH_PRIVATE(sincosred)(float x) { float t = x * x; float s = MATH_MAD(x, t*MATH_MAD(t, MATH_MAD(t, -0x1.983304p-13f, 0x1.110388p-7f), -0x1.55553ap-3f), x); float c = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.aea668p-16f, -0x1.6c9e76p-10f), 0x1.5557eep-5f), -0x1.000008p-1f), 1.0f); struct scret ret; ret.c = c; ret.s = s; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sincosredH.cl000066400000000000000000000012161415221260100214720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigredH.h" CONSTATTR struct scret MATH_PRIVATE(sincosred)(half x) { half t = x * x; half s = MATH_MAD(x, t*MATH_MAD(t, 0x1.0bp-7h, -0x1.554p-3h), x); half c = MATH_MAD(t, MATH_MAD(t, 0x1.4b4p-5h, -0x1.ffcp-2h), 1.0h); struct scret ret; ret.c = c; ret.s = s; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinhD.cl000066400000000000000000000015511415221260100204400ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x); CONSTATTR double MATH_MANGLE(sinh)(double x) { double y = BUILTIN_ABS_F64(x); double2 e = MATH_PRIVATE(epexpep)(sub(y, con(0x1.62e42fefa39efp-1,0x1.abc9e3b39803fp-56))); double2 s = fsub(e, ldx(rcp(e), -2)); double z = s.hi; if (!FINITE_ONLY_OPT()) { z = y >= 0x1.633ce8fb9f87ep+9 ? AS_DOUBLE(PINFBITPATT_DP64) : z; } z = y < 0x1.0p-27 ? y : z; return BUILTIN_COPYSIGN_F64(z, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinhF.cl000066400000000000000000000015171415221260100204440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #define FLOAT_SPECIALIZATION #include "ep.h" extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x); CONSTATTR float MATH_MANGLE(sinh)(float x) { float y = BUILTIN_ABS_F32(x); float2 e = MATH_PRIVATE(epexpep)(sub(y, con(0x1.62e430p-1f, -0x1.05c610p-29f))); float2 s = fsub(e, ldx(rcp(e), -2)); float z = s.hi; if (!FINITE_ONLY_OPT()) { z = y > 0x1.65a9f8p+6f ? AS_FLOAT(PINFBITPATT_SP32) : z; } z = y < 0x1.0p-12f ? y : z; return BUILTIN_COPYSIGN_F32(z, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinhH.cl000066400000000000000000000010141415221260100204360ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(sinh) CONSTATTR half MATH_MANGLE(sinh)(half hx) { float x = (float)hx * 0x1.715476p+0f; return (half)(0.5f * (BUILTIN_EXP2_F32(x) - BUILTIN_EXP2_F32(-x))); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinpiD.cl000066400000000000000000000014721415221260100206230ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigpiredD.h" double MATH_MANGLE(sinpi)(double x) { struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x)); struct scret sc = MATH_PRIVATE(sincospired)(r.hi); int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c); s.hi ^= (r.i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000); if (!FINITE_ONLY_OPT()) { s = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? AS_INT2(QNANBITPATT_DP64) : s; } return AS_DOUBLE(s); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinpiF.cl000066400000000000000000000014501415221260100206210ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigpiredF.h" CONSTATTR float MATH_MANGLE(sinpi)(float x) { float ax = BUILTIN_ABS_F32(x); struct redret r = MATH_PRIVATE(trigpired)(ax); struct scret sc = MATH_PRIVATE(sincospired)(r.hi); float s = (r.i & 1) == 0 ? sc.s : sc.c; s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0) ^ (AS_INT(x) ^ AS_INT(ax))); if (!FINITE_ONLY_OPT()) { s = BUILTIN_ISFINITE_F32(ax) ? 
s : AS_FLOAT(QNANBITPATT_SP32); } return s; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sinpiH.cl000066400000000000000000000015641415221260100206310ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigpiredH.h" UGEN(sinpi) REQUIRES_16BIT_INSTS half MATH_MANGLE(sinpi)(half x) { struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x)); struct scret sc = MATH_PRIVATE(sincospired)(r.hi); short s = AS_SHORT((r.i & (short)1) == (short)0 ? sc.s : sc.c); s ^= (r.i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000); if (!FINITE_ONLY_OPT()) { s = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? (short)QNANBITPATT_HP16 : s; } return AS_HALF(s); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/sqrtD.cl000066400000000000000000000011701415221260100204650ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(sqrt)(double x) { return MATH_SQRT(x); } #define GEN(LN,UN) \ CONSTATTR double \ MATH_MANGLE(LN)(double x) \ { \ return BUILTIN_##UN##_F64(x); \ } // GEN(sqrt_rte,SQRT_RTE) // GEN(sqrt_rtn,SQRT_RTN) // GEN(sqrt_rtp,SQRT_RTP) // GEN(sqrt_rtz,SQRT_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/sqrtF.cl000066400000000000000000000013231415221260100204670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(sqrt)(float x) { if (CORRECTLY_ROUNDED_SQRT32()) { return MATH_SQRT(x); } else { return MATH_FAST_SQRT(x); } } #define GEN(LN,UN) \ CONSTATTR float \ MATH_MANGLE(LN)(float x) \ { \ return BUILTIN_##UN##_F32(x); \ } // GEN(sqrt_rte,SQRT_RTE) // GEN(sqrt_rtn,SQRT_RTN) // GEN(sqrt_rtp,SQRT_RTP) // GEN(sqrt_rtz,SQRT_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/sqrtH.cl000066400000000000000000000012151415221260100204710ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(sqrt) CONSTATTR half MATH_MANGLE(sqrt)(half x) { return BUILTIN_SQRT_F16(x); } #define GEN(LN,UN) \ CONSTATTR half \ MATH_MANGLE(LN)(half x) \ { \ return BUILTIN_##UN##_F16(x); \ } // GEN(sqrt_rte,SQRT_RTE) // GEN(sqrt_rtp,SQRT_RTN) // GEN(sqrt_rtn,SQRT_RTP) // GEN(sqrt_rtz,SQRT_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/subD.cl000066400000000000000000000010621415221260100202650ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #define GEN(LN,UN) \ CONSTATTR double \ MATH_MANGLE(LN)(double x, double y) \ { \ return BUILTIN_##UN##_F64(x, y); \ } // GEN(sub_rte,SUB_RTE) // GEN(sub_rtn,SUB_RTN) // GEN(sub_rtp,SUB_RTP) // GEN(sub_rtz,SUB_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/subF.cl000066400000000000000000000010571415221260100202730ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #define GEN(LN,UN) \ CONSTATTR float \ MATH_MANGLE(LN)(float x, float y) \ { \ return BUILTIN_##UN##_F32(x, y); \ } // GEN(sub_rte,SUB_RTE) // GEN(sub_rtn,SUB_RTN) // GEN(sub_rtp,SUB_RTP) // GEN(sub_rtz,SUB_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/subH.cl000066400000000000000000000010541415221260100202720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #define GEN(LN,UN) \ CONSTATTR half \ MATH_MANGLE(LN)(half x, half y) \ { \ return BUILTIN_##UN##_F16(x, y); \ } // GEN(sub_rte,SUB_RTE) // GEN(sub_rtn,SUB_RTN) // GEN(sub_rtp,SUB_RTP) // GEN(sub_rtz,SUB_RTZ) ROCm-Device-Libs-rocm-5.0.0/ocml/src/tables.cl000066400000000000000000000014371415221260100206500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ocml.h" #include "tables.h" #ifdef USE_TABLESTRUCT #define DECLARE_TABLE(TYPE,NAME,LENGTH) { #define END_TABLE() }, __attribute__((visibility("protected"))) __constant struct __tbl_mem_s __tbl_mem = { #else #define DECLARE_TABLE(TYPE,NAME,LENGTH) \ __attribute__((visibility("protected"))) __constant TYPE TABLE_MANGLE(NAME) [ LENGTH ] = { #define END_TABLE() }; #endif #include "besselF_table.h" #include "besselD_table.h" #ifdef USE_TABLESTRUCT }; #endif ROCm-Device-Libs-rocm-5.0.0/ocml/src/tables.h000066400000000000000000000023541415221260100205000ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ // Table stuff #undef USE_TABLESTRUCT #ifdef USE_TABLESTRUCT struct __tbl_mem_s { float M32_J0[72]; float M32_J1[72]; float M32_Y0[162] float M32_Y1[162] double M64_J0[120]; double M64_J1[120]; double M64_Y0[270]; double M64_Y1[270]; }; extern __constant struct __tbl_mem_s __tbl_mem; #define USE_TABLE(TYPE,PTR,NAME) \ __constant TYPE * PTR = __ocmltbl_mem . 
NAME #else #define TABLE_MANGLE(NAME) __ocmltbl_##NAME extern __constant float TABLE_MANGLE(M32_J0)[]; extern __constant float TABLE_MANGLE(M32_J1)[]; extern __constant float TABLE_MANGLE(M32_Y0)[]; extern __constant float TABLE_MANGLE(M32_Y1)[]; extern __constant double TABLE_MANGLE(M64_J0)[]; extern __constant double TABLE_MANGLE(M64_J1)[]; extern __constant double TABLE_MANGLE(M64_Y0)[]; extern __constant double TABLE_MANGLE(M64_Y1)[]; #define USE_TABLE(TYPE,PTR,NAME) \ __constant TYPE * PTR = TABLE_MANGLE(NAME) #endif ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanD.cl000066400000000000000000000013461415221260100202630ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigredD.h" CONSTATTR double MATH_MANGLE(tan)(double x) { double ax = BUILTIN_ABS_F64(x); struct redret r = MATH_PRIVATE(trigred)(ax); int2 t = AS_INT2(MATH_PRIVATE(tanred2)(r.hi, r.lo, r.i & 1)); t.hi ^= AS_INT2(x).hi & (int)0x80000000; if (!FINITE_ONLY_OPT()) { t = BUILTIN_ISFINITE_F64(ax) ? t : AS_INT2(QNANBITPATT_DP64); } return AS_DOUBLE(t); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanF.cl000066400000000000000000000014721415221260100202650ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" float MATH_MANGLE(tan)(float x) { float ax = BUILTIN_ABS_F32(x); struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax)); #if defined EXTRA_PRECISION float t = MATH_PRIVATE(tanred)(r.hi + r.lo, r.i & 1); #else float t = MATH_PRIVATE(tanred)(r.hi, r.i & 1); #endif t = AS_FLOAT(AS_INT(t) ^ (AS_INT(x) ^ AS_INT(ax))); if (!FINITE_ONLY_OPT()) { t = BUILTIN_ISFINITE_F32(ax) ? t : AS_FLOAT(QNANBITPATT_SP32); } return t; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanH.cl000066400000000000000000000013541415221260100202660ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigredH.h" UGEN(tan) REQUIRES_16BIT_INSTS half MATH_MANGLE(tan)(half x) { half ax = BUILTIN_ABS_F16(x); struct redret r = MATH_PRIVATE(trigred)(ax); short t = AS_SHORT(MATH_PRIVATE(tanred)(r.hi, r.i & (short)1)); t ^= AS_SHORT(x) & (short)0x8000; if (!FINITE_ONLY_OPT()) { t = BUILTIN_ISFINITE_F16(ax) ? t : (short)QNANBITPATT_HP16; } return AS_HALF(t); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanhD.cl000066400000000000000000000014161415221260100204310ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x); CONSTATTR double MATH_MANGLE(tanh)(double x) { double y = BUILTIN_ABS_F64(x); double2 e = MATH_PRIVATE(epexpep)(con(y, 0.0)); double2 ei = rcp(e); double2 t = fdiv(fsub(e, ei), fadd(e, ei)); double z = t.hi; z = y > 19.0625 ? 1.0 : z; z = y < 0x1.0p-27 ? y : z; return BUILTIN_COPYSIGN_F64(z, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanhF.cl000066400000000000000000000023441415221260100204340ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #if defined EXTRA_ACCURACY #define FLOAT_SPECIALIZATION #include "ep.h" extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x); #endif CONSTATTR float MATH_MANGLE(tanh)(float x) { float y = BUILTIN_ABS_F32(x); #if defined EXTRA_ACCURACY float2 e = MATH_PRIVATE(epexpep)(con(y, 0.0f)); float2 ei = rcp(e); float2 t = fdiv(fsub(e, ei), fadd(e, ei)); float z = t.hi; z = y > 9.0f ? 1.0f : z; z = y < 0x1.0p-13f ? 
y : z; #else float z; if (y < 0.625f) { float y2 = y*y; float p = MATH_MAD(y2, MATH_MAD(y2, MATH_MAD(y2, MATH_MAD(y2, -0x1.758e7ap-8f, 0x1.521192p-6f), -0x1.b8389cp-5f), 0x1.110704p-3f), -0x1.555532p-2f); z = MATH_MAD(y2, y*p, y); } else { float t = MATH_MANGLE(exp)(2.0f * y); z = 1.0f - MATH_FAST_DIV(2.0f, t + 1.0f); } #endif return BUILTIN_COPYSIGN_F32(z, x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanhH.cl000066400000000000000000000012461415221260100204360ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR UGEN(tanh) CONSTATTR half MATH_MANGLE(tanh)(half hx) { float x = (float)hx * 0x1.715476p+0f; float a = BUILTIN_EXP2_F32(x); float b = BUILTIN_EXP2_F32(-x); half one = BUILTIN_COPYSIGN_F16(1.0h, hx); half ret = (half)((a - b) * BUILTIN_RCP_F32(a + b)); return BUILTIN_ABS_F16(hx) > 4.5h ? one : ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanpiD.cl000066400000000000000000000015061415221260100206120ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigpiredD.h" CONSTATTR double MATH_MANGLE(tanpi)(double x) { struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x)); int2 t = AS_INT2(MATH_PRIVATE(tanpired)(r.hi, r.i & 1)); t.hi ^= (((r.i == 1) | (r.i == 2)) & (r.hi == 0.0)) ? 0x80000000 : 0; t.hi ^= AS_INT2(x).hi & (int)0x80000000; if (!FINITE_ONLY_OPT()) { t = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? 
AS_INT2(QNANBITPATT_DP64) : t; } return AS_DOUBLE(t); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanpiF.cl000066400000000000000000000014641415221260100206170ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigpiredF.h" CONSTATTR float MATH_MANGLE(tanpi)(float x) { struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F32(x)); int t = AS_INT(MATH_PRIVATE(tanpired)(r.hi, r.i & 1)); t ^= (((r.i == 1) | (r.i == 2)) & (r.hi == 0.0f)) ? (int)0x80000000 : 0; t ^= AS_INT(x) & (int)0x80000000; if (!FINITE_ONLY_OPT()) { t = BUILTIN_CLASS_F32(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? QNANBITPATT_SP32 : t; } return AS_FLOAT(t); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanpiH.cl000066400000000000000000000016021415221260100206130ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigpiredH.h" CONSTATTR UGEN(tanpi) REQUIRES_16BIT_INSTS CONSTATTR half MATH_MANGLE(tanpi)(half x) { struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x)); short t = AS_SHORT(MATH_PRIVATE(tanpired)(r.hi, r.i & (short)1)); t ^= (((r.i == (short)1) | (r.i == (short)2)) & (r.hi == 0.0h)) ? (short)0x8000 : (short)0; t ^= AS_SHORT(x) & (short)0x8000; if (!FINITE_ONLY_OPT()) { t = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? 
(short)QNANBITPATT_HP16 : t; } return AS_HALF(t); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanpiredD.cl000066400000000000000000000022561415221260100213100ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigpiredD.h" CONSTATTR double MATH_PRIVATE(tanpired)(double x, int i) { double s = x * x; double t = MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, 0x1.3fad0a71ea6d1p+32, -0x1.11a76ac97377bp+30), 0x1.ba2bcaca6da1bp+27), -0x1.79e8e2d7aaf57p+22), 0x1.c1c1102e46eccp+21), 0x1.31291bbcb5588p+19), 0x1.486b2d6bb3db2p+17), 0x1.45be1b46ff156p+15), 0x1.45f61b419c746p+13), 0x1.45f311045a4ffp+11), 0x1.45f4739a998c7p+9), 0x1.45fff9b243050p+7), 0x1.466bc6775cf74p+5), 0x1.4abbce625be8bp+3); t = x * s * t; t = MATH_MAD(x, 0x1.921fb54442d18p+1, t); double tr = -MATH_RCP(t); return i ? tr : t; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanpiredF.cl000066400000000000000000000014641415221260100213120ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigpiredF.h" CONSTATTR float MATH_PRIVATE(tanpired)(float x, int i) { float s = x * x; float t = MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, 0x1.7d2bd4p+16f, 0x1.a4d306p+12f), 0x1.435004p+11f), 0x1.4b6926p+9f), 0x1.451e22p+7f), 0x1.467a9cp+5f), 0x1.4abb6ap+3f); t = x * s * t; t = MATH_MAD(x, 0x1.921fb6p+1f, t); float tr = -MATH_RCP(t); return i ? tr : t; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanpiredH.cl000066400000000000000000000011711415221260100213070ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigpiredH.h" CONSTATTR half MATH_PRIVATE(tanpired)(half x, short i) { half s = x * x; half t = MATH_MAD(s, MATH_MAD(s, 0x1.3d8p+8h, 0x1.fe4p+4h), 0x1.508p+3h); t = x * s * t; t = MATH_MAD(x, 0x1.92p+1h, t); half tr = -MATH_RCP(t); return i ? tr : t; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanred2D.cl000066400000000000000000000061411415221260100210360ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #define DOUBLE_SPECIALIZATION #include "ep.h" #define NOCFLOW CONSTATTR double MATH_PRIVATE(tanred2)(double x, double xx, int sel) { #if defined NOCFLOW double s = sqr(con(x,xx)).hi; double p = s * MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, 0x1.5e089c751c08cp-16, -0x1.78809a9a29f71p-15), 0x1.7746f90a8aaep-14), -0x1.bb44da6fbf144p-16), 0x1.1e634a7943acfp-13), 0x1.d250fdeb68febp-13), 0x1.37fd9b58c4d95p-11), 0x1.7d5af15120e2cp-10), 0x1.d6d93e09491dfp-9), 0x1.226e12033784dp-7), 0x1.664f49ac36ae2p-6), 0x1.ba1ba1b451c21p-5), 0x1.11111111185b7p-3), 0x1.55555555554eep-2); double2 t = fadd(con(x,xx), mul(x, p)); double2 tr = frcp(t); return sel ? -tr.hi : t.hi; #else const double piby4_lead = 0x1.921fb54442d18p-1; const double piby4_tail = 0x1.1a62633145c06p-55; // In order to maintain relative precision transform using the identity: // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4. // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4. bool ca = x > 0.68; bool cb = x < -0.68; double transform = ca ? 1.0 : 0.0; transform = cb ? -1.0 : transform; double tx = MATH_MAD(-transform, x, piby4_lead) + MATH_MAD(-transform, xx, piby4_tail); bool c = ca | cb; x = c ? tx : x; xx = c ? 0.0 : xx; // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68]. 
double t1 = x; double r = MATH_MAD(x*xx, 2.0, x*x); double a = MATH_MAD(r, MATH_MAD(r, 0x1.d5daf289c385ap-13, -0x1.77c24c7569abbp-6), 0x1.7d50f6638564ap-2); double b = MATH_MAD(r, MATH_MAD(r, MATH_MAD(r, -0x1.e7517ef6d98f8p-13, 0x1.ab0f4f80a0acfp-6), -0x1.08046499eb90fp-1), 0x1.1dfcb8caa40b8p+0); double t2 = MATH_MAD(MATH_FAST_DIV(a, b), x*r, xx); double tp = t1 + t2; double ret; if (c) { if (sel) ret = transform * (MATH_FAST_DIV(2.0*tp, tp - 1.0) - 1.0); else ret = transform * (1.0 - MATH_FAST_DIV(2.0*tp, 1.0 + tp)); } else { if (sel) { // Compute -1.0/(t1 + t2) accurately double tq = t2 - (tp - t1); double tr = -MATH_FAST_RCP(tp); double e = MATH_MAD(tr, tq, MATH_MAD(tr, tp, 1.0)); ret = MATH_MAD(e, tr, tr); } else { ret = tp; } } return ret; #endif } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanredF.cl000066400000000000000000000023631415221260100207600ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" CONSTATTR float MATH_PRIVATE(tanred)(float x, int i) { float s = x * x; #if defined MORE_ACCURACY float p = s * MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, 0x1.33d5e6p-7f, 0x1.9697f8p-9f), 0x1.907be2p-6f), 0x1.b581ap-5f), 0x1.112e2p-3f), 0x1.5554dcp-2f); #else float a = MATH_MAD(s, -0x1.19dba6p-6f, 0x1.8a8b0ep-2f); float b = MATH_MAD(s, MATH_MAD(s, 0x1.2e2900p-6f, -0x1.07266ep-1f), 0x1.27e84ap+0f); float p = s * MATH_FAST_DIV(a,b); #endif #if defined LESS_ACCURACY float t = MATH_MAD(p, x, x); float tr = -MATH_FAST_RCP(t); #else float t = BUILTIN_FMA_F32(p, x, x); float tt = BUILTIN_FMA_F32(p, x, -(t - x)); float tr = -MATH_FAST_RCP(t); float e = BUILTIN_FMA_F32(tt, tr, BUILTIN_FMA_F32(t, tr, 1.0f)); tr = BUILTIN_FMA_F32(e, tr, tr); #endif return i ? tr : t; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tanredH.cl000066400000000000000000000011311415221260100207520ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigredH.h" CONSTATTR half MATH_PRIVATE(tanred)(half x, short i) { half s = x * x; half t = MATH_MAD(s, MATH_MAD(s, 0x1.794p-4h, 0x1.e3cp-4h), 0x1.57p-2h); t = MATH_MAD(x, s*t, x); half tr = -MATH_RCP(t); return i ? tr : t; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tgammaD.cl000066400000000000000000000142261415221260100207500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(tgamma)(double x) { const double pi = 3.14159265358979323846; double ax = BUILTIN_ABS_F64(x); double ret; if (ax > 0x1.0p-11) { // For x < 4, push to [1-3] range using gamma(x) = gamma(x+1) / x // For 4.5 < x < 6.5, push above 6.5 // [4,4.5) left alone double nterm = 1.0; double dterm = 1.0; double z = ax; if (ax < 4.5) { if (ax < 1.0) { dterm = z; z += 1.0; } else if (ax < 3.0) { ; // do nothing } else if (ax < 4.0) { z -= 1.0; nterm = z; } } else if (ax < 5.5) { dterm = MATH_MAD(z,z,z); z += 2.0; } else if (ax < 6.5) { dterm = z; z += 1.0; } double negadj = 1.0; if (x < 0.0) { negadj = -x * MATH_MANGLE(sinpi)(x); } double etonegz = MATH_MANGLE(exp)(-z); if (z < 4.5) { const double rn0 = 297.312130630940277; const double rn1 = 16926.1409177878806; const double rn2 = 131675.407800922036; const double rn3 = 344586.743316038732; const double rn4 = 440619.954224349898; const double rn5 = 275507.567385621460; const double rn6 = 84657.9644812230335; const double rd0 = 1.00000000000000000; const double rd1 = -13.3400904528209096; const double rd2 = 3270.94389286527964; const double rd3 = 41972.5365974090031; const double rd4 = 123293.896672792281; const double rd5 = 166739.899991898533; const double rd6 = 107097.146935059144; const double rd7 = 33773.6414083704053; double num = MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, rn6, rn5), rn4), rn3), rn2), rn1), rn0) * nterm; double den = MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, MATH_MAD(z, rd7, rd6), rd5), rd4), rd3), rd2), rd1), rd0) * dterm; double zpow = MATH_MANGLE(powr)(z, z+0.5); if (x >= 0.0) { ret = etonegz * zpow * MATH_DIV(num,den); } else { ret = MATH_DIV(den*pi, negadj*etonegz*zpow*num); ret = BUILTIN_FRACTION_F64(x) == 0.0 ? 
AS_DOUBLE(QNANBITPATT_DP64) : ret; } } else { const double c0 = 2.5066282746310007; const double c1 = 0.20888568955258338; const double c2 = 0.008703570398024307; const double c3 = -0.0067210904740298821; const double c4 = -0.00057520123811017124; const double c5 = 0.0019652948815832029; const double c6 = 0.00017478252120455912; const double c7 = -0.0014843411351582762; const double c8 = -0.00012963757321125544; const double c9 = 0.0021043112297532062; const double c10 = 0.00018059994565555043; const double c11 = -0.0047987856705463457; const double c12 = -0.0004073678593815252; const double c13 = 0.01605085033194459500; const double c14 = 0.0013539922801590941; const double c15 = -0.074015421268427375; const double c16 = -0.0062208086788087787; const double c17 = 0.45004033385625097; double rz = MATH_RCP(z); double poly = MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, c17, c16), c15), c14), c13), c12), c11), c10), c9), c8), c7), c6), c5), c4), c3), c2), c1), c0); double zpow = MATH_MANGLE(powr)(z, MATH_MAD(0.5, z, -0.25)); if (x >= 0.0) { ret = MATH_DIV(etonegz*zpow*zpow*poly, dterm); ret = x > 0x1.573fae561f647p+7 ? AS_DOUBLE(PINFBITPATT_DP64) : ret; } else if (x < 0.0) { if (x >= -170.5) { ret = MATH_DIV(pi*dterm, etonegz*zpow*zpow*poly*negadj); } else if (x >= -184.0) { ret = MATH_DIV(MATH_DIV(pi*dterm, etonegz*zpow*poly), zpow*negadj); } else { ret = BUILTIN_COPYSIGN_F64(0.0, negadj); } ret = BUILTIN_FRACTION_F64(x) == 0.0 ? 
AS_DOUBLE(QNANBITPATT_DP64) : ret; } else { ret = x; } } } else { const double c0 = -0x1.2788cfc6fb619p-1; const double c1 = 0x1.fa658c23b1578p-1; const double c2 = -0x1.d0a118f324b63p-1; const double c3 = 0x1.f6a51055096b5p-1; ret = MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, c3, c2), c1), c0) + MATH_RCP(x); } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tgammaF.cl000066400000000000000000000040311415221260100207430ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(tgamma)(float x) { const float pi = 0x1.921fb6p+1f; const float sqrt2pi = 0x1.40d932p+1f; const float sqrtpiby2 = 0x1.40d932p+0f; float ax = BUILTIN_ABS_F32(x); float ret; if (ax > 0x1.0p-6f) { // For x < 3, push to larger value using gamma(x) = gamma(x+1) / x float d = 1.0f; if (x < 1.0f) { d = MATH_MAD((ax + 3.0f), ax, 2.0f) * ax; ax = ax + 3.0f; } else if (ax < 2.0f) { d = MATH_MAD(ax, ax, ax); ax = ax + 2.0f; } else if (ax < 3.0f) { d = ax; ax = ax + 1.0f; } // x^x e^-x (1 + poly(1/x)) sqrt(twopi / x) / d // Split x^x into a product since it overflows faster than gamma(x) float t1 = MATH_MANGLE(powr)(ax, MATH_MAD(ax, 0.5f, -0.25f)); float t2 = MATH_MANGLE(exp)(-ax); float xr = MATH_FAST_RCP(ax); float pt = xr*MATH_MAD(xr, MATH_MAD(xr, -139.0f/51840.0f, 1.0f/288.0f) , 1.0f/12.0f); if (x > 0.0f) { float p = sqrt2pi*t2*t1*t1 * MATH_FAST_RCP(d); ret = MATH_MAD(p, pt, p); ret = x > 0x1.18521ep+5f ? AS_FLOAT(PINFBITPATT_SP32) : ret; } else { float s = MATH_MANGLE(sinpi)(x); float p = s*x*t2*t1*t1; ret = MATH_DIV(-sqrtpiby2*d, MATH_MAD(p, pt, p)); ret = x < -42.0f ? 0.0f : ret; ret = BUILTIN_FRACTION_F32(x) == 0.0f ? 
AS_FLOAT(QNANBITPATT_SP32) : ret; } } else { ret = MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, 0x1.f6a510p-1f, -0x1.d0a118p-1f), 0x1.fa658cp-1f), -0x1.2788d0p-1f) + 4.0f*MATH_FAST_RCP(4.0f*x); } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/tgammaH.cl000066400000000000000000000007051415221260100207510ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" UGEN(tgamma) CONSTATTR half MATH_MANGLE(tgamma)(half x) { return (half)MATH_UPMANGLE(tgamma)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigpiredD.cl000066400000000000000000000012011415221260100214600ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigpiredD.h" CONSTATTR struct redret MATH_PRIVATE(trigpired)(double x) { double t = 2.0 * BUILTIN_FRACTION_F64(0.5 * x); x = x > 1.0 ? t : x; t = BUILTIN_RINT_F64(2.0 * x); struct redret ret; ret.hi = MATH_MAD(t, -0.5, x); ret.i = (int)t & 0x3; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigpiredD.h000066400000000000000000000011451415221260100213200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ struct redret { double hi; int i; }; struct scret { double c; double s; }; extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(double x); extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(double x); extern CONSTATTR double MATH_PRIVATE(tanpired)(double x, int i); ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigpiredF.cl000066400000000000000000000012041415221260100214650ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigpiredF.h" CONSTATTR struct redret MATH_PRIVATE(trigpired)(float x) { float t = 2.0f * BUILTIN_FRACTION_F32(0.5f * x); x = x > 1.0f ? t : x; t = BUILTIN_RINT_F32(2.0f * x); struct redret ret; ret.hi = MATH_MAD(t, -0.5f, x); ret.i = (int)t & 0x3; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigpiredF.h000066400000000000000000000011361415221260100213220ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ struct redret { float hi; int i; }; struct scret { float s; float c; }; extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(float x); extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(float x); extern CONSTATTR float MATH_PRIVATE(tanpired)(float x, int i); ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigpiredH.cl000066400000000000000000000012401415221260100214670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigpiredH.h" REQUIRES_16BIT_INSTS CONSTATTR struct redret MATH_PRIVATE(trigpired)(half x) { half t = 2.0h * BUILTIN_FRACTION_F16(0.5h * x); x = x > 1.0h ? t : x; t = BUILTIN_RINT_F16(2.0h * x); struct redret ret; ret.hi = MATH_MAD(t, -0.5h, x); ret.i = (short)t & (short)0x3; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigpiredH.h000066400000000000000000000011331415221260100213210ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ struct redret { half hi; short i; }; struct scret { half s; half c; }; extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(half x); extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(half x); extern CONSTATTR half MATH_PRIVATE(tanpired)(half x, short i); ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigredD.cl000066400000000000000000000010441415221260100211340ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigredD.h" CONSTATTR struct redret MATH_PRIVATE(trigred)(double x) { if (x < 0x1.0p+30) return MATH_PRIVATE(trigredsmall)(x); else return MATH_PRIVATE(trigredlarge)(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigredD.h000066400000000000000000000015241415221260100207700ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ struct redret { double lo; double hi; int i; }; struct scret { double s; double c; }; extern CONSTATTR struct redret MATH_PRIVATE(trigredsmall)(double x); extern CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(double x); extern CONSTATTR struct redret MATH_PRIVATE(trigred)(double x); extern CONSTATTR struct scret MATH_PRIVATE(sincosred)(double x); extern CONSTATTR struct scret MATH_PRIVATE(sincosred2)(double x, double y); extern CONSTATTR double MATH_PRIVATE(tanred2)(double x, double xx, int sel); ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigredF.cl000066400000000000000000000010451415221260100211370ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" CONSTATTR struct redret MATH_PRIVATE(trigred)(float x) { if (x < SMALL_BOUND) return MATH_PRIVATE(trigredsmall)(x); else return MATH_PRIVATE(trigredlarge)(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigredF.h000066400000000000000000000017361415221260100207770ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define SMALL_BOUND 0x1.0p+17f #if defined EXTRA_PRECISION struct redret { float hi; float lo; int i; }; #else struct redret { float hi; int i; }; #endif struct scret { float s; float c; }; extern CONSTATTR struct redret MATH_PRIVATE(trigredsmall)(float x); extern CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(float x); extern CONSTATTR struct redret MATH_PRIVATE(trigred)(float x); #if defined EXTRA_PRECISION extern CONSTATTR struct scret MATH_PRIVATE(sincosred2)(float x, float y); #else extern CONSTATTR struct scret MATH_PRIVATE(sincosred)(float x); #endif extern CONSTATTR float MATH_PRIVATE(tanred)(float x, int regn); ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigredH.cl000066400000000000000000000014741415221260100211470ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" #include "trigredH.h" CONSTATTR struct redret MATH_PRIVATE(trigred)(half hx) { const float twobypi = 0x1.45f306p-1f; const float pb2_a = 0x1.92p+0f; const float pb2_b = 0x1.fap-12f; const float pb2_c = 0x1.54442ep-20f; float x = (float)hx; float fn = BUILTIN_RINT_F32(x * twobypi); struct redret ret; ret.hi = (half)BUILTIN_MAD_F32(fn, -pb2_c, BUILTIN_MAD_F32(fn, -pb2_b, BUILTIN_MAD_F32(fn, -pb2_a, x))); ret.i = (int)fn & 0x3; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigredH.h000066400000000000000000000011271415221260100207730ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ struct redret { half hi; short i; }; struct scret { half s; half c; }; extern CONSTATTR struct redret MATH_PRIVATE(trigred)(half x); extern CONSTATTR struct scret MATH_PRIVATE(sincosred)(half x); extern CONSTATTR half MATH_PRIVATE(tanred)(half x, short i); ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigredlargeD.cl000066400000000000000000000054401415221260100221530ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigredD.h" // Allow H,L to be the same as A,B #define FSUM2(A, B, H, L) \ do { \ double __s = A + B; \ double __t = B - (__s - A); \ H = __s; \ L = __t; \ } while (0) #define SUM2(A, B, H, L) \ do { \ double __s = A + B; \ double __aa = __s - B; \ double __bb = __s - __aa; \ double __da = A - __aa; \ double __db = B - __bb; \ double __t = __da + __db; \ H = __s; \ L = __t; \ } while (0) #define PROD2(A, B, H, L) \ do { \ double __p = A * B; \ double __q = BUILTIN_FMA_F64(A, B, -__p); \ H = __p; \ L = __q; \ } while (0) #define EVALUATE(A, B2, B1, B0, F2, F1, F0) \ do { \ double __p2h, __p2l, __p1h, __p1l, __p0h, __p0l; \ double __v1h, __v1l, __v2h, __v2l, __w2h, __w2l; \ double __e0, __e1, __e2, __e3; \ PROD2(B0, A, __p0h, __p0l); \ PROD2(B1, A, __p1h, __p1l); \ PROD2(B2, A, __p2h, __p2l); \ SUM2(__p2l, __p1h, __v2h, __v2l); \ SUM2(__p1l, __p0h, __v1h, __v1l); \ SUM2(__v2l, __v1h, __w2h, __w2l); \ __e3 = __p2h; \ __e2 = __v2h; \ __e1 = __w2h; \ __e0 = __w2l + __v1l + __p0l; \ FSUM2(__e3, __e2, __e3, __e2); \ FSUM2(__e2, __e1, __e2, __e1); \ FSUM2(__e1, __e0, __e1, __e0); \ F2 = __e3; \ F1 = __e2; \ F0 = __e1; \ } while(0) CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(double x) { 
// Scale x by relevant part of 2/pi double p2 = BUILTIN_TRIG_PREOP_F64(x, 0); double p1 = BUILTIN_TRIG_PREOP_F64(x, 1); double p0 = BUILTIN_TRIG_PREOP_F64(x, 2); x = x >= 0x1.0p+945 ? BUILTIN_FLDEXP_F64(x, -128) : x; double f2, f1, f0; EVALUATE(x, p2, p1, p0, f2, f1, f0); f2 = BUILTIN_FLDEXP_F64(BUILTIN_FRACTION_F64(BUILTIN_FLDEXP_F64(f2, -2)), 2); f2 += f2+f1 < 0.0 ? 4.0 : 0.0; int i = (int)(f2 + f1); f2 -= (double)i; FSUM2(f2, f1, f2, f1); FSUM2(f1, f0, f1, f0); int g = f2 >= 0.5; i += g; f2 -= g ? 1.0 : 0.0; FSUM2(f2, f1, f2, f1); const double pio2h = 0x1.921fb54442d18p+0; const double pio2t = 0x1.1a62633145c07p-54; double rh = f2 * pio2h; double rt = BUILTIN_FMA_F64(f1, pio2h, BUILTIN_FMA_F64(f2, pio2t, BUILTIN_FMA_F64(f2, pio2h, -rh))); FSUM2(rh, rt, rh, rt); struct redret ret; ret.hi = rh; ret.lo = rt; ret.i = i & 0x3; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigredlargeF.cl000066400000000000000000000106331415221260100221550ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(float x) { int xe = (int)(AS_UINT(x) >> 23) - 127; uint xm = 0x00800000U | (AS_UINT(x) & 0x7fffffU); // 224 bits of 2/PI: . 
A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB const uint b6 = 0xA2F9836EU; const uint b5 = 0x4E441529U; const uint b4 = 0xFC2757D1U; const uint b3 = 0xF534DDC0U; const uint b2 = 0xDB629599U; const uint b1 = 0x3C439041U; const uint b0 = 0xFE5163ABU; uint p0, p1, p2, p3, p4, p5, p6, p7; ulong a; a = (ulong)xm * (ulong)b0; p0 = a; a >>= 32; a = (ulong)xm * (ulong)b1 + a; p1 = a; a >>= 32; a = (ulong)xm * (ulong)b2 + a; p2 = a; a >>= 32; a = (ulong)xm * (ulong)b3 + a; p3 = a; a >>= 32; a = (ulong)xm * (ulong)b4 + a; p4 = a; a >>= 32; a = (ulong)xm * (ulong)b5 + a; p5 = a; a >>= 32; a = (ulong)xm * (ulong)b6 + a; p6 = a; p7 = a >> 32; uint fbits = 224 + 23 - xe; // shift amount to get 2 lsb of integer part at top 2 bits // min: 25 (xe=18) max: 134 (xe=127) uint shift = 256U - 2 - fbits; // Shift by up to 134/32 = 4 words int c = shift > 63; p7 = c ? p5 : p7; p6 = c ? p4 : p6; p5 = c ? p3 : p5; p4 = c ? p2 : p4; p3 = c ? p1 : p3; p2 = c ? p0 : p2; shift -= (-c) & 64; c = shift > 31; p7 = c ? p6 : p7; p6 = c ? p5 : p6; p5 = c ? p4 : p5; p4 = c ? p3 : p4; p3 = c ? p2 : p3; shift -= (-c) & 32; c = shift > 31; p7 = c ? p6 : p7; p6 = c ? p5 : p6; p5 = c ? p4 : p5; p4 = c ? p3 : p4; shift -= (-c) & 32; // BUILTIN_BITALIGN_B32 cannot handle a shift of 32 c = shift > 0; shift = 32 - shift; uint t7 = BUILTIN_BITALIGN_B32(p7, p6, shift); uint t6 = BUILTIN_BITALIGN_B32(p6, p5, shift); uint t5 = BUILTIN_BITALIGN_B32(p5, p4, shift); p7 = c ? t7 : p7; p6 = c ? t6 : p6; p5 = c ? t5 : p5; // Get 2 lsb of int part and msb of fraction int i = p7 >> 29; // Scoot up 2 more bits so only fraction remains p7 = BUILTIN_BITALIGN_B32(p7, p6, 30); p6 = BUILTIN_BITALIGN_B32(p6, p5, 30); p5 = BUILTIN_BITALIGN_B32(p5, p4, 30); // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5 uint flip = i & 1 ? 0xffffffffU : 0U; uint sign = i & 1 ? 
0x80000000U : 0U; p7 = p7 ^ flip; p6 = p6 ^ flip; p5 = p5 ^ flip; // Find exponent and shift away leading zeroes and hidden bit xe = MATH_CLZI(p7) + 1; shift = 32 - xe; p7 = BUILTIN_BITALIGN_B32(p7, p6, shift); p6 = BUILTIN_BITALIGN_B32(p6, p5, shift); // Most significant part of fraction float q1 = AS_FLOAT(sign | ((127 - xe) << 23) | (p7 >> 9)); // Shift out bits we captured on q1 p7 = BUILTIN_BITALIGN_B32(p7, p6, 32-23); // Get 24 more bits of fraction in another float, there are not long strings of zeroes here int xxe = MATH_CLZI(p7) + 1; p7 = BUILTIN_BITALIGN_B32(p7, p6, 32-xxe); float q0 = AS_FLOAT(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9)); // At this point, the fraction q1 + q0 is correct to at least 48 bits // Now we need to multiply the fraction by pi/2 // This loses us about 4 bits // pi/2 = C90 FDA A22 168 C23 4C4 const float pio2h = (float)0xc90fda / 0x1.0p+23f; const float pio2hh = (float)0xc90 / 0x1.0p+11f; const float pio2ht = (float)0xfda / 0x1.0p+23f; const float pio2t = (float)0xa22168 / 0x1.0p+47f; float rh, rt; if (HAVE_FAST_FMA32()) { rh = q1 * pio2h; rt = BUILTIN_FMA_F32(q0, pio2h, BUILTIN_FMA_F32(q1, pio2t, BUILTIN_FMA_F32(q1, pio2h, -rh))); } else { float q1h = AS_FLOAT(AS_UINT(q1) & 0xfffff000); float q1t = q1 - q1h; rh = q1 * pio2h; rt = MATH_MAD(q1t, pio2ht, MATH_MAD(q1t, pio2hh, MATH_MAD(q1h, pio2ht, MATH_MAD(q1h, pio2hh, -rh)))) + MATH_MAD(q0, pio2h, q1*pio2t); } struct redret ret; #if defined EXTRA_PRECISION float t = rh + rt; rt = rt - (t - rh); ret.hi = t; ret.lo = rt; #else ret.hi = rh + rt; #endif ret.i = ((i >> 1) + (i & 1)) & 0x3; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigredsmallD.cl000066400000000000000000000021671415221260100221740ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" #include "trigredD.h" CONSTATTR struct redret MATH_PRIVATE(trigredsmall)(double x) { const double twobypi = 0x1.45f306dc9c883p-1; const double piby2_h = 0x1.921fb54442d18p+0; const double piby2_m = 0x1.1a62633145c00p-54; const double piby2_t = 0x1.b839a252049c0p-104; double dn = BUILTIN_RINT_F64(x * twobypi); double xt = BUILTIN_FMA_F64(dn, -piby2_h, x); double yh = BUILTIN_FMA_F64(dn, -piby2_m, xt); double ph = dn * piby2_m; double pt = BUILTIN_FMA_F64(dn, piby2_m, -ph); double th = xt - ph; double tt = (xt - th) - ph; double yt = BUILTIN_FMA_F64(dn, -piby2_t, ((th - yh) + tt) - pt); double rh = yh + yt; double rt = yt - (rh - yh); struct redret ret; ret.hi = rh; ret.lo = rt; ret.i = (int)dn & 0x3; return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/trigredsmallF.cl000066400000000000000000000053061415221260100221740ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" #include "trigredF.h" #define FMUL(A, AHI, ALO, B, BHI, BLO, CHI, CLO) \ do { \ CHI = A * B; \ CLO = MATH_MAD(ALO, BLO, MATH_MAD(ALO, BHI, MATH_MAD(AHI, BLO, MATH_MAD(AHI, BHI, -CHI)))); \ } while(0) #define FNMA(A, AHI, ALO, B, BHI, BLO, C, D) \ do { \ float __ph, __pt; \ FMUL(A, AHI, ALO, B, BHI, BLO, __ph, __pt); \ float __t = C - __ph; \ D = __t + (((C - __t) - __ph) - __pt); \ } while(0) static inline struct redret mad_reduce(float x) { #if defined EXTRA_PRECISION #error Not implemented #else const float twobypi = 0x1.45f306p-1f; const float piby2_h = 0x1.921fb4p+0f; const float piby2_hh = 0x1.92p+0f; const float piby2_hl = 0x1.fb4p-12f; const float piby2_m = 0x1.4442d0p-24f; const float piby2_mh = 0x1.444p-24f; const float piby2_ml = 0x1.680p-39f; const float piby2_l = 0x1.846988p-48f; const float piby2_lh = 0x1.846p-48f; const float piby2_ll = 0x1.310p-61f; float fn = BUILTIN_RINT_F32(x * twobypi); float fnh = AS_FLOAT(AS_UINT(fn) & 0xfffff000U); float fnl = fn - fnh; float r; FNMA(fn, fnh, fnl, piby2_h, piby2_hh, piby2_hl, x, r); FNMA(fn, fnh, fnl, piby2_m, piby2_mh, piby2_ml, r, r); struct redret ret; ret.hi = MATH_MAD(-piby2_l, fn, r); ret.i = (int)fn & 0x3; return ret; #endif } static inline struct redret fma_reduce(float x) { const float twobypi = 0x1.45f306p-1f; const float piby2_h = 0x1.921fb4p+0f; const float piby2_m = 0x1.4442d0p-24f; const float piby2_l = 0x1.846988p-48f; float fn = BUILTIN_RINT_F32(x * twobypi); struct redret ret; #if defined EXTRA_PRECISION float xt = BUILTIN_FMA_F32(fn, -piby2_h, x); float yh = BUILTIN_FMA_F32(fn, -piby2_m, xt); float ph = fn * piby2_m; float pt = BUILTIN_FMA_F32(fn, piby2_m, -ph); float th = xt - ph; float tt = (xt - th) - ph; float yt = BUILTIN_FMA_F32(fn, -piby2_l, ((th - yh) + tt) - pt); float rh = yh + yt; float rt = yt - (rh - yh); ret.hi = rh; ret.lo = rt; #else float r = BUILTIN_FMA_F32(fn, -piby2_l, 
BUILTIN_FMA_F32(fn, -piby2_m, BUILTIN_FMA_F32(fn, -piby2_h, x))); ret.hi = r; #endif ret.i =(int)fn & 0x3; return ret; } CONSTATTR struct redret MATH_PRIVATE(trigredsmall)(float x) { if (HAVE_FAST_FMA32()) { return fma_reduce(x); } else { return mad_reduce(x); } } ROCm-Device-Libs-rocm-5.0.0/ocml/src/truncD.cl000066400000000000000000000006501415221260100206310ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" CONSTATTR double MATH_MANGLE(trunc)(double x) { return BUILTIN_TRUNC_F64(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/truncF.cl000066400000000000000000000006461415221260100206400ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathF.h" CONSTATTR float MATH_MANGLE(trunc)(float x) { return BUILTIN_TRUNC_F32(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/truncH.cl000066400000000000000000000007711415221260100206410ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathH.h" CONSTATTR half2 MATH_MANGLE2(trunc)(half2 x) { return BUILTIN_TRUNC_2F16(x); } CONSTATTR half MATH_MANGLE(trunc)(half x) { return BUILTIN_TRUNC_F16(x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/y0D.cl000066400000000000000000000115021415221260100200240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathD.h" extern double MATH_PRIVATE(sinb)(double, int, double); extern CONSTATTR double MATH_PRIVATE(bp0)(double); extern CONSTATTR double MATH_PRIVATE(ba0)(double); CONSTATTR double MATH_MANGLE(y0)(double x) { const double b0 = 0.3125; const double b1 = 0.4375; const double b2 = 0.5625; const double b3 = 0.6875; const double b4 = 0.8125; const double b5 = 1.0; const double b6 = 1.25; const double b7 = 1.625; const double b8 = 2.0; const double b9 = 2.53125; const double b10 = 3.0; const double b11 = 3.484375; const double b12 = 4.703125; const double b13 = 6.265625; const double b14 = 7.84375; const double b15 = 9.421875; const double b16 = 10.984375; const double b17 = 12.546875; double ret; if (x <= b17) { // Ty to maintain relative accuracy here USE_TABLE(double, p, M64_Y0); double ch, cl; if (x < b8) { if (x < b4) { if (x < b0) { ch = 0.0; cl = 0.0; } else if (x < b1) { ch = 0x1.4p-2; cl = 0.0; p += 1*15; } else if (x < b2) { ch = 0x1.cp-2; cl = 0.0; p += 2*15; } else if (x < b3) { ch = 0x1.2p-1; cl = 0.0; p += 3*15; } else { ch = 0x1.6p-1; cl = 0.0; p += 4*15; } } else { if (x < b5) { ch = 0x1.c982eb8d417eap-1; cl = 0x1.ea9d270347f83p-56; p += 5*15; } else if (x < b6) { ch = 0x1.p+0; cl = 0.0; p += 6*15; } else if (x < b7) { ch = 0x1.4p+0; cl = 0.0; p += 7*15; } else { ch = 0x1.ap+0; cl = 
0.0; p += 8*15; } } } else { if (x < b13) { if (x < b9) { ch = 0x1.193bed4dff243p+1; cl = -0x1.bd1e50d219bfdp-55; p += 9*15; } else if (x < b10) { ch = 0x1.44p+1; cl = 0.0; p += 10*15; } else if (x < b11) { ch = 0x1.8p+1; cl = 0.0; p += 11*15; } else if (x < b12) { ch = 0x1.fa9534d98569cp+1; cl = -0x1.f06ae7804384ep-54; p += 12*15; } else { ch = 0x1.5b7fe4e87b02ep+2; cl = 0x1.dfe7bac228e8cp-52; p += 13*15; } } else { if (x < b14) { ch = 0x1.c581dc4e72103p+2; cl = -0x1.9774a495f56cfp-54; p += 14*15; } else if (x < b15) { ch = 0x1.13127ae6169b4p+3; cl = 0x1.479cc068d9046p-52; p += 15*15; } else if (x < b16) { ch = 0x1.471d735a47d58p+3; cl = -0x1.cb49ff791c495p-51; p += 16*15; } else { ch = 0x1.77f9138d43206p+3; cl = 0x1.0fc786ce0608p-55; p += 17*15; } } } ret = 0.0; if (x < b0) { ret = 0x1.45f306dc9c883p-1 * MATH_MANGLE(j0)(x) * MATH_MANGLE(log)(x); x = x*x; } x = x - ch - cl; ret += MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, p[14], p[13]), p[12]), p[11]), p[10]), p[9]), p[8]), p[7]), p[6]), p[5]), p[4]), p[3]), p[2]), p[1]), p[0]); } else { double r = MATH_RCP(x); double r2 = r*r; double p = MATH_PRIVATE(bp0)(r2) * r; ret = 0x1.9884533d43651p-1 * MATH_FAST_SQRT(r) * MATH_PRIVATE(ba0)(r2) * MATH_PRIVATE(sinb)(x, 0, p); ret = BUILTIN_CLASS_F64(x, CLASS_PINF) ? 0.0 : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/y0F.cl000066400000000000000000000110351415221260100200270ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" extern float MATH_PRIVATE(sinb)(float, int, float); extern CONSTATTR float MATH_PRIVATE(bp0)(float); extern CONSTATTR float MATH_PRIVATE(ba0)(float); CONSTATTR float MATH_MANGLE(y0)(float x) { const float b0 = 0.3125f; const float b1 = 0.4375f; const float b2 = 0.5625f; const float b3 = 0.6875f; const float b4 = 0.8125f; const float b5 = 1.0f; const float b6 = 1.25f; const float b7 = 1.625f; const float b8 = 2.0f; const float b9 = 2.53125f; const float b10 = 3.0f; const float b11 = 3.484375f; const float b12 = 4.703125f; const float b13 = 6.265625f; const float b14 = 7.84375f; const float b15 = 9.421875f; const float b16 = 10.984375f; const float b17 = 12.546875f; float ret; if (x <= b17) { // Ty to maintain relative accuracy here USE_TABLE(float, p, M32_Y0); float ch, cl; if (x < b8) { if (x < b4) { if (x < b0) { ch = 0.0f; cl = 0.0f; } else if (x < b1) { ch = 0x1.4p-2f; cl = 0.0f; p += 1*9; } else if (x < b2) { ch = 0x1.cp-2f; cl = 0.0f; p += 2*9; } else if (x < b3) { ch = 0x1.2p-1f; cl = 0.0f; p += 3*9; } else { ch = 0x1.6p-1f; cl = 0.0f; p += 4*9; } } else { if (x < b5) { ch = 0x1.c982ecp-1f; cl = -0x1.cafa06p-27f; p += 5*9; } else if (x < b6) { ch = 0x1.p+0f; cl = 0.0f; p += 6*9; } else if (x < b7) { ch = 0x1.4p+0f; cl = 0.0f; p += 7*9; } else { ch = 0x1.ap+0f; cl = 0.0f; p += 8*9; } } } else { if (x < b13) { if (x < b9) { ch = 0x1.193beep+1f; cl = -0x1.6401b8p-24f; p += 9*9; } else if (x < b10) { ch = 0x1.44p+1f; cl = 0.0f; p += 10*9; } else if (x < b11) { ch = 0x1.8p+1f; cl = 0.0f; p += 11*9; } else if (x < b12) { ch = 0x1.fa9534p+1f; cl = 0x1.b30ad4p-24f; p += 12*9; } else { ch = 0x1.5b7fe4p+2f; cl = 0x1.d0f606p-23f; p += 13*9; } } else { if (x < b14) { ch = 0x1.c581dcp+2f; cl = 0x1.39c84p-24f; p += 14*9; } else if (x < b15) { ch = 0x1.13127ap+3f; cl = 0x1.cc2d36p-22f; p += 15*9; } else if (x < b16) { ch = 0x1.471d74p+3f; cl = -0x1.4b7056p-22f; p += 16*9; } 
else { ch = 0x1.77f914p+3f; cl = -0x1.caf37ep-23f; p += 17*9; } } } ret = 0.0f; if (x < b0) { ret = 0x1.45f306p-1f * MATH_MANGLE(j0)(x) * MATH_MANGLE(log)(x); x = x*x; } x = x - ch - cl; ret += MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, p[8], p[7]), p[6]), p[5]), p[4]), p[3]), p[2]), p[1]), p[0]); } else { float r = MATH_RCP(x); float r2 = r*r; float p = MATH_PRIVATE(bp0)(r2) * r; ret = 0x1.988454p-1f * BUILTIN_RSQRT_F32(x) * MATH_PRIVATE(ba0)(r2) * MATH_PRIVATE(sinb)(x, 0, p); ret = BUILTIN_CLASS_F32(x, CLASS_PINF) ? 0.0f : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/y0H.cl000066400000000000000000000006711415221260100200350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" UGEN(y0) CONSTATTR half MATH_MANGLE(y0)(half x) { return (half)MATH_UPMANGLE(y0)((float)x); } ROCm-Device-Libs-rocm-5.0.0/ocml/src/y1D.cl000066400000000000000000000121261415221260100200300ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathD.h" extern double MATH_PRIVATE(sinb)(double, int, double); extern CONSTATTR double MATH_PRIVATE(bp1)(double); extern CONSTATTR double MATH_PRIVATE(ba1)(double); CONSTATTR double MATH_MANGLE(y1)(double x) { const double b0 = 0.5; const double b1 = 0.625; const double b2 = 0.75; const double b3 = 0.9375; const double b4 = 1.21875; const double b5 = 1.53125; const double b6 = 1.84375; const double b7 = 2.078125; const double b8 = 2.3125; const double b9 = 2.734375; const double b10 = 3.15625; const double b11 = 4.203125; const double b12 = 4.6875; const double b13 = 6.1875; const double b14 = 7.76953125; const double b15 = 9.359375; const double b16 = 10.9375; const double b17 = 12.5625; double ret; if (x <= b17) { // Ty to maintain relative accuracy here USE_TABLE(double, p, M64_Y1); double ch, cl; if (x < b8) { if (x < b4) { if (x < b0) { ch = 0.0; cl = 0.0; p += 0*15; } else if (x < b1) { ch = 0x1.0p-1; cl = 0.0; p += 1*15; } else if (x < b2) { ch = 0x1.4p-1; cl = 0.0; p += 2*15; } else if (x < b3) { ch = 0x1.8p-1; cl = 0.0; p += 3*15; } else { ch = 0x1.ep-1; cl = 0.0; p += 4*15; } } else { if (x < b5) { ch = 0x1.38p+0; cl = 0.0; p += 5*15; } else if (x < b6) { ch = 0x1.88p+0; cl = 0.0; p += 6*15; } else if (x < b7) { ch = 0x1.d8p+0; cl = 0.0; p += 7*15; } else { ch = 0x1.193bed4dff243p+1; cl = -0x1.bd1e50d219bfdp-55; p += 8*15; } } } else { if (x < b13) { if (x < b9) { ch = 0x1.28p+1; cl = 0.0; p += 9*15; } else if (x < b10) { ch = 0x1.5ep+1; cl = 0.0; p += 10*15; } else if (x < b11) { ch = 0x1.d76d4affba175p+1; cl = 0x1.3bac0714e4129p-58; p += 11*15; } else if (x < b12) { ch = 0x1.0dp+2; cl = 0.0; p += 12*15; } else { ch = 0x1.5b7fe4e87b02ep+2; cl = 0x1.dfe7bac228e8cp-52; p += 13*15; } } else { if (x < b14) { ch = 0x1.bc41890588553p+2; cl = 0x1.7960b6b1c46acp-53; p += 14*15; } else if (x < b15) { ch = 0x1.13127ae6169b4p+3; cl = 0x1.479cc068d9046p-52; p += 15*15; } else 
if (x < b16) { ch = 0x1.43f2ee51e8c7ep+3; cl = 0x1.8f4ba5d68e44p-51; p += 16*15; } else { ch = 0x1.77f9138d43206p+3; cl = 0x1.0fc786ce0608p-55; p += 17*15; } } } double x2 = x*x; double xs = x - ch - cl; double t = x < b0 ? x2 : xs; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, p[14], p[13]), p[12]), p[11]), p[10]), p[9]), p[8]), p[7]), p[6]), p[5]), p[4]), p[3]), p[2]), p[1]), p[0]); if (x < b0) { const double twobypi = 0x1.45f306dc9c883p-1; if (x < 0x1.0p-33) ret = MATH_DIV(-twobypi, BUILTIN_ABS_F64(x)); else ret = MATH_MAD(ret, x, twobypi*(MATH_MANGLE(j1)(x) * MATH_MANGLE(log)(x) - MATH_RCP(x))); ret = x < 0.0 ? AS_DOUBLE(QNANBITPATT_DP64) : ret; } } else { double r = MATH_RCP(x); double r2 = r*r; double p = MATH_PRIVATE(bp1)(r2) * r; ret = 0x1.9884533d43651p-1 * MATH_FAST_SQRT(r) * MATH_PRIVATE(ba1)(r2) * MATH_PRIVATE(sinb)(x, 1, p); ret = BUILTIN_CLASS_F64(x, CLASS_PINF) ? 0.0 : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/y1F.cl000066400000000000000000000115021415221260100200270ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "mathF.h" extern float MATH_PRIVATE(sinb)(float, int, float); extern CONSTATTR float MATH_PRIVATE(bp1)(float); extern CONSTATTR float MATH_PRIVATE(ba1)(float); CONSTATTR float MATH_MANGLE(y1)(float x) { const float b0 = 0.5f; const float b1 = 0.625f; const float b2 = 0.75f; const float b3 = 0.9375f; const float b4 = 1.21875f; const float b5 = 1.53125f; const float b6 = 1.84375f; const float b7 = 2.078125f; const float b8 = 2.3125f; const float b9 = 2.734375f; const float b10 = 3.15625f; const float b11 = 4.203125f; const float b12 = 4.6875f; const float b13 = 6.1875f; const float b14 = 7.76953125f; const float b15 = 9.359375f; const float b16 = 10.9375f; const float b17 = 12.5625f; float ret; if (x <= b17) { // Ty to maintain relative accuracy here USE_TABLE(float, p, M32_Y1); float ch, cl; if (x < b8) { if (x < b4) { if (x < b0) { ch = 0.0f; cl = 0.0f; p += 0*9; } else if (x < b1) { ch = 0x1.0p-1f; cl = 0.0f; p += 1*9; } else if (x < b2) { ch = 0x1.4p-1f; cl = 0.0f; p += 2*9; } else if (x < b3) { ch = 0x1.8p-1f; cl = 0.0f; p += 3*9; } else { ch = 0x1.ep-1f; cl = 0.0f; p += 4*9; } } else { if (x < b5) { ch = 0x1.38p+0f; cl = 0.0f; p += 5*9; } else if (x < b6) { ch = 0x1.88p+0f; cl = 0.0f; p += 6*9; } else if (x < b7) { ch = 0x1.d8p+0f; cl = 0.0f; p += 7*9; } else { ch = 0x1.193beep+1f; cl = -0x1.6401b8p-24f; p += 8*9; } } } else { if (x < b13) { if (x < b9) { ch = 0x1.28p+1f; cl = 0.0f; p += 9*9; } else if (x < b10) { ch = 0x1.5ep+1f; cl = 0.0f; p += 10*9; } else if (x < b11) { ch = 0x1.d76d4ap+1f; cl = 0x1.ff742ep-24f; p += 11*9; } else if (x < b12) { ch = 0x1.0dp+2f; cl = 0.0f; p += 12*9; } else { ch = 0x1.5b7fe4p+2f; cl = 0x1.d0f606p-23f; p += 13*9; } } else { if (x < b14) { ch = 0x1.bc418ap+2f; cl = -0x1.f4ef56p-23f; p += 14*9; } else if (x < b15) { ch = 0x1.13127ap+3f; cl = 0x1.cc2d36p-22f; p += 15*9; } else if (x < b16) { ch = 0x1.43f2eep+3f; cl = 0x1.47a32p-23f; p += 
16*9; } else { ch = 0x1.77f914p+3f; cl = -0x1.caf37ep-23f; p += 17*9; } } } float x2 = x*x; float xs = x - ch - cl; float t = x < b0 ? x2 : xs; ret = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, p[8], p[7]), p[6]), p[5]), p[4]), p[3]), p[2]), p[1]), p[0]); if (x < b0) { const float twobypi = 0x1.45f306p-1f; if (x < 0x1.0p-20f) ret = MATH_DIV(-twobypi, BUILTIN_ABS_F32(x)); else ret = MATH_MAD(ret, x, twobypi*(MATH_MANGLE(j1)(x) * MATH_MANGLE(log)(x) - MATH_RCP(x))); ret = x < 0.0f ? AS_FLOAT(QNANBITPATT_SP32) : ret; } } else { float r = MATH_RCP(x); float r2 = r*r; float p = MATH_PRIVATE(bp1)(r2) * r; ret = 0x1.988454p-1f * BUILTIN_RSQRT_F32(x) * MATH_PRIVATE(ba1)(r2) * MATH_PRIVATE(sinb)(x, 1, p); ret = BUILTIN_CLASS_F32(x, CLASS_PINF) ? 0.0f : ret; } return ret; } ROCm-Device-Libs-rocm-5.0.0/ocml/src/y1H.cl000066400000000000000000000006571415221260100200420ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "mathH.h" UGEN(y1) half MATH_MANGLE(y1)(half x) { return (half)MATH_UPMANGLE(y1)((float)x); } ROCm-Device-Libs-rocm-5.0.0/opencl/000077500000000000000000000000001415221260100166105ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/CMakeLists.txt000066400000000000000000000026441415221260100213560ustar00rootroot00000000000000##===-------------------------------------------------------------------------- ## ROCm Device Libraries ## ## This file is distributed under the University of Illinois Open Source ## License. See LICENSE.TXT for details. 
##===-------------------------------------------------------------------------- file(GLOB cl_sources ${CMAKE_CURRENT_SOURCE_DIR}/src/async/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/common/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/devenq/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/geometric/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/image/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/integer/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/math/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/media/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/misc/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/pipes/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/relational/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/subgroup/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/vldst/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/workgroup/*.cl ) file(GLOB sources ${cl_sources}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/integer) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/workgroup) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../irif/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../ocml/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../ockl/inc) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../oclc/inc) opencl_bc_lib(NAME opencl SOURCES ${sources}) ROCm-Device-Libs-rocm-5.0.0/opencl/src/000077500000000000000000000000001415221260100173775ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/async/000077500000000000000000000000001415221260100205145ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/async/awgcpy.cl000066400000000000000000000062721415221260100223350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _S(X) #X #define S(X) _S(X) #define ATTR __attribute__((overloadable)) #define IATTR #define AATTR(A) __attribute__((overloadable, alias(A))) #define BODY(D,S) \ size_t i; \ size_t d = mul24(mul24((int)get_local_size(0), (int)get_local_size(1)), (int)get_local_size(2)); \ for (i = get_local_linear_id(); istate is done and then start processing // WAIT_WORK_GROUP currently == WAIT_PARENT uint command_id; //!< [LWO/SRO] The unique command ID uint child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels. // It's incremented on the // start and decremented on the finish. The parent kernel can be considered as // done when the value is 0 and the state is DONE //!< [LWO/SRO] CL event for the current execution (clk_event_t) union { __global struct _AmdEvent *completion; ulong completion_padding; }; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) union { __global struct _AmdAqlWrap *parent_wrap; ulong parent_padding; }; union { __global size_t *wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) ulong wait_list_padding; }; uint wait_num; //!< [LWO/SRO] The number of cl_event_wait objects uint reserved[5]; //!< For the future usage hsa_kernel_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet - 64 bytes AQL packet } AmdAqlWrap; typedef struct _AmdEvent { uint state; //!< [LRO/SRW] Event state: START, END, COMPLETE uint counter; //!< [LRW] Event retain/release counter. 
0 means the event is free ulong timer[3]; //!< [LRO/SWO] Timer values for profiling for each state ulong capture_info; //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME } AmdEvent; // XXX this needs to match workgroup/wg.h MAX_WAVES_PER_SIMD #define CL_DEVICE_MAX_WORK_GROUP_SIZE 256 // ABI has 6 implicit trailing arguments: // global_offset[3], printf_buf, default vqueue pointer, and self AqlWrap pointer #define NUM_IMPLICIT_ARGS 6 static inline __global void * get_printf_ptr(void) { return (__global void *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3]); } static inline __global AmdVQueueHeader * get_vqueue(void) { return (__global AmdVQueueHeader *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[4]); } static inline __global AmdAqlWrap * get_aql_wrap(void) { return (__global AmdAqlWrap *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[5]); } // reserve a slot in a bitmask controlled resource // n is the number of slots static inline int reserve_slot(__global uint * restrict mask, uint n, uint mask_groups) { n >>= 5; uint j, k, v, vv, z; // Spread the starting points k = (get_local_linear_id() * mask_groups) % n; // Make only one pass for (j=0;j> 5)); uint v, vv; v = atomic_load_explicit(p, memory_order_relaxed, memory_scope_device); for (;;) { vv = v & b; if (atomic_compare_exchange_strong_explicit(p, &v, vv, memory_order_relaxed, memory_order_relaxed, memory_scope_device)) break; } } static inline uint align_up(uint start, uint align) { return (start + align - 1U) & -align; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/devenq/enqueue.cl000066400000000000000000000436661415221260100226670ustar00rootroot00000000000000 #include "devenq.h" #define LSIZE_LIMIT 65536U #define LOCAL_ALIGN 16 struct rtinfo { ulong kernel_object; uint private_segment_size; uint group_segment_size; }; static inline void copy_captured_context(__global void * restrict d, void * restrict s, uint size, uint align) { if (align == 8) { 
__global ulong * restrict d8 = (__global ulong * restrict)d; ulong * restrict s8 = (ulong * restrict)s; uint n = size / align; uint r = size % align; for (uint i=0; i 3) { *(__global uint * restrict)dd = *(uint * restrict)ss; dd += 4; ss += 4; r -= 4; } if (r > 1) { *(__global ushort * restrict)dd = *(ushort * restrict)ss; dd += 2; ss += 2; r -= 2; } if (r > 0) { *dd = *ss; } } } else if (align >= 16) { __global uint4 * restrict d16 = (__global uint4 * restrict)d; uint4 * restrict s16 = (uint4 * restrict)s; uint n = size / 16; uint r = size % 16; for (uint i=0; i 7) { *(__global ulong * restrict)dd = *(ulong * restrict)ss; dd += 8; ss += 8; r -= 8; } if (r > 3) { *(__global uint * restrict)dd = *(uint * restrict)ss; dd += 4; ss += 4; r -= 4; } if (r > 1) { *(__global ushort * restrict)dd = *(ushort * restrict)ss; dd += 2; ss += 2; r -= 2; } if (r > 0) { *dd = *ss; } } } else if (align == 4) { __global uint * restrict d4 = (__global uint * restrict)d; uint * restrict s4 = (uint * restrict)s; uint n = size / align; uint r = size % align; for (uint i=0; i 1) { *(__global ushort * restrict)dd = *(ushort * restrict)ss; dd += 2; ss += 2; r -= 2; } if (r > 0) { *dd = *ss; } } } else { __global char * restrict d1 = (__global char * restrict)d; char * restrict s1 = (char * restrict)s; for (uint i=0; icounter, (uint)1, memory_order_relaxed, memory_scope_device); dst[i] = src[i]; } } __attribute__((overloadable, always_inline, const)) queue_t get_default_queue(void) { return __builtin_astype(get_vqueue(), queue_t); } __attribute__((overloadable)) int enqueue_marker(queue_t q, uint nwl, const clk_event_t *wl, clk_event_t *ce) { __global AmdVQueueHeader *vq = __builtin_astype(q, __global AmdVQueueHeader *); if (nwl > vq->wait_size) return CLK_ENQUEUE_FAILURE; // Get a wrap slot __global uint *amask = (__global uint *)vq->aql_slot_mask; int ai = reserve_slot(amask, vq->aql_slot_num, vq->mask_groups); if (ai < 0) return CLK_ENQUEUE_FAILURE; // Get a return event slot __global 
uint *emask = (__global uint *)vq->event_slot_mask; int ei = reserve_slot(emask, vq->event_slot_num, 1); if (ei < 0) { release_slot(amask, ai); return CLK_ENQUEUE_FAILURE; } // Initialize return event __global AmdEvent *ev = (__global AmdEvent *)vq->event_slots + ei; ev->state = CL_SUBMITTED; ev->counter = 2; ev->capture_info = 0; // Initialize wrap __global AmdAqlWrap *me = get_aql_wrap(); __global AmdAqlWrap *aw = (__global AmdAqlWrap *)(vq + 1) + ai; aw->enqueue_flags = CLK_ENQUEUE_FLAGS_NO_WAIT; aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_relaxed, memory_scope_device); aw->child_counter = 0; aw->completion = ev; aw->parent_wrap = me; if (nwl > 0) copy_retain_waitlist((__global size_t *)aw->wait_list, (const size_t *)wl, nwl); aw->wait_num = nwl; // A marker is never enqueued so ignore displatch packet // Tell the scheduler atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_relaxed, memory_scope_device); atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_MARKER, memory_order_release, memory_scope_device); *ce = __builtin_astype(ev, clk_event_t); return 0; } int __enqueue_kernel_basic(queue_t q, kernel_enqueue_flags_t f, const ndrange_t r, void *block, void *capture) { uint csize = ((uint *)capture)[0]; uint calign = ((uint *)capture)[1]; __global AmdVQueueHeader *vq = __builtin_astype(q, __global AmdVQueueHeader *); if (align_up(csize, sizeof(size_t)) + NUM_IMPLICIT_ARGS*sizeof(size_t) > vq->arg_size || mul24(mul24((uint)r.localWorkSize[0], (uint)r.localWorkSize[1]), (uint)r.localWorkSize[2]) > CL_DEVICE_MAX_WORK_GROUP_SIZE) return CLK_ENQUEUE_FAILURE; // Get a queue slot __global uint *amask = (__global uint *)vq->aql_slot_mask; int ai = reserve_slot(amask, vq->aql_slot_num, vq->mask_groups); if (ai < 0) return CLK_ENQUEUE_FAILURE; __global AmdAqlWrap *aw = (__global AmdAqlWrap *)(vq + 1) + ai; // Set up kernarg 
copy_captured_context(aw->aql.kernarg_address, capture, csize, calign); __global size_t *implicit = (__global size_t *)((__global char *)aw->aql.kernarg_address + align_up(csize, sizeof(size_t))); implicit[0] = r.globalWorkOffset[0]; implicit[1] = r.globalWorkOffset[1]; implicit[2] = r.globalWorkOffset[2]; implicit[3] = (size_t)get_printf_ptr(); implicit[4] = (size_t)get_vqueue(); implicit[5] = (size_t)aw; const __global struct rtinfo *rti = (const __global struct rtinfo *)block; __global AmdAqlWrap *me = get_aql_wrap(); aw->enqueue_flags = f; aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_relaxed, memory_scope_device); aw->completion = 0UL; aw->parent_wrap = me; aw->wait_num = 0; aw->aql.header = (0x1 << 11) | (0x1 << 9) |(0x0 << 8) | (0x2 << 0); aw->aql.setup = r.workDimension; aw->aql.workgroup_size_x = (ushort)r.localWorkSize[0]; aw->aql.workgroup_size_y = (ushort)r.localWorkSize[1]; aw->aql.workgroup_size_z = (ushort)r.localWorkSize[2]; aw->aql.grid_size_x = (uint)r.globalWorkSize[0]; aw->aql.grid_size_y = (uint)r.globalWorkSize[1]; aw->aql.grid_size_z = (uint)r.globalWorkSize[2]; aw->aql.private_segment_size = rti->private_segment_size; aw->aql.group_segment_size = rti->group_segment_size; aw->aql.kernel_object = rti->kernel_object; aw->aql.completion_signal.handle = 0; atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_relaxed, memory_scope_device); atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); return 0; } int __enqueue_kernel_basic_events(queue_t q, kernel_enqueue_flags_t f, const ndrange_t r, uint nwl, const clk_event_t *wl, clk_event_t *ce, void *block, void *capture) { uint csize = ((uint *)capture)[0]; uint calign = ((uint *)capture)[1]; __global AmdVQueueHeader *vq = __builtin_astype(q, __global AmdVQueueHeader *); if (align_up(csize, sizeof(size_t)) + 
NUM_IMPLICIT_ARGS*sizeof(size_t) > vq->arg_size || nwl > vq->wait_size || mul24(mul24((uint)r.localWorkSize[0], (uint)r.localWorkSize[1]), (uint)r.localWorkSize[2]) > CL_DEVICE_MAX_WORK_GROUP_SIZE) return CLK_ENQUEUE_FAILURE; __global uint *amask = (__global uint *)vq->aql_slot_mask; int ai = reserve_slot(amask, vq->aql_slot_num, vq->mask_groups); if (ai < 0) return CLK_ENQUEUE_FAILURE; __global AmdEvent *ev = (__global AmdEvent *)NULL; if (ce) { // Get a completion event slot __global uint *emask = (__global uint *)vq->event_slot_mask; int ei = reserve_slot(emask, vq->event_slot_num, 1); if (ei < 0) { release_slot(amask, ai); return CLK_ENQUEUE_FAILURE; } // Initialize completion event ev = (__global AmdEvent *)vq->event_slots + ei; ev->state = CL_SUBMITTED; ev->counter = 2; ev->capture_info = 0; *ce = __builtin_astype(ev, clk_event_t); } __global AmdAqlWrap *aw = (__global AmdAqlWrap *)(vq + 1) + ai; // Set up kernarg copy_captured_context(aw->aql.kernarg_address, capture, csize, calign); __global size_t *implicit = (__global size_t *)((__global char *)aw->aql.kernarg_address + align_up(csize, sizeof(size_t))); implicit[0] = r.globalWorkOffset[0]; implicit[1] = r.globalWorkOffset[1]; implicit[2] = r.globalWorkOffset[2]; implicit[3] = (size_t)get_printf_ptr(); implicit[4] = (size_t)get_vqueue(); implicit[5] = (size_t)aw; const __global struct rtinfo *rti = (const __global struct rtinfo *)block; __global AmdAqlWrap *me = get_aql_wrap(); aw->enqueue_flags = f; aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_relaxed, memory_scope_device); aw->completion = ev; aw->parent_wrap = me; if (nwl > 0) copy_retain_waitlist(aw->wait_list, (const size_t *)wl, nwl); aw->wait_num = nwl; aw->aql.header = (ushort)((0x1 << 11) | (0x1 << 9) |(0x0 << 8) | (0x2 << 0)); aw->aql.setup = (ushort)r.workDimension; aw->aql.workgroup_size_x = (ushort)r.localWorkSize[0]; aw->aql.workgroup_size_y = (ushort)r.localWorkSize[1]; 
aw->aql.workgroup_size_z = (ushort)r.localWorkSize[2]; aw->aql.grid_size_x = (uint)r.globalWorkSize[0]; aw->aql.grid_size_y = (uint)r.globalWorkSize[1]; aw->aql.grid_size_z = (uint)r.globalWorkSize[2]; aw->aql.private_segment_size = rti->private_segment_size; aw->aql.group_segment_size = rti->group_segment_size; aw->aql.kernel_object = rti->kernel_object; aw->aql.completion_signal.handle = 0; atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_relaxed, memory_scope_device); atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); return 0; } int __enqueue_kernel_varargs(queue_t q, kernel_enqueue_flags_t f, const ndrange_t r, void *block, void *capture, uint nl, __private size_t *ll) { uint csize = ((uint *)capture)[0]; uint calign = ((uint *)capture)[1]; const __global struct rtinfo *rti = (const __global struct rtinfo *)block; uint lo = rti->group_segment_size; for (uint il=0; il LSIZE_LIMIT || align_up(align_up(csize, sizeof(uint)) + nl*sizeof(uint), sizeof(size_t)) + NUM_IMPLICIT_ARGS*sizeof(size_t) > vq->arg_size || mul24(mul24((uint)r.localWorkSize[0], (uint)r.localWorkSize[1]), (uint)r.localWorkSize[2]) > CL_DEVICE_MAX_WORK_GROUP_SIZE) return CLK_ENQUEUE_FAILURE; // Get a queue slot __global uint *amask = (__global uint *)vq->aql_slot_mask; int ai = reserve_slot(amask, vq->aql_slot_num, vq->mask_groups); if (ai < 0) return CLK_ENQUEUE_FAILURE; __global AmdAqlWrap *aw = (__global AmdAqlWrap *)(vq + 1) + ai; // Set up kernarg copy_captured_context(aw->aql.kernarg_address, capture, csize, calign); __global uint *la = (__global uint *)((__global char *)aw->aql.kernarg_address + align_up(csize, sizeof(uint))); lo = rti->group_segment_size; for (uint il=0; ilaql.kernarg_address + align_up(align_up(csize, sizeof(uint)) + nl*sizeof(uint), sizeof(size_t))); implicit[0] = r.globalWorkOffset[0]; implicit[1] = r.globalWorkOffset[1]; implicit[2] = r.globalWorkOffset[2]; 
implicit[3] = (size_t)get_printf_ptr(); implicit[4] = (size_t)get_vqueue(); implicit[5] = (size_t)aw; __global AmdAqlWrap *me = get_aql_wrap(); aw->enqueue_flags = f; aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_relaxed, memory_scope_device); aw->completion = 0UL; aw->parent_wrap = me; aw->wait_num = 0; aw->aql.header = (0x1 << 11) | (0x1 << 9) |(0x0 << 8) | (0x2 << 0); aw->aql.setup = r.workDimension; aw->aql.workgroup_size_x = (ushort)r.localWorkSize[0]; aw->aql.workgroup_size_y = (ushort)r.localWorkSize[1]; aw->aql.workgroup_size_z = (ushort)r.localWorkSize[2]; aw->aql.grid_size_x = (uint)r.globalWorkSize[0]; aw->aql.grid_size_y = (uint)r.globalWorkSize[1]; aw->aql.grid_size_z = (uint)r.globalWorkSize[2]; aw->aql.private_segment_size = rti->private_segment_size; aw->aql.group_segment_size = lo; aw->aql.kernel_object = rti->kernel_object; aw->aql.completion_signal.handle = 0; atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_relaxed, memory_scope_device); atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); return 0; } int __enqueue_kernel_events_varargs(queue_t q, kernel_enqueue_flags_t f, const ndrange_t r, int nwl, const clk_event_t *wl, clk_event_t *ce, void *block, void *capture, uint nl, __private size_t *ll) { uint csize = ((uint *)capture)[0]; uint calign = ((uint *)capture)[1]; const __global struct rtinfo *rti = (const __global struct rtinfo *)block; uint lo = rti->group_segment_size; for (uint il=0; il LSIZE_LIMIT || nwl > vq->wait_size || align_up(align_up(csize, sizeof(uint)) + nl*sizeof(uint), sizeof(size_t)) + NUM_IMPLICIT_ARGS*sizeof(size_t) > vq->arg_size || mul24(mul24((uint)r.localWorkSize[0], (uint)r.localWorkSize[1]), (uint)r.localWorkSize[2]) > CL_DEVICE_MAX_WORK_GROUP_SIZE) return CLK_ENQUEUE_FAILURE; // Get a queue slot __global uint *amask = (__global uint 
*)vq->aql_slot_mask; int ai = reserve_slot(amask, vq->aql_slot_num, vq->mask_groups); if (ai < 0) return CLK_ENQUEUE_FAILURE; __global AmdEvent *ev = (__global AmdEvent *)NULL; if (ce) { // Get a completion event slot __global uint *emask = (__global uint *)vq->event_slot_mask; int ei = reserve_slot(emask, vq->event_slot_num, 1); if (ei < 0) { release_slot(amask, ai); return CLK_ENQUEUE_FAILURE; } // Initialize completion event ev = (__global AmdEvent *)vq->event_slots + ei; ev->state = CL_SUBMITTED; ev->counter = 2; ev->capture_info = 0; *ce = __builtin_astype(ev, clk_event_t); } __global AmdAqlWrap *aw = (__global AmdAqlWrap *)(vq + 1) + ai; // Set up kernarg copy_captured_context(aw->aql.kernarg_address, capture, csize, calign); __global uint *la = (__global uint *)((__global char *)aw->aql.kernarg_address + align_up(csize, sizeof(uint))); lo = rti->group_segment_size; for (uint il=0; ilaql.kernarg_address + align_up(align_up(csize, sizeof(uint)) + nl*sizeof(uint), sizeof(size_t))); implicit[0] = r.globalWorkOffset[0]; implicit[1] = r.globalWorkOffset[1]; implicit[2] = r.globalWorkOffset[2]; implicit[3] = (size_t)get_printf_ptr(); implicit[4] = (size_t)get_vqueue(); implicit[5] = (size_t)aw; __global AmdAqlWrap *me = get_aql_wrap(); aw->enqueue_flags = f; aw->command_id = atomic_fetch_add_explicit((__global atomic_uint *)&vq->command_counter, (uint)1, memory_order_relaxed, memory_scope_device); aw->completion = ev; aw->parent_wrap = me; if (nwl > 0) copy_retain_waitlist((__global size_t *)aw->wait_list, (const size_t *)wl, nwl); aw->wait_num = nwl; aw->aql.header = (0x1 << 11) | (0x1 << 9) |(0x0 << 8) | (0x2 << 0); aw->aql.setup = r.workDimension; aw->aql.workgroup_size_x = (ushort)r.localWorkSize[0]; aw->aql.workgroup_size_y = (ushort)r.localWorkSize[1]; aw->aql.workgroup_size_z = (ushort)r.localWorkSize[2]; aw->aql.grid_size_x = (uint)r.globalWorkSize[0]; aw->aql.grid_size_y = (uint)r.globalWorkSize[1]; aw->aql.grid_size_z = (uint)r.globalWorkSize[2]; 
aw->aql.private_segment_size = rti->private_segment_size; aw->aql.group_segment_size = lo; aw->aql.kernel_object = rti->kernel_object; aw->aql.completion_signal.handle = 0; atomic_fetch_add_explicit((__global atomic_uint *)&me->child_counter, (uint)1, memory_order_relaxed, memory_scope_device); atomic_store_explicit((__global atomic_uint *)&aw->state, AQL_WRAP_READY, memory_order_release, memory_scope_device); return 0; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/devenq/events.cl000066400000000000000000000047361415221260100225170ustar00rootroot00000000000000 #include "devenq.h" #define ATTR __attribute__((overloadable, always_inline)) ATTR void retain_event(clk_event_t e) { __global AmdEvent *ev = __builtin_astype(e, __global AmdEvent *); atomic_fetch_add_explicit((__global atomic_uint *)&ev->counter, (uint)1, memory_order_relaxed, memory_scope_device); } ATTR void release_event(clk_event_t e) { __global AmdEvent *ev = __builtin_astype(e, __global AmdEvent *); uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, (uint)1, memory_order_relaxed, memory_scope_device); if (c == 1U) { __global AmdVQueueHeader *vq = get_vqueue(); __global uint *emask = (__global uint *)vq->event_slot_mask; __global AmdEvent *eb = (__global AmdEvent *)vq->event_slots; uint i = ev - eb; release_slot(emask, i); } } ATTR clk_event_t create_user_event(void) { __global AmdVQueueHeader *vq = get_vqueue(); __global uint *emask = (__global uint *)vq->event_slot_mask; int i = reserve_slot(emask, vq->event_slot_num, 1); if (i >= 0) { __global AmdEvent *ev = (__global AmdEvent *)vq->event_slots + i; ev->state = CL_SUBMITTED; ev->counter = 1; ev->capture_info = 0; return __builtin_astype(ev, clk_event_t); } else return __builtin_astype((ulong)0, clk_event_t); } ATTR bool is_valid_event(clk_event_t e) { return __builtin_astype(e, ulong) != (ulong)0; } ATTR void set_user_event_status(clk_event_t e, int s) { __global AmdEvent *ev = __builtin_astype(e, __global AmdEvent *); 
atomic_store_explicit((__global atomic_uint *)&ev->state, (uint)s, memory_order_release, memory_scope_device); } ATTR void capture_event_profiling_info(clk_event_t e, clk_profiling_info n, __global void *p) { // Currently the second argument must be CLK_PROFILING_COMMAND_EXEC_TIME __global AmdEvent *ev = __builtin_astype(e, __global AmdEvent *); // Set the pointer now in case we're racing with the scheduler atomic_store_explicit((__global atomic_ulong *)&ev->capture_info, (ulong)p, memory_order_relaxed, memory_scope_device); uint state = atomic_load_explicit((__global atomic_uint *)&ev->state, memory_order_acquire, memory_scope_device); if (state == CL_COMPLETE) { __global ulong *t = (__global ulong *)ev->timer; ((__global ulong *)p)[0] = t[PROFILING_COMMAND_END] - t[PROFILING_COMMAND_START]; ((__global ulong *)p)[1] = t[PROFILING_COMMAND_COMPLETE] - t[PROFILING_COMMAND_START]; } } ROCm-Device-Libs-rocm-5.0.0/opencl/src/devenq/getkern.cl000066400000000000000000000011741415221260100226430ustar00rootroot00000000000000 #include "devenq.h" __attribute__((always_inline, const)) uint __get_kernel_work_group_size_impl(void *b, void *c) { return (uint)CL_DEVICE_MAX_WORK_GROUP_SIZE; } __attribute__((always_inline, const)) uint __get_kernel_preferred_work_group_size_multiple_impl(void *b, void *c) { return 64U; } // 2.1 Reference card mentions // uint get_kernel_sub_group_count_for_ndrange(ndrange_t, block); // --> __get_kernel_sub_group_count_for_ndrange_impl(ndrange_t, void *, void *); // uint get_kernel_max_sub_group_size_for_ndrange(ndrange_t, block); // --> __get_kernel_max_sub_group_size_for_ndrange_impl(ndrange_t, void *, void *); ROCm-Device-Libs-rocm-5.0.0/opencl/src/devenq/ndrange.cl000066400000000000000000000100431415221260100226150ustar00rootroot00000000000000 #include "devenq.h" #define ATTR __attribute__((overloadable, always_inline, const)) // 1D variants ATTR ndrange_t ndrange_1D(size_t gws) { ndrange_t ret; ret.workDimension = 1; ret.globalWorkOffset[0] = 0; 
ret.globalWorkOffset[1] = 0; ret.globalWorkOffset[2] = 0; ret.globalWorkSize[0] = gws; ret.globalWorkSize[1] = 1; ret.globalWorkSize[2] = 1; ret.localWorkSize[0] = min(gws, (size_t)64); ret.localWorkSize[1] = 1; ret.localWorkSize[2] = 1; return ret; } ATTR ndrange_t ndrange_1D(size_t gws, size_t lws) { ndrange_t ret; ret.workDimension = 1; ret.globalWorkOffset[0] = 0; ret.globalWorkOffset[1] = 0; ret.globalWorkOffset[2] = 0; ret.globalWorkSize[0] = gws; ret.globalWorkSize[1] = 1; ret.globalWorkSize[2] = 1; ret.localWorkSize[0] = lws; ret.localWorkSize[1] = 1; ret.localWorkSize[2] = 1; return ret; } ATTR ndrange_t ndrange_1D(size_t goff, size_t gws, size_t lws) { ndrange_t ret; ret.workDimension = 1; ret.globalWorkOffset[0] = goff; ret.globalWorkOffset[1] = 0; ret.globalWorkOffset[2] = 0; ret.globalWorkSize[0] = gws; ret.globalWorkSize[1] = 1; ret.globalWorkSize[2] = 1; ret.localWorkSize[0] = lws; ret.localWorkSize[1] = 1; ret.localWorkSize[2] = 1; return ret; } // 2D variants ATTR ndrange_t ndrange_2D(const size_t gws[2]) { ndrange_t ret; ret.workDimension = 2; ret.globalWorkOffset[0] = 0; ret.globalWorkOffset[1] = 0; ret.globalWorkOffset[2] = 0; ret.globalWorkSize[0] = gws[0]; ret.globalWorkSize[1] = gws[1]; ret.globalWorkSize[2] = 1; ret.localWorkSize[0] = min(gws[0], (size_t)8); ret.localWorkSize[1] = min(gws[1], (size_t)8); ret.localWorkSize[2] = 1; return ret; } ATTR ndrange_t ndrange_2D(const size_t gws[2], const size_t lws[2]) { ndrange_t ret; ret.workDimension = 2; ret.globalWorkOffset[0] = 0; ret.globalWorkOffset[1] = 0; ret.globalWorkOffset[2] = 0; ret.globalWorkSize[0] = gws[0]; ret.globalWorkSize[1] = gws[1]; ret.globalWorkSize[2] = 1; ret.localWorkSize[0] = lws[0]; ret.localWorkSize[1] = lws[1]; ret.localWorkSize[2] = 1; return ret; } ATTR ndrange_t ndrange_2D(const size_t goff[2], const size_t gws[2], const size_t lws[2]) { ndrange_t ret; ret.workDimension = 2; ret.globalWorkOffset[0] = goff[0]; ret.globalWorkOffset[1] = goff[1]; 
ret.globalWorkOffset[2] = 0; ret.globalWorkSize[0] = gws[0]; ret.globalWorkSize[1] = gws[1]; ret.globalWorkSize[2] = 1; ret.localWorkSize[0] = lws[0]; ret.localWorkSize[1] = lws[1]; ret.localWorkSize[2] = 1; return ret; } // 3D variants ATTR ndrange_t ndrange_3D(const size_t gws[3]) { ndrange_t ret; ret.workDimension = 3; ret.globalWorkOffset[0] = 0; ret.globalWorkOffset[1] = 0; ret.globalWorkOffset[2] = 0; ret.globalWorkSize[0] = gws[0]; ret.globalWorkSize[1] = gws[1]; ret.globalWorkSize[2] = gws[2]; ret.localWorkSize[0] = min(gws[0], (size_t)4); ret.localWorkSize[1] = min(gws[1], (size_t)4); ret.localWorkSize[2] = min(gws[2], (size_t)4); return ret; } ATTR ndrange_t ndrange_3D(const size_t gws[3], const size_t lws[3]) { ndrange_t ret; ret.workDimension = 3; ret.globalWorkOffset[0] = 0; ret.globalWorkOffset[1] = 0; ret.globalWorkOffset[2] = 0; ret.globalWorkSize[0] = gws[0]; ret.globalWorkSize[1] = gws[1]; ret.globalWorkSize[2] = gws[2]; ret.localWorkSize[0] = lws[0]; ret.localWorkSize[1] = lws[1]; ret.localWorkSize[2] = lws[2]; return ret; } ATTR ndrange_t ndrange_3D(const size_t goff[3], const size_t gws[3], const size_t lws[3]) { ndrange_t ret; ret.workDimension = 3; ret.globalWorkOffset[0] = goff[0]; ret.globalWorkOffset[1] = goff[1]; ret.globalWorkOffset[2] = goff[2]; ret.globalWorkSize[0] = gws[0]; ret.globalWorkSize[1] = gws[1]; ret.globalWorkSize[2] = gws[2]; ret.localWorkSize[0] = lws[0]; ret.localWorkSize[1] = lws[1]; ret.localWorkSize[2] = lws[2]; return ret; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/devenq/schedule_pal.cl000066400000000000000000000256611415221260100236430ustar00rootroot00000000000000 #include "devenq.h" typedef struct _SchedulerParam { uint signal; //!< Signal to stop the child queue uint eng_clk; //!< Engine clock in Mhz ulong hw_queue; //!< Address to HW queue ulong hsa_queue; //!< Address to HSA dummy queue uint useATC; //!< GPU access to shader program by ATC. 
uint scratchSize; //!< Scratch buffer size ulong scratch; //!< GPU address to the scratch buffer uint numMaxWaves; //!< Num max waves on the asic uint releaseHostCP; //!< Releases CP on the host queue union { __global AmdAqlWrap* parentAQL; //!< Host parent AmdAqlWrap packet ulong pad_parentAQL; }; uint dedicatedQueue; //!< Scheduler uses a dedicated queue uint scratchOffset; //!< Scratch buffer offset uint reserved[2]; //!< Processed mask groups by one thread } SchedulerParam; static inline int checkWaitEvents(__global AmdEvent** events, uint numEvents) { for (uint i = 0; i < numEvents; ++i) { int status = atomic_load_explicit((__global atomic_uint*)(&events[i]->state), memory_order_relaxed, memory_scope_device); if (status != CL_COMPLETE) return status < 0 ? -1 : 0; } return 1; } static inline void releaseEvent(__global AmdEvent* ev, __global uint* emask, __global AmdEvent* eb) { uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, 1U, memory_order_relaxed, memory_scope_device); if (c == 1U) { uint i = ev - eb; release_slot(emask, i); } } static inline void releaseWaitEvents(__global AmdEvent** events, uint numEvents, __global uint* emask, __global AmdEvent* eb) { for (uint i = 0; i < numEvents; ++i) { releaseEvent(events[i], emask, eb); } } static inline uint min_command(uint slot_num, __global AmdAqlWrap* wraps) { uint minCommand = 0xffffffff; for (uint idx = 0; idx < slot_num; ++idx) { __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx]; uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state), memory_order_relaxed, memory_scope_device); if ((slotState != AQL_WRAP_FREE) && (slotState != AQL_WRAP_RESERVED)) { minCommand = min(disp->command_id, minCommand); } } return minCommand; } extern uint GetCmdTemplateHeaderSize(void); extern uint GetCmdTemplateDispatchSize(void); extern void EmptyCmdTemplateDispatch(ulong cmdBuf); extern void RunCmdTemplateDispatch( ulong cmdBuf, __global hsa_kernel_dispatch_packet_t* 
aqlPkt, ulong scratch, ulong hsaQueue, uint scratchSize, uint scratchOffset, uint numMaxWaves, uint useATC); void __amd_scheduler_pal( __global AmdVQueueHeader* queue, __global SchedulerParam* params, uint paramIdx) { __global SchedulerParam* param = ¶ms[paramIdx]; ulong hwDisp = param->hw_queue + GetCmdTemplateHeaderSize(); __global AmdAqlWrap* hostParent = param->parentAQL; __global uint* counter = (__global uint*)(&hostParent->child_counter); __global uint* signal = (__global uint*)(¶m->signal); __global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1]; __global uint* amask = (__global uint *)queue->aql_slot_mask; //! @todo This is an unexplained behavior. //! The scheduler can be launched one more time after termination. if (1 == atomic_load_explicit((__global atomic_uint*)¶m->releaseHostCP, memory_order_acquire, memory_scope_device)) { return; } int launch = 0; int grpId = get_group_id(0); hwDisp += GetCmdTemplateDispatchSize() * grpId; uint mskGrp = queue->mask_groups; for (uint m = 0; m < mskGrp && launch == 0; ++m) { uint maskId = grpId * mskGrp + m; uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[maskId]), memory_order_relaxed, memory_scope_device); int baseIdx = maskId << 5; while (mask != 0) { uint sIdx = ctz(mask); uint idx = baseIdx + sIdx; mask &= ~(1 << sIdx); __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx]; uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state), memory_order_acquire, memory_scope_device); __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap); __global AmdEvent* event = (__global AmdEvent*)(disp->completion); // Check if the current slot is ready for processing if (slotState == AQL_WRAP_READY) { if (launch == 0) { // Attempt to find a new dispatch if nothing was launched yet uint parentState = atomic_load_explicit((__global atomic_uint*)(&parent->state), memory_order_relaxed, memory_scope_device); uint enqueueFlags = atomic_load_explicit((__global 
atomic_uint*)(&disp->enqueue_flags), memory_order_relaxed, memory_scope_device); // Check the launch flags if (((enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) || (enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) && (parentState != AQL_WRAP_DONE)) { continue; } // Check if the wait list is COMPLETE launch = checkWaitEvents((__global AmdEvent**)(disp->wait_list), disp->wait_num); if (launch != 0) { if (event != 0) { event->timer[PROFILING_COMMAND_START] = ((ulong)__builtin_readcyclecounter() * (ulong)param->eng_clk) >> 10; } if (launch > 0) { // Launch child kernel .... RunCmdTemplateDispatch(hwDisp, &disp->aql, param->scratch, param->hsa_queue, param->scratchSize, param->scratchOffset, param->numMaxWaves, param->useATC); } else if (event != 0) { event->state = -1; } atomic_store_explicit((__global atomic_uint*)&disp->state, AQL_WRAP_BUSY, memory_order_relaxed, memory_scope_device); releaseWaitEvents((__global AmdEvent**)(disp->wait_list), disp->wait_num, (__global uint*)queue->event_slot_mask, (__global AmdEvent*)queue->event_slots); break; } } } else if (slotState == AQL_WRAP_MARKER) { bool complete = false; if (disp->wait_num == 0) { uint minCommand = min_command(queue->aql_slot_num, wraps); complete = disp->command_id == minCommand; } else { int status = checkWaitEvents((__global AmdEvent**)(disp->wait_list), disp->wait_num); // Check if the wait list is COMPLETE if (status != 0) { complete = true; releaseWaitEvents((__global AmdEvent**)(disp->wait_list), disp->wait_num, (__global uint*)queue->event_slot_mask, (__global AmdEvent*)queue->event_slots); if (status < 0) event->state = -1; } } if (complete) { // Decrement the child execution counter on the parent atomic_fetch_sub_explicit((__global atomic_uint*)&parent->child_counter, 1, memory_order_relaxed, memory_scope_device); if (event->state >= 0) event->state = CL_COMPLETE; atomic_store_explicit((__global atomic_uint*)&disp->state, AQL_WRAP_FREE, memory_order_relaxed, memory_scope_device); 
release_slot(amask, idx); releaseEvent(event, (__global uint*)queue->event_slot_mask, (__global AmdEvent*)queue->event_slots); } } else if ((slotState == AQL_WRAP_BUSY) || (slotState == AQL_WRAP_DONE)) { if (slotState == AQL_WRAP_BUSY) { atomic_store_explicit((__global atomic_uint*)&disp->state, AQL_WRAP_DONE, memory_order_relaxed, memory_scope_device); if (event != 0) { event->timer[PROFILING_COMMAND_END] = ((ulong)__builtin_readcyclecounter() * (ulong)param->eng_clk) >> 10; } } // Was CL_EVENT requested? if (event != 0) { // The current dispatch doesn't have any outstanding children if (disp->child_counter == 0) { event->timer[PROFILING_COMMAND_COMPLETE] = ((ulong)__builtin_readcyclecounter() * (ulong)param->eng_clk) >> 10; if (event->state >= 0) { event->state = CL_COMPLETE; } if (event->capture_info != 0) { __global ulong* values = (__global ulong*)event->capture_info; values[0] = event->timer[PROFILING_COMMAND_END] - event->timer[PROFILING_COMMAND_START]; values[1] = event->timer[PROFILING_COMMAND_COMPLETE] - event->timer[PROFILING_COMMAND_START]; } releaseEvent(event, (__global uint *)queue->event_slot_mask, (__global AmdEvent *)queue->event_slots); } } // The current dispatch doesn't have any outstanding children if (disp->child_counter == 0) { // Decrement the child execution counter on the parent atomic_fetch_sub_explicit((__global atomic_uint*)&parent->child_counter, 1, memory_order_relaxed, memory_scope_device); atomic_store_explicit((__global atomic_uint*)&disp->state, AQL_WRAP_FREE, memory_order_relaxed, memory_scope_device); release_slot(amask, idx); } } } } if (launch <= 0) { EmptyCmdTemplateDispatch(hwDisp); } __global atomic_uint *againptr = param->dedicatedQueue ? (__global atomic_uint*)¶m->signal : (__global atomic_uint*)&hostParent->child_counter; uint again = atomic_load_explicit(againptr, memory_order_relaxed, memory_scope_device); if (!again) { //! \todo Write deadcode to the template, but somehow //! 
the scheduler will be launched one more time. atomic_store_explicit((__global atomic_uint*)hwDisp, 0xdeadc0de, memory_order_relaxed, memory_scope_device); atomic_store_explicit((__global atomic_uint*)¶m->signal, 0, memory_order_relaxed, memory_scope_device); atomic_store_explicit((__global atomic_uint*)¶m->releaseHostCP, 1, memory_order_relaxed, memory_scope_device); } } ROCm-Device-Libs-rocm-5.0.0/opencl/src/devenq/schedule_rocm.cl000066400000000000000000000272641415221260100240300ustar00rootroot00000000000000 #include "ockl_hsa.h" #include "devenq.h" typedef struct _SchedulerParam { ulong kernarg_address; //!< set to the VM address of SchedulerParam ulong hidden_global_offset_x; //!< set to 0 before queuing the scheduler ulong hidden_global_offset_y; //!< set to 0 before queuing the scheduler ulong hidden_global_offset_z; //!< set to 0 before queuing the scheduler ulong thread_counter; //!< set to 0 before queuing the scheduler __global hsa_queue_t* child_queue; //!< set to the device queue the child kernels will be queued to hsa_kernel_dispatch_packet_t scheduler_aql; //!< Dispatch packet used to relaunch the scheduler hsa_signal_t complete_signal; //!< Notify the host queue to continue processing __global AmdVQueueHeader* vqueue_header; //!< The vqueue uint signal; //!< Signal to stop the child queue uint eng_clk; //!< Engine clock in Mhz __global AmdAqlWrap* parentAQL; //!< Host parent AmdAqlWrap packet ulong write_index; //!< Write Index to the child queue } SchedulerParam; static inline int checkWaitEvents(__global AmdEvent** events, uint numEvents) { for (uint i = 0; i < numEvents; ++i) { int status = atomic_load_explicit((__global atomic_uint*)(&events[i]->state), memory_order_relaxed, memory_scope_device); if (status != CL_COMPLETE) return status < 0 ? 
-1 : 0; } return 1; } static inline void releaseEvent(__global AmdEvent* ev, __global uint* emask, __global AmdEvent* eb) { uint c = atomic_fetch_sub_explicit((__global atomic_uint *)&ev->counter, 1U, memory_order_relaxed, memory_scope_device); if (c == 1U) { uint i = ev - eb; release_slot(emask, i); } } static inline void releaseWaitEvents(__global AmdEvent** events, uint numEvents, __global uint* emask, __global AmdEvent* eb) { for (uint i = 0; i < numEvents; ++i) { releaseEvent(events[i], emask, eb); } } static inline uint min_command(uint slot_num, __global AmdAqlWrap* wraps) { uint minCommand = 0xffffffff; for (uint idx = 0; idx < slot_num; ++idx) { __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx]; uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state), memory_order_relaxed, memory_scope_device); if ((slotState != AQL_WRAP_FREE) && (slotState != AQL_WRAP_RESERVED)) { minCommand = min(disp->command_id, minCommand); } } return minCommand; } static inline void EnqueueDispatch(__global hsa_kernel_dispatch_packet_t* aqlPkt, __global SchedulerParam* param) { __global hsa_queue_t* child_queue = param->child_queue; // ulong index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed); // The original code seen above relies on PCIe 3 atomics, which might not be supported on some systems, so use a device side global // for workaround. 
ulong index = atomic_fetch_add_explicit((__global atomic_ulong*)¶m->write_index, (ulong)1, memory_order_relaxed, memory_scope_device); const ulong queueMask = child_queue->size - 1; __global hsa_kernel_dispatch_packet_t* dispatch_packet = &(((__global hsa_kernel_dispatch_packet_t*)(child_queue->base_address))[index & queueMask]); *dispatch_packet = *aqlPkt; } static inline void EnqueueScheduler(__global SchedulerParam* param) { __global hsa_queue_t* child_queue = param->child_queue; // ulong index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed); // The original code seen above relies on PCIe 3 atomics, which might not be supported on some systems, so use a device side global // for workaround. ulong index = atomic_fetch_add_explicit((__global atomic_ulong*)¶m->write_index, (ulong)1, memory_order_relaxed, memory_scope_device); const ulong queueMask = child_queue->size - 1; __global hsa_kernel_dispatch_packet_t* dispatch_packet = &(((__global hsa_kernel_dispatch_packet_t*)(child_queue->base_address))[index & queueMask]); *dispatch_packet = param->scheduler_aql; // This is part of the PCIe 3 atomics workaround, to write the final write_index value back to the child_queue __ockl_hsa_queue_store_write_index(child_queue, index + 1, __ockl_memory_order_relaxed); __ockl_hsa_signal_store(child_queue->doorbell_signal, index, __ockl_memory_order_release); } void __amd_scheduler_rocm(__global SchedulerParam* param) { __global AmdVQueueHeader* queue = (__global AmdVQueueHeader*)(param->vqueue_header); __global AmdAqlWrap* wraps = (__global AmdAqlWrap*)&queue[1]; __global uint* amask = (__global uint *)queue->aql_slot_mask; int launch = 0; int grpId = get_group_id(0); uint mskGrp = queue->mask_groups; for (uint m = 0; m < mskGrp && launch == 0; ++m) { uint maskId = grpId * mskGrp + m; uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[maskId]), memory_order_relaxed, memory_scope_device); int baseIdx = maskId << 5; while (mask != 0) { 
uint sIdx = ctz(mask); uint idx = baseIdx + sIdx; mask &= ~(1 << sIdx); __global AmdAqlWrap* disp = (__global AmdAqlWrap*)&wraps[idx]; uint slotState = atomic_load_explicit((__global atomic_uint*)(&disp->state), memory_order_acquire, memory_scope_device); __global AmdAqlWrap* parent = (__global AmdAqlWrap*)(disp->parent_wrap); __global AmdEvent* event = (__global AmdEvent*)(disp->completion); // Check if the current slot is ready for processing if (slotState == AQL_WRAP_READY) { if (launch == 0) { // Attempt to find a new dispatch if nothing was launched yet uint parentState = atomic_load_explicit((__global atomic_uint*)(&parent->state), memory_order_relaxed, memory_scope_device); uint enqueueFlags = atomic_load_explicit( (__global atomic_uint*)(&disp->enqueue_flags), memory_order_relaxed, memory_scope_device); // Check the launch flags if (((enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_KERNEL) || (enqueueFlags == CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP)) && (parentState != AQL_WRAP_DONE)) { continue; } // Check if the wait list is COMPLETE launch = checkWaitEvents((__global AmdEvent**)(disp->wait_list), disp->wait_num); if (launch != 0) { if (event != 0) { event->timer[PROFILING_COMMAND_START] = ((ulong)__builtin_readcyclecounter() * (ulong)param->eng_clk) >> 10; } if (launch > 0) { // Launch child kernel .... 
EnqueueDispatch(&disp->aql, param); } else if (event != 0) { event->state = -1; } atomic_store_explicit((__global atomic_uint*)&disp->state, AQL_WRAP_BUSY, memory_order_relaxed, memory_scope_device); releaseWaitEvents((__global AmdEvent**)(disp->wait_list), disp->wait_num, (__global uint*)queue->event_slot_mask, (__global AmdEvent*)queue->event_slots); break; } } } else if (slotState == AQL_WRAP_MARKER) { bool complete = false; if (disp->wait_num == 0) { uint minCommand = min_command(queue->aql_slot_num, wraps); complete = disp->command_id == minCommand; } else { int status = checkWaitEvents((__global AmdEvent**)(disp->wait_list), disp->wait_num); // Check if the wait list is COMPLETE if (status != 0) { complete = true; releaseWaitEvents((__global AmdEvent**)(disp->wait_list), disp->wait_num, (__global uint*)queue->event_slot_mask, (__global AmdEvent*)queue->event_slots); if (status < 0) event->state = -1; } } if (complete) { // Decrement the child execution counter on the parent atomic_fetch_sub_explicit((__global atomic_uint*)&parent->child_counter, 1, memory_order_relaxed, memory_scope_device); if (event->state >= 0) event->state = CL_COMPLETE; atomic_store_explicit((__global atomic_uint*)&disp->state, AQL_WRAP_FREE, memory_order_relaxed, memory_scope_device); release_slot(amask, idx); releaseEvent(event, (__global uint*)queue->event_slot_mask, (__global AmdEvent*)queue->event_slots); } } else if ((slotState == AQL_WRAP_BUSY) || (slotState == AQL_WRAP_DONE)) { if (slotState == AQL_WRAP_BUSY) { atomic_store_explicit((__global atomic_uint*)&disp->state, AQL_WRAP_DONE, memory_order_relaxed, memory_scope_device); if (event != 0) { event->timer[PROFILING_COMMAND_END] = ((ulong)__builtin_readcyclecounter() * (ulong)param->eng_clk) >> 10; } } // Was CL_EVENT requested? 
if (event != 0) { // The current dispatch doesn't have any outstanding children if (disp->child_counter == 0) { event->timer[PROFILING_COMMAND_COMPLETE] = ((ulong)__builtin_readcyclecounter() * (ulong)param->eng_clk) >> 10; if (event->state >= 0) { event->state = CL_COMPLETE; } if (event->capture_info != 0) { __global ulong* values = (__global ulong*)event->capture_info; values[0] = event->timer[PROFILING_COMMAND_END] - event->timer[PROFILING_COMMAND_START]; values[1] = event->timer[PROFILING_COMMAND_COMPLETE] - event->timer[PROFILING_COMMAND_START]; } releaseEvent(event, (__global uint *)queue->event_slot_mask, (__global AmdEvent *)queue->event_slots); } } // The current dispatch doesn't have any outstanding children if (disp->child_counter == 0) { // Decrement the child execution counter on the parent atomic_fetch_sub_explicit((__global atomic_uint*)&parent->child_counter, 1, memory_order_relaxed, memory_scope_device); atomic_store_explicit((__global atomic_uint*)&disp->state, AQL_WRAP_FREE, memory_order_relaxed, memory_scope_device); release_slot(amask, idx); } } } } ulong threads_done = atomic_fetch_add_explicit((__global atomic_ulong*)¶m->thread_counter, (ulong)1, memory_order_relaxed, memory_scope_device); if (threads_done >= (get_global_size(0) - 1)) { // The last thread finishes the processing __global AmdAqlWrap* hostParent = param->parentAQL; bool complete = atomic_load_explicit((__global atomic_uint*)&hostParent->child_counter, memory_order_relaxed, memory_scope_device) == 0; if (complete) { __ockl_hsa_signal_store(param->complete_signal, 0, __ockl_memory_order_relaxed); } else { param->thread_counter = 0; EnqueueScheduler(param); } } } 
ROCm-Device-Libs-rocm-5.0.0/opencl/src/geometric/000077500000000000000000000000001415221260100213555ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/geometric/cross.cl000066400000000000000000000015651415221260100230350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define ATTR __attribute__((overloadable, const)) #define GEN(T) \ ATTR T##3 \ cross(T##3 p0, T##3 p1) \ { \ return (T##3)(mad(p0.y, p1.z, -p0.z*p1.y), \ mad(p0.z, p1.x, -p0.x*p1.z), \ mad(p0.x, p1.y, -p0.y*p1.x)); \ } \ \ ATTR T##4 \ cross(T##4 p0, T##4 p1) \ { \ return (T##4)(mad(p0.y, p1.z, -p0.z*p1.y), \ mad(p0.z, p1.x, -p0.x*p1.z), \ mad(p0.x, p1.y, -p0.y*p1.x), \ (T)0); \ } GEN(float) GEN(double) GEN(half) ROCm-Device-Libs-rocm-5.0.0/opencl/src/geometric/distance.cl000066400000000000000000000011631415221260100234700ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define ATTR __attribute__((overloadable, const)) #define GENN(N,T) \ ATTR T \ distance(T##N p0, T##N p1) \ { \ return length(p0 - p1); \ } #define GEN(T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GENN(,T) GEN(float) GEN(double) GEN(half) ROCm-Device-Libs-rocm-5.0.0/opencl/src/geometric/dot.cl000066400000000000000000000014671415221260100224730ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define ATTR __attribute__((overloadable, const)) #define GEN(T) \ ATTR T \ dot(T p0, T p1) \ { \ return p0 * p1; \ } \ ATTR T \ dot(T##2 p0, T##2 p1) \ { \ return mad(p0.y, p1.y, p0.x*p1.x); \ } \ ATTR T \ dot(T##3 p0, T##3 p1) \ { \ return mad(p0.z, p1.z, mad(p0.y, p1.y, p0.x*p1.x)); \ } \ ATTR T \ dot(T##4 p0, T##4 p1) \ { \ return mad(p0.w, p1.w, mad(p0.z, p1.z, mad(p0.y, p1.y, p0.x*p1.x))); \ } GEN(float) GEN(double) GEN(half) ROCm-Device-Libs-rocm-5.0.0/opencl/src/geometric/fast_distance.cl000066400000000000000000000010701415221260100245020ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) #define GENN(N,T) \ ATTR T \ fast_distance(T##N p0, T##N p1) \ { \ return fast_length(p0 - p1); \ } #define GEN(T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GENN(,T) GEN(float) ROCm-Device-Libs-rocm-5.0.0/opencl/src/geometric/fast_length.cl000066400000000000000000000012021415221260100241660ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) ATTR float fast_length(float p) { return fabs(p); } ATTR float fast_length(float2 p) { return half_sqrt(dot(p, p)); } ATTR float fast_length(float3 p) { return half_sqrt(dot(p, p)); } ATTR float fast_length(float4 p) { return half_sqrt(dot(p, p)); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/geometric/fast_normalize.cl000066400000000000000000000011571415221260100247160ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) #define GEN(N) \ ATTR float##N \ fast_normalize(float##N p) \ { \ float l2 = dot(p, p); \ float##N n = p * half_rsqrt(l2); \ return l2 == 0.0f ? 
p : n; \ } GEN(4) GEN(3) GEN(2) ATTR float fast_normalize(float p) { return sign(p); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/geometric/length.cl000066400000000000000000000065711415221260100231670ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define ATTR __attribute__((overloadable, const)) ATTR float length(float p) { return fabs(p); } ATTR float length(float2 p) { float l2 = dot(p, p); float r; if (l2 < FLT_MIN) { p *= 0x1.0p+86f; r = sqrt(dot(p, p)) * 0x1.0p-86f; } else if (l2 == INFINITY) { p *= 0x1.0p-65f; r = sqrt(dot(p, p)) * 0x1.0p+65f; } else r = sqrt(l2); return r; } ATTR float length(float3 p) { float l2 = dot(p, p); float r; if (l2 < FLT_MIN) { p *= 0x1.0p+86f; r = sqrt(dot(p, p)) * 0x1.0p-86f; } else if (l2 == INFINITY) { p *= 0x1.0p-66f; r = sqrt(dot(p, p)) * 0x1.0p+66f; } else r = sqrt(l2); return r; } ATTR float length(float4 p) { float l2 = dot(p, p); float r; if (l2 < FLT_MIN) { p *= 0x1.0p+86f; r = sqrt(dot(p, p)) * 0x1.0p-86f; } else if (l2 == INFINITY) { p *= 0x1.0p-66f; r = sqrt(dot(p, p)) * 0x1.0p+66f; } else r = sqrt(l2); return r; } ATTR double length(double p) { return fabs(p); } ATTR double length(double2 p) { double l2 = dot(p, p); double r; if (l2 < DBL_MIN) { p *= 0x1.0p+563; r = sqrt(dot(p, p)) * 0x1.0p-563; } else if (l2 == INFINITY) { p *= 0x1.0p-513; r = sqrt(dot(p, p)) * 0x1.0p+513; } else r = sqrt(l2); return r; } ATTR double length(double3 p) { double l2 = dot(p, p); double r; if (l2 < DBL_MIN) { p *= 0x1.0p+563; r = sqrt(dot(p, p)) * 0x1.0p-563; } else if (l2 == INFINITY) { p *= 0x1.0p-514; r = sqrt(dot(p, p)) * 0x1.0p+514; } else r = sqrt(l2); return r; } ATTR double length(double4 p) { double l2 = dot(p, 
p); double r; if (l2 < DBL_MIN) { p *= 0x1.0p+563; r = sqrt(dot(p, p)) * 0x1.0p-563; } else if (l2 == INFINITY) { p *= 0x1.0p-514; r = sqrt(dot(p, p)) * 0x1.0p+514; } else r = sqrt(l2); return r; } ATTR half length(half p) { return fabs(p); } ATTR half length(half2 p) { half l2 = dot(p, p); half r; if (l2 < HALF_MIN) { p = p * 0x1.0p+10h * 0x1.0p+7h; r = sqrt(dot(p, p)) * 0x1.0p-17h; } else if (l2 == (half)INFINITY) { p *= 0x1.0p-9h; r = sqrt(dot(p, p)) * 0x1.0p+9h; } else r = sqrt(l2); return r; } ATTR half length(half3 p) { half l2 = dot(p, p); half r; if (l2 < HALF_MIN) { p = p * 0x1.0p+10h * 0x1.0p+7h; r = sqrt(dot(p, p)) * 0x1.0p-17h; } else if (l2 == (half)INFINITY) { p *= 0x1.0p-10h; r = sqrt(dot(p, p)) * 0x1.0p+10h; } else r = sqrt(l2); return r; } ATTR half length(half4 p) { half l2 = dot(p, p); half r; if (l2 < HALF_MIN) { p = p * 0x1.0p+10h * 0x1.0p+7h; r = sqrt(dot(p, p)) * 0x1.0p-17h; } else if (l2 == (half)INFINITY) { p *= 0x1.0p-10h; r = sqrt(dot(p, p)) * 0x1.0p+10h; } else r = sqrt(l2); return r; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/geometric/normalize.cl000066400000000000000000000107511415221260100237010ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define ATTR __attribute__((overloadable, const)) ATTR float normalize(float p) { return sign(p); } ATTR float2 normalize(float2 p) { if (all(p == (float2)0.0F)) return p; float l2 = dot(p, p); if (l2 < FLT_MIN) { p *= 0x1.0p+86F; l2 = dot(p, p); } else if (l2 == INFINITY) { p *= 0x1.0p-65f; l2 = dot(p, p); if (l2 == INFINITY) { p = copysign(select((float2)0.0F, (float2)1.0F, isinf(p)), p); l2 = dot(p, p); } } return p * rsqrt(l2); } ATTR float3 normalize(float3 p) { if (all(p == (float3)0.0F)) return p; float l2 = dot(p, p); if (l2 < FLT_MIN) { p *= 0x1.0p+86F; l2 = dot(p, p); } else if (l2 == INFINITY) { p *= 0x1.0p-66f; l2 = dot(p, p); if (l2 == INFINITY) { p = copysign(select((float3)0.0F, (float3)1.0F, isinf(p)), p); l2 = dot(p, p); } } return p * rsqrt(l2); } ATTR float4 normalize(float4 p) { if (all(p == (float4)0.0F)) return p; float l2 = dot(p, p); if (l2 < FLT_MIN) { p *= 0x1.0p+86F; l2 = dot(p, p); } else if (l2 == INFINITY) { p *= 0x1.0p-66f; l2 = dot(p, p); if (l2 == INFINITY) { p = copysign(select((float4)0.0F, (float4)1.0F, isinf(p)), p); l2 = dot(p, p); } } return p * rsqrt(l2); } ATTR double normalize(double p) { return sign(p); } ATTR double2 normalize(double2 p) { if (all(p == (double2)0.0)) return p; double l2 = dot(p, p); if (l2 < DBL_MIN) { p *= 0x1.0p+563; l2 = dot(p, p); } else if (l2 == INFINITY) { p *= 0x1.0p-513; l2 = dot(p, p); if (l2 == INFINITY) { p = copysign(select((double2)0.0, (double2)1.0, isinf(p)), p); l2 = dot(p, p); } } return p * rsqrt(l2); } ATTR double3 normalize(double3 p) { if (all(p == (double3)0.0)) return p; double l2 = dot(p, p); if (l2 < DBL_MIN) { p *= 0x1.0p+563; l2 = dot(p, p); } else if (l2 == INFINITY) { p *= 0x1.0p-514; l2 = dot(p, p); if (l2 == INFINITY) { p = copysign(select((double3)0.0, (double3)1.0, isinf(p)), p); l2 = dot(p, p); } } return p * rsqrt(l2); } ATTR double4 
normalize(double4 p) { if (all(p == (double4)0.0)) return p; double l2 = dot(p, p); if (l2 < DBL_MIN) { p *= 0x1.0p+563; l2 = dot(p, p); } else if (l2 == INFINITY) { p *= 0x1.0p-514; l2 = dot(p, p); if (l2 == INFINITY) { p = copysign(select((double4)0.0, (double4)1.0, isinf(p)), p); l2 = dot(p, p); } } return p * rsqrt(l2); } ATTR half normalize(half p) { return sign(p); } ATTR half2 normalize(half2 p) { if (all(p == (half2)0.0)) return p; half l2 = dot(p, p); if (l2 < HALF_MIN) { p = p * 0x1.0p+10h * 0x1.0p+7h; l2 = dot(p, p); } else if (l2 == (half)INFINITY) { p *= 0x1.0p-9h; l2 = dot(p, p); if (l2 == (half)INFINITY) { p = copysign(select((half2)0.0, (half2)1.0, isinf(p)), p); l2 = dot(p, p); } } return p * rsqrt(l2); } ATTR half3 normalize(half3 p) { if (all(p == (half3)0.0)) return p; half l2 = dot(p, p); if (l2 < HALF_MIN) { p = p * 0x1.0p+10h * 0x1.0p+7h; l2 = dot(p, p); } else if (l2 == (half)INFINITY) { p *= 0x1.0p-10h; l2 = dot(p, p); if (l2 == (half)INFINITY) { p = copysign(select((half3)0.0, (half3)1.0, isinf(p)), p); l2 = dot(p, p); } } return p * rsqrt(l2); } ATTR half4 normalize(half4 p) { if (all(p == (half4)0.0)) return p; half l2 = dot(p, p); if (l2 < HALF_MIN) { p = p * 0x1.0p+10h * 0x1.0p+7h; l2 = dot(p, p); } else if (l2 == (half)INFINITY) { p *= 0x1.0p-10h; l2 = dot(p, p); if (l2 == INFINITY) { p = copysign(select((half4)0.0, (half4)1.0, isinf(p)), p); l2 = dot(p, p); } } return p * rsqrt(l2); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/image/000077500000000000000000000000001415221260100204615ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/image/imwrap.cl000066400000000000000000000421461415221260100223070ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "irif.h" #include "ockl.h" #include "oclc.h" #pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_mipmap_image : enable static __constant int channel_order_map[32] = { CLK_A, CLK_R, CLK_Rx, CLK_RG, CLK_RGx, CLK_RA, CLK_RGB, CLK_RGBx, CLK_RGBA, CLK_BGRA, CLK_ARGB, 666, // XXX CLK_ABGR, CLK_sRGB, CLK_sRGBx, CLK_sRGBA, CLK_sBGRA, CLK_INTENSITY, CLK_LUMINANCE, CLK_DEPTH, CLK_DEPTH_STENCIL }; static __constant int channel_data_type_map[32] = { CLK_SNORM_INT8, CLK_SNORM_INT16, CLK_UNORM_INT8, CLK_UNORM_INT16, CLK_UNORM_INT24, CLK_UNORM_SHORT_555, CLK_UNORM_SHORT_565, CLK_UNORM_INT_101010, CLK_SIGNED_INT8, CLK_SIGNED_INT16, CLK_SIGNED_INT32, CLK_UNSIGNED_INT8, CLK_UNSIGNED_INT16, CLK_UNSIGNED_INT32, CLK_HALF_FLOAT, CLK_FLOAT }; #define LOWER_sampler(S) __builtin_astype(S, SSHARP) #define LOWER_ro_1D(I) __builtin_astype(I, TSHARP) #define LOWER_ro_1Da(I) __builtin_astype(I, TSHARP) #define LOWER_ro_1Db(I) __builtin_astype(I, TSHARP) #define LOWER_ro_2D(I) __builtin_astype(I, TSHARP) #define LOWER_ro_2Da(I) __builtin_astype(I, TSHARP) #define LOWER_ro_2Dd(I) __builtin_astype(I, TSHARP) #define LOWER_ro_2Dad(I) __builtin_astype(I, TSHARP) #define LOWER_ro_3D(I) __builtin_astype(I, TSHARP) #define LOWER_wo_1D(I) __builtin_astype(I, TSHARP) #define LOWER_wo_1Da(I) __builtin_astype(I, TSHARP) #define LOWER_wo_1Db(I) __builtin_astype(I, TSHARP) #define LOWER_wo_2D(I) __builtin_astype(I, TSHARP) #define LOWER_wo_2Da(I) __builtin_astype(I, TSHARP) #define LOWER_wo_2Dd(I) __builtin_astype(I, TSHARP) #define LOWER_wo_2Dad(I) __builtin_astype(I, TSHARP) #define LOWER_wo_3D(I) __builtin_astype(I, TSHARP) #define LOWER_rw_1D(I) __builtin_astype(I, TSHARP) #define LOWER_rw_1Da(I) __builtin_astype(I, TSHARP) #define LOWER_rw_1Db(I) __builtin_astype(I, TSHARP) #define LOWER_rw_2D(I) __builtin_astype(I, TSHARP) #define LOWER_rw_2Da(I) __builtin_astype(I, TSHARP) #define 
LOWER_rw_2Dd(I) __builtin_astype(I, TSHARP) #define LOWER_rw_2Dad(I) __builtin_astype(I, TSHARP) #define LOWER_rw_3D(I) __builtin_astype(I, TSHARP) #define _C(X,Y) X ## Y #define C(X,Y) _C(X,Y) #define PFX __ockl_image_ #define i32_fsuf i #define u32_fsuf ui #define f32_fsuf f #define f16_fsuf h #define i32_ksuf #define u32_ksuf #define f32_ksuf #define f16_ksuf h #define i32_rcast as_int4 #define u32_rcast as_uint4 #define f32_rcast #define f16_rcast #define _1D_ity image1d_t #define _1Da_ity image1d_array_t #define _1Db_ity image1d_buffer_t #define _2D_ity image2d_t #define _2Da_ity image2d_array_t #define _2Dd_ity image2d_depth_t #define _2Dad_ity image2d_array_depth_t #define _3D_ity image3d_t #define _1D_f32_pty float4 #define _1D_f16_pty half4 #define _1D_i32_pty int4 #define _1D_u32_pty uint4 #define _1Da_f32_pty float4 #define _1Da_f16_pty half4 #define _1Da_i32_pty int4 #define _1Da_u32_pty uint4 #define _1Db_f32_pty float4 #define _1Db_f16_pty half4 #define _1Db_i32_pty int4 #define _1Db_u32_pty uint4 #define _2D_f32_pty float4 #define _2D_f16_pty half4 #define _2D_i32_pty int4 #define _2D_u32_pty uint4 #define _2Da_f32_pty float4 #define _2Da_f16_pty half4 #define _2Da_i32_pty int4 #define _2Da_u32_pty uint4 #define _2Dd_f32_pty float #define _2Dad_f32_pty float #define _3D_f32_pty float4 #define _3D_f16_pty half4 #define _3D_i32_pty int4 #define _3D_u32_pty uint4 #define _1D_f32_parg p #define _1D_f16_parg p #define _1D_i32_parg as_float4(p) #define _1D_u32_parg as_float4(p) #define _1Da_f32_parg p #define _1Da_f16_parg p #define _1Da_i32_parg as_float4(p) #define _1Da_u32_parg as_float4(p) #define _1Db_f32_parg p #define _1Db_f16_parg p #define _1Db_i32_parg as_float4(p) #define _1Db_u32_parg as_float4(p) #define _2D_f32_parg p #define _2D_f16_parg p #define _2D_i32_parg as_float4(p) #define _2D_u32_parg as_float4(p) #define _2Da_f32_parg p #define _2Da_f16_parg p #define _2Da_i32_parg as_float4(p) #define _2Da_u32_parg as_float4(p) #define 
_2Dd_f32_parg p #define _2Dad_f32_parg p #define _3D_f32_parg p #define _3D_f16_parg p #define _3D_i32_parg as_float4(p) #define _3D_u32_parg as_float4(p) #define _1D_i32_cty int #define _1D_f32_cty float #define _1Da_i32_cty int2 #define _1Da_f32_cty float2 #define _1Db_i32_cty int #define _2D_i32_cty int2 #define _2D_f32_cty float2 #define _2Da_i32_cty int4 #define _2Da_f32_cty float4 #define _2Dd_i32_cty int2 #define _2Dd_f32_cty float2 #define _2Dad_i32_cty int4 #define _2Dad_f32_cty float4 #define _3D_i32_cty int4 #define _3D_f32_cty float4 #define _1D_i32_carg convert_float(c) #define _1D_f32_carg c #define _1Da_i32_carg convert_float2(c) #define _1Da_f32_carg c #define _1Db_i32_carg c #define _2D_i32_carg convert_float2(c) #define _2D_f32_carg c #define _2Da_i32_carg convert_float4(c) #define _2Da_f32_carg c #define _2Dd_i32_carg convert_float2(c) #define _2Dd_f32_carg c #define _2Dad_i32_carg convert_float4(c) #define _2Dad_f32_carg c #define _3D_i32_carg convert_float4(c) #define _3D_f32_carg c #define _1D_gpars float dx, float dy #define _1Da_gpars float dx, float dy #define _2D_gpars float2 dx, float2 dy #define _2Da_gpars float2 dx, float2 dy #define _2Dd_gpars float2 dx, float2 dy #define _2Dad_gpars float2 dx, float2 dy #define _3D_gpars float4 dx, float4 dy #define RATTR __attribute__((overloadable, pure)) #define WATTR __attribute__((overloadable)) #define GATTR __attribute__((overloadable, const)) #define FATTR __attribute__((pure)) #define SGEN(IT,PT,CT) \ RATTR IT##_##PT##_pty \ C(read_image,PT##_fsuf)(read_only IT##_ity i, sampler_t s, IT##_##CT##_cty c) \ { \ return PT##_rcast(C(PFX,C(sample,C(PT##_ksuf,IT)))(LOWER_ro##IT(i), LOWER_sampler(s), IT##_##CT##_carg)); \ } #define SGENL(IT,PT,CT) \ RATTR IT##_##PT##_pty \ C(read_image,PT##_fsuf)(read_only IT##_ity i, sampler_t s, IT##_##CT##_cty c, float l) \ { \ return PT##_rcast(C(PFX,C(sample,C(PT##_ksuf,C(_lod,IT))))(LOWER_ro##IT(i), LOWER_sampler(s), IT##_##CT##_carg, l)); \ } #define 
SGENG(IT,PT,CT) \ RATTR IT##_##PT##_pty \ C(read_image,PT##_fsuf)(read_only IT##_ity i, sampler_t s, IT##_##CT##_cty c, IT##_gpars) \ { \ return PT##_rcast(C(PFX,C(sample,C(PT##_ksuf,C(_grad,IT))))(LOWER_ro##IT(i), LOWER_sampler(s), IT##_##CT##_carg, dx, dy)); \ } #define SGENX(IT,PT,CT) \ SGEN(IT,PT,CT) \ SGENL(IT,PT,CT) \ SGENG(IT,PT,CT) #define RGEN(IT,PT,CT) \ RATTR IT##_##PT##_pty \ C(read_image,PT##_fsuf)(read_only IT##_ity i, IT##_##CT##_cty c) \ { \ return PT##_rcast(C(PFX,C(load,C(PT##_ksuf,IT)))(LOWER_ro##IT(i), c)); \ } \ \ RATTR IT##_##PT##_pty \ C(read_image,PT##_fsuf)(read_write IT##_ity i, IT##_##CT##_cty c) \ { \ return PT##_rcast(C(PFX,C(load,C(PT##_ksuf,IT)))(LOWER_rw##IT(i), c)); \ } #define WGEN(IT,PT,CT) \ WATTR void \ C(write_image,PT##_fsuf)(write_only IT##_ity i, IT##_##CT##_cty c, IT##_##PT##_pty p) \ { \ C(PFX,C(store,C(PT##_ksuf,IT)))(LOWER_wo##IT(i), c, IT##_##PT##_parg); \ } \ \ WATTR void \ C(write_image,PT##_fsuf)(read_write IT##_ity i, IT##_##CT##_cty c, IT##_##PT##_pty p) \ { \ C(PFX,C(store,C(PT##_ksuf,IT)))(LOWER_rw##IT(i), c, IT##_##PT##_parg); \ } #define WGENL(IT,PT,CT) \ WATTR void \ C(write_image,PT##_fsuf)(write_only IT##_ity i, IT##_##CT##_cty c, int l, IT##_##PT##_pty p) \ { \ C(PFX,C(store,C(PT##_ksuf,C(_lod,IT))))(LOWER_wo##IT(i), c, l, IT##_##PT##_parg); \ } \ \ WATTR void \ C(write_image,PT##_fsuf)(read_write IT##_ity i, IT##_##CT##_cty c, int l, IT##_##PT##_pty p) \ { \ C(PFX,C(store,C(PT##_ksuf,C(_lod,IT))))(LOWER_rw##IT(i), c, l, IT##_##PT##_parg); \ } #define WGENX(IT,PT,CT) \ WGEN(IT,PT,CT) \ WGENL(IT,PT,CT) SGEN(_2D,f32,i32) SGENX(_2D,f32,f32) SGEN(_2D,f16,i32) SGENX(_2D,f16,f32) SGEN(_2D,i32,i32) SGENX(_2D,i32,f32) SGEN(_2D,u32,i32) SGENX(_2D,u32,f32) SGEN(_3D,f32,i32) SGENX(_3D,f32,f32) SGEN(_3D,f16,i32) SGENX(_3D,f16,f32) SGEN(_3D,i32,i32) SGENX(_3D,i32,f32) SGEN(_3D,u32,i32) SGENX(_3D,u32,f32) SGEN(_2Da,f32,i32) SGENX(_2Da,f32,f32) SGEN(_2Da,f16,i32) SGENX(_2Da,f16,f32) SGEN(_2Da,i32,i32) SGENX(_2Da,i32,f32) 
SGEN(_2Da,u32,i32) SGENX(_2Da,u32,f32) SGEN(_1D,f32,i32) SGENX(_1D,f32,f32) SGEN(_1D,f16,i32) SGENX(_1D,f16,f32) SGEN(_1D,i32,i32) SGENX(_1D,i32,f32) SGEN(_1D,u32,i32) SGENX(_1D,u32,f32) SGEN(_1Da,f32,i32) SGENX(_1Da,f32,f32) SGEN(_1Da,f16,i32) SGENX(_1Da,f16,f32) SGEN(_1Da,i32,i32) SGENX(_1Da,i32,f32) SGEN(_1Da,u32,i32) SGENX(_1Da,u32,f32) SGEN(_2Dd,f32,i32) SGENX(_2Dd,f32,f32) SGEN(_2Dad,f32,i32) SGENX(_2Dad,f32,f32) RGEN(_2D,f32,i32) RGEN(_2D,f16,i32) RGEN(_2D,i32,i32) RGEN(_2D,u32,i32) RGEN(_3D,f32,i32) RGEN(_3D,f16,i32) RGEN(_3D,i32,i32) RGEN(_3D,u32,i32) RGEN(_2Da,f32,i32) RGEN(_2Da,f16,i32) RGEN(_2Da,i32,i32) RGEN(_2Da,u32,i32) RGEN(_1D,f32,i32) RGEN(_1D,f16,i32) RGEN(_1D,i32,i32) RGEN(_1D,u32,i32) RGEN(_1Db,f32,i32) RGEN(_1Db,f16,i32) RGEN(_1Db,i32,i32) RGEN(_1Db,u32,i32) RGEN(_1Da,f32,i32) RGEN(_1Da,f16,i32) RGEN(_1Da,i32,i32) RGEN(_1Da,u32,i32) RGEN(_2Dd,f32,i32) RGEN(_2Dad,f32,i32) WGENX(_2D,f32,i32) WGENX(_2D,f16,i32) WGENX(_2D,i32,i32) WGENX(_2D,u32,i32) WGENX(_2Da,f32,i32) WGENX(_2Da,f16,i32) WGENX(_2Da,i32,i32) WGENX(_2Da,u32,i32) WGENX(_1D,f32,i32) WGENX(_1D,f16,i32) WGENX(_1D,i32,i32) WGENX(_1D,u32,i32) WGEN(_1Db,f32,i32) WGEN(_1Db,f16,i32) WGEN(_1Db,i32,i32) WGEN(_1Db,u32,i32) WGENX(_1Da,f32,i32) WGENX(_1Da,f16,i32) WGENX(_1Da,i32,i32) WGENX(_1Da,u32,i32) WGENX(_2Dd,f32,i32) WGENX(_2Dad,f32,i32) WGENX(_3D,f32,i32) WGENX(_3D,f16,i32) WGENX(_3D,i32,i32) WGENX(_3D,u32,i32) #define ro_qual read_only #define wo_qual write_only #define rw_qual read_write #define GD3GEN(Q) \ GATTR int4 \ get_image_dim(Q##_qual image3d_t i) \ { \ return (int4)(get_image_width(i), get_image_height(i), get_image_depth(i), 0); \ } GD3GEN(ro) GD3GEN(wo) GD3GEN(rw) #define GD2GENQ(Q,T) \ GATTR int2 \ get_image_dim(Q##_qual T##_ity i) \ { \ return (int2)(get_image_width(i), get_image_height(i)); \ } #define GD2GEN(T) \ GD2GENQ(ro,T) \ GD2GENQ(wo,T) \ GD2GENQ(rw,T) GD2GEN(_2D) GD2GEN(_2Da) GD2GEN(_2Dd) GD2GEN(_2Dad) #define GGENQT(Q,N,T) \ GATTR int \ get_image_##N(Q##_qual 
T##_ity i) { \ return C(PFX,C(N,T))(LOWER_##Q##T(i)); \ } #define GGENT(N,T) \ GGENQT(ro,N,T) \ GGENQT(wo,N,T) \ GGENQT(rw,N,T) #define GGENX(N) \ GGENT(N,_1D) \ GGENT(N,_1Da) \ GGENT(N,_2D) \ GGENT(N,_2Da) \ GGENT(N,_2Dd) \ GGENT(N,_2Dad) \ GGENT(N,_3D) #define GGEN(N) \ GGENX(N) \ GGENT(N,_1Db) \ GGEN(width) GGENX(num_mip_levels) // int get depth _3D #define GNZGEN(Q) \ GATTR int \ get_image_depth(Q##_qual image3d_t i) \ { \ return C(PFX,depth_3D)(LOWER_##Q##_3D(i)); \ } GNZGEN(ro) GNZGEN(wo) GNZGEN(rw) // size_t get image_array_size _1Da, _2Da, _2Dad #define GASGENQ(Q,T) \ GATTR size_t \ get_image_array_size(Q##_qual T##_ity i) \ { \ return C(PFX,C(array_size,T))(LOWER_##Q##T(i)); \ } #define GASGEN(T) \ GASGENQ(ro,T) \ GASGENQ(wo,T) \ GASGENQ(rw,T) GASGEN(_1Da) GASGEN(_2Da) GASGEN(_2Dad) #define GCOGENQ(Q,T) \ GATTR int \ get_image_channel_order(Q##_qual T##_ity i) { \ return channel_order_map[C(PFX,C(channel_order,T))(LOWER_##Q##T(i))]; \ } #define GCOGEN(T) \ GCOGENQ(ro,T) \ GCOGENQ(wo,T) \ GCOGENQ(rw,T) GCOGEN(_1D) GCOGEN(_1Da) GCOGEN(_1Db) GCOGEN(_2D) GCOGEN(_2Da) GCOGEN(_2Dd) GCOGEN(_2Dad) GCOGEN(_3D) #define GDTGENQ(Q,T) \ GATTR int \ get_image_channel_data_type(Q##_qual T##_ity i) { \ return channel_data_type_map[C(PFX,C(channel_data_type,T))(LOWER_##Q##T(i))]; \ } #define GDTGEN(T) \ GDTGENQ(ro,T) \ GDTGENQ(wo,T) \ GDTGENQ(rw,T) GDTGEN(_1D) GDTGEN(_1Da) GDTGEN(_1Db) GDTGEN(_2D) GDTGEN(_2Da) GDTGEN(_2Dd) GDTGEN(_2Dad) GDTGEN(_3D) #define GNYGENQ(Q,T) \ GATTR int \ get_image_height(Q##_qual T##_ity i) { \ return C(PFX,C(height,T))(LOWER_##Q##T(i)); \ } #define GNYGEN(T) \ GNYGENQ(ro,T) \ GNYGENQ(wo,T) \ GNYGENQ(rw,T) GNYGEN(_2D) GNYGEN(_2Da) GNYGEN(_2Dd) GNYGEN(_2Dad) GNYGEN(_3D) FATTR float4 amd_fetch4_ff(read_only image2d_t im, float2 coord, int comp) { sampler_t s = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_NONE; switch (comp) { case 1: return __ockl_image_gather4g_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord); case 2: return 
__ockl_image_gather4b_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord); case 3: return __ockl_image_gather4a_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord); default: return __ockl_image_gather4r_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord); } } FATTR float4 amd_fetch4_fsf(read_only image2d_t im, sampler_t s, float2 coord, int comp) { switch (comp) { case 1: return __ockl_image_gather4g_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord); case 2: return __ockl_image_gather4b_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord); case 3: return __ockl_image_gather4a_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord); default: return __ockl_image_gather4r_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord); } } FATTR float4 amd_fetch4_fi(read_only image2d_t im, int2 coord, int comp) { sampler_t s = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_NONE; float2 fcoord = convert_float2(coord); switch (comp) { case 1: return __ockl_image_gather4g_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord); case 2: return __ockl_image_gather4b_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord); case 3: return __ockl_image_gather4a_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord); default: return __ockl_image_gather4r_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord); } } FATTR float4 amd_fetch4_fsi(read_only image2d_t im, sampler_t s, int2 coord, int comp) { float2 fcoord = convert_float2(coord); switch (comp) { case 1: return __ockl_image_gather4g_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord); case 2: return __ockl_image_gather4b_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord); case 3: return __ockl_image_gather4a_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord); default: return __ockl_image_gather4r_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord); } } FATTR int4 amd_fetch4_if(read_only image2d_t im, float2 coord, int comp) { sampler_t s = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_NONE; if (__oclc_ISA_version < 9000) { coord -= 0.5f; } switch (comp) { case 1: return 
as_int4(__ockl_image_gather4g_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord)); case 2: return as_int4(__ockl_image_gather4b_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord)); case 3: return as_int4(__ockl_image_gather4a_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord)); default: return as_int4(__ockl_image_gather4r_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord)); } } FATTR int4 amd_fetch4_isf(read_only image2d_t im, sampler_t s, float2 coord, int comp) { if (__oclc_ISA_version < 9000) { coord -= 0.5f; } switch (comp) { case 1: return as_int4(__ockl_image_gather4g_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord)); case 2: return as_int4(__ockl_image_gather4b_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord)); case 3: return as_int4(__ockl_image_gather4a_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord)); default: return as_int4(__ockl_image_gather4r_2D(LOWER_ro_2D(im), LOWER_sampler(s), coord)); } } FATTR int4 amd_fetch4_ii(read_only image2d_t im, int2 coord, int comp) { sampler_t s = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_NONE; float2 fcoord = convert_float2(coord); if (__oclc_ISA_version < 9000) { fcoord -= 0.5f; } switch (comp) { case 1: return as_int4(__ockl_image_gather4g_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord)); case 2: return as_int4(__ockl_image_gather4b_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord)); case 3: return as_int4(__ockl_image_gather4a_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord)); default: return as_int4(__ockl_image_gather4r_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord)); } } FATTR int4 amd_fetch4_isi(read_only image2d_t im, sampler_t s, int2 coord, int comp) { float2 fcoord = convert_float2(coord); if (__oclc_ISA_version < 9000) { fcoord -= 0.5f; } switch (comp) { case 1: return as_int4(__ockl_image_gather4g_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord)); case 2: return as_int4(__ockl_image_gather4b_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord)); case 3: return as_int4(__ockl_image_gather4a_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord)); 
default: return as_int4(__ockl_image_gather4r_2D(LOWER_ro_2D(im), LOWER_sampler(s), fcoord)); } } FATTR uint4 amd_fetch4_uf(read_only image2d_t im, float2 coord, int comp) { return as_uint4(amd_fetch4_if(im, coord, comp)); } FATTR uint4 amd_fetch4_usf(read_only image2d_t im, sampler_t s, float2 coord, int comp) { return as_uint4(amd_fetch4_isf(im, s, coord, comp)); } FATTR uint4 amd_fetch4_ui(read_only image2d_t im, int2 coord, int comp) { return as_uint4(amd_fetch4_ii(im, coord, comp)); } FATTR uint4 amd_fetch4_usi(read_only image2d_t im, sampler_t s, int2 coord, int comp) { return as_uint4(amd_fetch4_isi(im, s, coord, comp)); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/image/isamp.cl000066400000000000000000000127611415221260100221210ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "oclc.h" static __constant uint SI_samplers[] = { 0x1000b1b6, 0x00fff000, 0x00000000, 0x00000000, // 0x10 0x100031b6, 0x00fff000, 0x00000000, 0x00000000, // 0x11 0x1000b092, 0x00fff000, 0x00000000, 0x00000000, // 0x12 0x10003092, 0x00fff000, 0x00000000, 0x00000000, // 0x13 0x1000b1b6, 0x00fff000, 0x00000000, 0x00000000, // 0x14 0x100031b6, 0x00fff000, 0x00000000, 0x00000000, // 0x15 0x1000b000, 0x00fff000, 0x00000000, 0x00000000, // 0x16 0x10003000, 0x00fff000, 0x00000000, 0x00000000, // 0x17 0x1000b049, 0x00fff000, 0x00000000, 0x00000000, // 0x18 0x10003049, 0x00fff000, 0x00000000, 0x00000000, // 0x19 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1a 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1b 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1c 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1d 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1e 0x00000000, 0x00000000, 
0x00000000, 0x00000000, // 0x1f 0x1000b1b6, 0x00fff000, 0x00500000, 0x00000000, // 0x20 0x100031b6, 0x00fff000, 0x00500000, 0x00000000, // 0x21 0x1000b092, 0x00fff000, 0x00500000, 0x00000000, // 0x22 0x10003092, 0x00fff000, 0x00500000, 0x00000000, // 0x23 0x1000b1b6, 0x00fff000, 0x00500000, 0x00000000, // 0x24 0x100031b6, 0x00fff000, 0x00500000, 0x00000000, // 0x25 0x1000b000, 0x00fff000, 0x00500000, 0x00000000, // 0x26 0x10003000, 0x00fff000, 0x00500000, 0x00000000, // 0x27 0x1000b049, 0x00fff000, 0x00500000, 0x00000000, // 0x28 0x10003049, 0x00fff000, 0x00500000, 0x00000000, // 0x29 }; static __constant uint GFX9_samplers[] = { 0x1000b1b6, 0x00fff000, 0x80000000, 0x00000000, // 0x10 0x100031b6, 0x00fff000, 0x80000000, 0x00000000, // 0x11 0x1000b092, 0x00fff000, 0x80000000, 0x00000000, // 0x12 0x10003092, 0x00fff000, 0x80000000, 0x00000000, // 0x13 0x1000b1b6, 0x00fff000, 0x80000000, 0x00000000, // 0x14 0x100031b6, 0x00fff000, 0x80000000, 0x00000000, // 0x15 0x1000b000, 0x00fff000, 0x80000000, 0x00000000, // 0x16 0x10003000, 0x00fff000, 0x80000000, 0x00000000, // 0x17 0x1000b049, 0x00fff000, 0x80000000, 0x00000000, // 0x18 0x10003049, 0x00fff000, 0x80000000, 0x00000000, // 0x19 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1a 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1b 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1c 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1d 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1e 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1f 0x1000b1b6, 0x00fff000, 0x80500000, 0x00000000, // 0x20 0x100031b6, 0x00fff000, 0x80500000, 0x00000000, // 0x21 0x1000b092, 0x00fff000, 0x80500000, 0x00000000, // 0x22 0x10003092, 0x00fff000, 0x80500000, 0x00000000, // 0x23 0x1000b1b6, 0x00fff000, 0x80500000, 0x00000000, // 0x24 0x100031b6, 0x00fff000, 0x80500000, 0x00000000, // 0x25 0x1000b000, 0x00fff000, 0x80500000, 0x00000000, // 0x26 0x10003000, 0x00fff000, 0x80500000, 0x00000000, // 0x27 0x1000b049, 
0x00fff000, 0x80500000, 0x00000000, // 0x28 0x10003049, 0x00fff000, 0x80500000, 0x00000000, // 0x29 }; static __constant uint GFX10_samplers[] = { 0x1000b1b6, 0x00fff000, 0x20000000, 0x40000000, // 0x10 0x100031b6, 0x00fff000, 0x20000000, 0x40000000, // 0x11 0x1000b092, 0x00fff000, 0x20000000, 0x40000000, // 0x12 0x10003092, 0x00fff000, 0x20000000, 0x40000000, // 0x13 0x1000b1b6, 0x00fff000, 0x20000000, 0x40000000, // 0x14 0x100031b6, 0x00fff000, 0x20000000, 0x40000000, // 0x15 0x1000b000, 0x00fff000, 0x20000000, 0x40000000, // 0x16 0x10003000, 0x00fff000, 0x20000000, 0x40000000, // 0x17 0x1000b049, 0x00fff000, 0x20000000, 0x40000000, // 0x18 0x10003049, 0x00fff000, 0x20000000, 0x40000000, // 0x19 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1a 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1b 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1c 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1d 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1e 0x00000000, 0x00000000, 0x00000000, 0x00000000, // 0x1f 0x1000b1b6, 0x00fff000, 0x20500000, 0x40000000, // 0x20 0x100031b6, 0x00fff000, 0x20500000, 0x40000000, // 0x21 0x1000b092, 0x00fff000, 0x20500000, 0x40000000, // 0x22 0x10003092, 0x00fff000, 0x20500000, 0x40000000, // 0x23 0x1000b1b6, 0x00fff000, 0x20500000, 0x40000000, // 0x24 0x100031b6, 0x00fff000, 0x20500000, 0x40000000, // 0x25 0x1000b000, 0x00fff000, 0x20500000, 0x40000000, // 0x26 0x10003000, 0x00fff000, 0x20500000, 0x40000000, // 0x27 0x1000b049, 0x00fff000, 0x20500000, 0x40000000, // 0x28 0x10003049, 0x00fff000, 0x20500000, 0x40000000, // 0x29 }; typedef struct { int x, y, z, w; } __sampler_t; __attribute__((const)) __constant __sampler_t * __translate_sampler_initializer(int i) { if (__oclc_ISA_version < 9000) { return (__constant __sampler_t *)&SI_samplers[(i - 16) << 2]; } else if (__oclc_ISA_version < 10000) { return (__constant __sampler_t *)&GFX9_samplers[(i - 16) << 2]; } else { return (__constant __sampler_t 
*)&GFX10_samplers[(i - 16) << 2]; } } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/000077500000000000000000000000001415221260100210345ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/abs.cl000066400000000000000000000022521415221260100221220ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) #define GENN(N,T) \ ATTR u##T##N \ abs(T##N x) \ { \ int##N px = convert_int##N(x); \ int##N nx = -px; \ return convert_u##T##N(max(px,nx)); \ } \ \ ATTR u##T##N \ abs(u##T##N x) \ { \ return x; \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GENN(,T) GEN(char) GEN(short) #define LGENN(N,T) \ ATTR u##T##N \ abs(T##N x) \ { \ return convert_u##T##N(select(-x, x, x > (T)0)); \ } \ \ ATTR u##T##N \ abs(u##T##N x) \ { \ return x; \ } #define LGEN1(T) \ ATTR u##T \ abs(T x) \ { \ T mx = -x; \ return as_u##T(x > (T)0 ? x : mx); \ } \ \ ATTR u##T \ abs(u##T x) \ { \ return x; \ } #define LGEN(T) \ LGENN(16,T) \ LGENN(8,T) \ LGENN(4,T) \ LGENN(3,T) \ LGENN(2,T) \ LGEN1(T) LGEN(int) LGEN(long) ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/abs_diff.cl000066400000000000000000000026571415221260100231230ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) #define GENN(N,T) \ ATTR u##T##N \ abs_diff(T##N x, T##N y) \ { \ int##N xx = convert_int##N(x); \ int##N yy = convert_int##N(y); \ int##N d = max(xx,yy) - min(xx,yy); \ return convert_u##T##N(d); \ } \ \ ATTR u##T##N \ abs_diff(u##T##N x, u##T##N y) \ { \ uint##N xx = convert_uint##N(x); \ uint##N yy = convert_uint##N(y); \ uint##N d = max(xx,yy) - min(xx,yy); \ return convert_u##T##N(d); \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GENN(,T) GEN(char) GEN(short) GEN(int) #define LGEN(N) \ ATTR ulong##N \ abs_diff(long##N x, long##N y) \ { \ return as_ulong##N(select(y - x, x - y, x > y)); \ } \ \ ATTR ulong##N \ abs_diff(ulong##N x, ulong##N y) \ { \ return select(y - x, x - y, x > y); \ } LGEN(16) LGEN(8) LGEN(4) LGEN(3) LGEN(2) ATTR ulong abs_diff(long x, long y) { long xmy = x - y; long ymx = y - x; return x > y ? xmy : ymx; } ATTR ulong abs_diff(ulong x, ulong y) { ulong xmy = x - y; ulong ymx = y - x; return x > y ? xmy : ymx; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/add_sat.cl000066400000000000000000000036001415221260100227520ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "int.h" #define ATTR __attribute__((overloadable, const)) #define char_min CHAR_MIN #define char_max CHAR_MAX #define short_min SHRT_MIN #define short_max SHRT_MAX #define uchar_max UCHAR_MAX #define ushort_max USHRT_MAX #define GENN(T) \ ATTR T \ add_sat(T x, T y) \ { \ T s; \ bool c = __builtin_add_overflow(x, y, &s); \ return c ? (x < 0 ? 
T##_min : T##_max) : s; \ } \ \ ATTR u##T \ add_sat(u##T x, u##T y) \ { \ u##T s; \ bool c = __builtin_add_overflow(x, y, &s); \ return c ? u##T##_max : s; \ } GENN(char) GENN(short) #define BEXPATTR __attribute__((overloadable)) BEXP(char,add_sat) BEXP(uchar,add_sat) BEXP(short,add_sat) BEXP(ushort,add_sat) BEXP(int,add_sat) BEXP(uint,add_sat) BEXP(long,add_sat) BEXP(ulong,add_sat) BEXPATTR int add_sat(int x, int y) { return __ockl_add_sat_i32(x, y); } BEXPATTR uint add_sat(uint x, uint y) { return __ockl_add_sat_u32(x, y); } BEXPATTR long add_sat(long x, long y) { return __ockl_add_sat_i64(x, y); } BEXPATTR ulong add_sat(ulong x, ulong y) { return __ockl_add_sat_u64(x, y); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/clz.cl000066400000000000000000000020501415221260100221410ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "int.h" #define UEXPATTR __attribute__((overloadable, const)) UEXP(char,clz) UEXP(uchar,clz) UEXP(short,clz) UEXP(ushort,clz) UEXP(int,clz) UEXP(uint,clz) UEXP(long,clz) UEXP(ulong,clz) UEXPATTR char clz(char x) { return (char)__ockl_clz_u8((uchar)x); } UEXPATTR uchar clz(uchar x) { return __ockl_clz_u8(x); } UEXPATTR short clz(short x) { return (short)__ockl_clz_u16((ushort)x); } UEXPATTR ushort clz(ushort x) { return __ockl_clz_u16(x); } UEXPATTR int clz(int x) { return (int)__ockl_clz_u32((uint)x); } UEXPATTR uint clz(uint x) { return __ockl_clz_u32(x); } UEXPATTR long clz(long x) { return (long)__ockl_clz_u64((ulong)x); } UEXPATTR ulong clz(ulong x) { return __ockl_clz_u64(x); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/ctz.cl000066400000000000000000000020501415221260100221510ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "int.h" #define UEXPATTR __attribute__((overloadable, const)) UEXP(char,ctz) UEXP(uchar,ctz) UEXP(short,ctz) UEXP(ushort,ctz) UEXP(int,ctz) UEXP(uint,ctz) UEXP(long,ctz) UEXP(ulong,ctz) UEXPATTR char ctz(char x) { return (char)__ockl_ctz_u8((uchar)x); } UEXPATTR uchar ctz(uchar x) { return __ockl_ctz_u8(x); } UEXPATTR short ctz(short x) { return (short)__ockl_ctz_u16((ushort)x); } UEXPATTR ushort ctz(ushort x) { return __ockl_ctz_u16(x); } UEXPATTR int ctz(int x) { return (int)__ockl_ctz_u32((uint)x); } UEXPATTR uint ctz(uint x) { return __ockl_ctz_u32(x); } UEXPATTR long ctz(long x) { return (long)__ockl_ctz_u64((ulong)x); } UEXPATTR ulong ctz(ulong x) { return __ockl_ctz_u64(x); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/hadd.cl000066400000000000000000000020601415221260100222520ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) #define GENN(N,T) \ ATTR T##N \ hadd(T##N x, T##N y) \ { \ return convert_##T##N((convert_int##N(x) + convert_int##N(y)) >> 1); \ } \ \ ATTR u##T##N \ hadd(u##T##N x, u##T##N y) \ { \ return convert_u##T##N((convert_uint##N(x) + convert_uint##N(y)) >> 1); \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GENN(,T) GEN(char) GEN(short) #define LGENN(N,T) \ ATTR T##N \ hadd(T##N x, T##N y) \ { \ T##N c = (x & (T)1) & y; \ return (x >> 1) + (y >> 1) + c; \ } #define LGEN(T) \ LGENN(16,T) \ LGENN(8,T) \ LGENN(4,T) \ LGENN(3,T) \ LGENN(2,T) \ LGENN(,T) LGEN(int) LGEN(uint) LGEN(long) LGEN(ulong) ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/iclamp.cl000066400000000000000000000015251415221260100226240ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) #define GENN(N,T) \ ATTR T##N \ clamp(T##N x, T lo, T hi) \ { \ return min(max(x, lo), hi); \ } \ \ ATTR T##N \ clamp(T##N x, T##N lo, T##N hi) \ { \ return min(max(x, lo), hi); \ } #define GEN1(T) \ ATTR T \ clamp(T x, T lo, T hi) \ { \ return min(max(x, lo), hi); \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GEN1(T) GEN(char) GEN(uchar) GEN(short) GEN(ushort) GEN(int) GEN(uint) GEN(long) GEN(ulong) ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/int.h000066400000000000000000000055241415221260100220050ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "ockl.h" #define ULIST2(F) F(x.s0), F(x.s1) #define ULIST3(F) F(x.s0), F(x.s1), F(x.s2) #define ULIST4(F) ULIST2(F), F(x.s2), F(x.s3) #define ULIST8(F) ULIST4(F), F(x.s4), F(x.s5), F(x.s6), F(x.s7) #define ULIST16(F) ULIST8(F), F(x.s8), F(x.s9), F(x.sa), F(x.sb), F(x.sc), F(x.sd), F(x.se), F(x.sf) #define UEXPN(N,T,F) \ UEXPATTR T##N \ F(T##N x) \ { \ return (T##N) ( ULIST##N(F) ); \ } #define UEXP(T,F) \ UEXPN(16,T,F) \ UEXPN(8,T,F) \ UEXPN(4,T,F) \ UEXPN(3,T,F) \ UEXPN(2,T,F) #define BLIST2(F) F(x.s0, y.s0), F(x.s1, y.s1) #define BLIST3(F) F(x.s0, y.s0), F(x.s1, y.s1), F(x.s2, y.s2) #define BLIST4(F) BLIST2(F), F(x.s2, y.s2), F(x.s3, y.s3) #define BLIST8(F) BLIST4(F), F(x.s4, y.s4), F(x.s5, y.s5), F(x.s6, y.s6), F(x.s7, y.s7) #define BLIST16(F) BLIST8(F), F(x.s8, y.s8), F(x.s9, y.s9), F(x.sa, y.sa), F(x.sb, y.sb), F(x.sc, y.sc), F(x.sd, y.sd), F(x.se, y.se), F(x.sf, y.sf) #define BEXPN(N,T,F) \ BEXPATTR T##N \ F(T##N x, T##N y) \ { \ return (T##N) ( BLIST##N(F) ); \ } #define 
BEXP(T,F) \ BEXPN(16,T,F) \ BEXPN(8,T,F) \ BEXPN(4,T,F) \ BEXPN(3,T,F) \ BEXPN(2,T,F) #define TLIST2(F) F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define TLIST3(F) F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1), F(a.s2, b.s2, c.s2) #define TLIST4(F) TLIST2(F), F(a.s2, b.s2, c.s2), F(a.s3, b.s3, c.s3) #define TLIST8(F) TLIST4(F), F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define TLIST16(F) TLIST8(F), F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define TEXPN(N,T,F) \ TEXPATTR T##N \ F(T##N a, T##N b, T##N c) \ { \ return (T##N) ( TLIST##N(F) ); \ } #define TEXP(T,F) \ TEXPN(16,T,F) \ TEXPN(8,T,F) \ TEXPN(4,T,F) \ TEXPN(3,T,F) \ TEXPN(2,T,F) static inline long _gpu_mul_hi_i64(long x, long y) { ulong x0 = (ulong)x & 0xffffffffUL; long x1 = x >> 32; ulong y0 = (ulong)y & 0xffffffffUL; long y1 = y >> 32; ulong z0 = x0*y0; long t = x1*y0 + (z0 >> 32); long z1 = t & 0xffffffffL; long z2 = t >> 32; z1 = x0*y1 + z1; return x1*y1 + z2 + (z1 >> 32); } static inline ulong _gpu_mul_hi_u64(ulong x, ulong y) { ulong x0 = x & 0xffffffffUL; ulong x1 = x >> 32; ulong y0 = y & 0xffffffffUL; ulong y1 = y >> 32; ulong z0 = x0*y0; ulong t = x1*y0 + (z0 >> 32); ulong z1 = t & 0xffffffffUL; ulong z2 = t >> 32; z1 = x0*y1 + z1; return x1*y1 + z2 + (z1 >> 32); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/mad24.cl000066400000000000000000000011561415221260100222660ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "int.h" #define TEXPATTR __attribute__((overloadable, const)) TEXP(int,mad24) TEXP(uint,mad24) TEXPATTR int mad24(int a, int b, int c) { return ((a << 8) >> 8) * ((b << 8) >> 8) + c; } TEXPATTR uint mad24(uint a, uint b, uint c) { return ((a << 8) >> 8) * ((b << 8) >> 8) + c; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/mad_hi.cl000066400000000000000000000012401415221260100225720ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) #define GENN(N,T) \ ATTR T##N \ mad_hi(T##N a, T##N b, T##N c) \ { \ return mul_hi(a, b) + c; \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GENN(,T) GEN(char) GEN(uchar) GEN(short) GEN(ushort) GEN(int) GEN(uint) GEN(long) GEN(ulong) ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/mad_sat.cl000066400000000000000000000050051415221260100227640ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "int.h" #define TEXPATTR __attribute__((overloadable, const)) TEXP(char,mad_sat) TEXP(uchar,mad_sat) TEXP(short,mad_sat) TEXP(ushort,mad_sat) TEXP(int,mad_sat) TEXP(uint,mad_sat) TEXP(long,mad_sat) TEXP(ulong,mad_sat) TEXPATTR char mad_sat(char a, char b, char c) { return (char)clamp(mad24((int)a, (int)b, (int)c), CHAR_MIN, CHAR_MAX); } TEXPATTR uchar mad_sat(uchar a, uchar b, uchar c) { return (uchar)min(mad24((uint)a, (uint)b, (uint)c), (uint)UCHAR_MAX); } TEXPATTR short mad_sat(short a, short b, short c) { return (short)clamp(mad24((int)a, (int)b, (int)c), SHRT_MIN, SHRT_MAX); } TEXPATTR ushort mad_sat(ushort a, ushort b, ushort c) { return (ushort)min(mad24((uint)a, (uint)b, (uint)c), (uint)USHRT_MAX); } TEXPATTR int mad_sat(int a, int b, int c) { long d = as_long((int2)(a * b, mul_hi(a, b))) + (long)c; return (int)clamp(d, (long)INT_MIN, (long)INT_MAX); } TEXPATTR uint mad_sat(uint a, uint b, uint c) { ulong d = as_ulong((uint2)(a * b, mul_hi(a, b))) + (ulong)c; return (uint)min(d, (ulong)UINT_MAX); } TEXPATTR long mad_sat(long a, long b, long c) { ulong a0 = (ulong)a & 0xffffffffUL; long a1 = a >> 32; ulong b0 = (ulong)b & 0xffffffffUL; long b1 = b >> 32; ulong s0 = a0*b0; long t = a1*b0 + (s0 >> 32); long s1 = a0*b1 + (t & 0xffffffffL); long s2 = t >> 32; long lo = (s1 << 32) | (s0 & 0xffffffffL); long hi = a1*b1 + s2 + (s1 >> 32); t = lo + c; hi += ((ulong)0xffffffffffffffffUL - (ulong)c < (ulong)lo); lo = t; hi -= c < 0L; lo = (hi < 0L) & ((hi != -1L) | (lo >= 0L)) ? 0x8000000000000000L : lo; lo = (hi >= 0L) & ((hi > 0L) | (lo < 0L)) ? 
0x7fffffffffffffffL : lo; return lo; } TEXPATTR ulong mad_sat(ulong a, ulong b, ulong c) { ulong a0 = a & 0xffffffffUL; ulong a1 = a >> 32; ulong b0 = b & 0xffffffffUL; ulong b1 = b >> 32; ulong s0 = a0*b0; ulong t = a1*b0 + (s0 >> 32); ulong s1 = t & 0xffffffffUL; ulong s2 = t >> 32; s1 = a0*b1 + s1; ulong lo = (s1 << 32) | (s0 & 0xffffffffUL); ulong hi = a1*b1 + s2 + (s1 >> 32); t = lo + c; hi += 0xffffffffffffffffUL - c < lo; lo = t; return hi > 0UL ? 0xffffffffffffffffUL : lo; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/max.cl000066400000000000000000000015141415221260100221420ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) #define GENN(N,T) \ ATTR T##N \ max(T##N x, T y) \ { \ T##N vy = (T##N)y; \ return select(x, vy, x < vy); \ } \ \ ATTR T##N \ max(T##N x, T##N y) \ { \ return select(x, y, x < y); \ } #define GEN1(T) \ ATTR T \ max(T x, T y) \ { \ return x < y ? y : x; \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GEN1(T) GEN(char) GEN(uchar) GEN(short) GEN(ushort) GEN(int) GEN(uint) GEN(long) GEN(ulong) ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/min.cl000066400000000000000000000015141415221260100221400ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) #define GENN(N,T) \ ATTR T##N \ min(T##N x, T y) \ { \ T##N yv = (T##N)y; \ return select(x, yv, yv < x); \ } \ \ ATTR T##N \ min(T##N x, T##N y) \ { \ return select(x, y, y < x); \ } #define GEN1(T) \ ATTR T \ min(T x, T y) \ { \ return y < x ? y : x; \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GEN1(T) GEN(char) GEN(uchar) GEN(short) GEN(ushort) GEN(int) GEN(uint) GEN(long) GEN(ulong) ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/mul24.cl000066400000000000000000000011011415221260100223100ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "int.h" #define BEXPATTR __attribute__((overloadable, const)) BEXP(int,mul24) BEXP(uint,mul24) BEXPATTR int mul24(int x, int y) { return __ockl_mul24_i32(x, y); } BEXPATTR uint mul24(uint x, uint y) { return __ockl_mul24_u32(x, y); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/mul_hi.cl000066400000000000000000000023721415221260100226350ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "int.h" #define ATTR __attribute__((overloadable, const)) #define char_shift 8 #define short_shift 16 #define GENN(N,T) \ ATTR T##N \ mul_hi(T##N x, T##N y) \ { \ return convert_##T##N(mul24(convert_int##N(x), convert_int##N(y)) >> T##_shift); \ } \ \ ATTR u##T##N \ mul_hi(u##T##N x, u##T##N y) \ { \ return convert_u##T##N(mul24(convert_uint##N(x), convert_uint##N(y)) >> T##_shift); \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GENN(,T) GEN(char) GEN(short) #define BEXPATTR ATTR BEXP(int,mul_hi) BEXP(uint,mul_hi) BEXP(long,mul_hi) BEXP(ulong,mul_hi) BEXPATTR int mul_hi(int x, int y) { return __ockl_mul_hi_i32(x, y); } BEXPATTR uint mul_hi(uint x, uint y) { return __ockl_mul_hi_u32(x, y); } BEXPATTR long mul_hi(long x, long y) { return __ockl_mul_hi_i64(x, y); } BEXPATTR ulong mul_hi(ulong x, ulong y) { return __ockl_mul_hi_u64(x, y); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/popcount.cl000066400000000000000000000023121415221260100232210ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "int.h" #define UEXPATTR __attribute__((overloadable, const)) UEXP(char,popcount) UEXP(uchar,popcount) UEXP(short,popcount) UEXP(ushort,popcount) UEXP(int,popcount) UEXP(uint,popcount) UEXP(long,popcount) UEXP(ulong,popcount) UEXPATTR char popcount(char x) { return (char)__ockl_popcount_u32((uint)(uchar)x); } UEXPATTR uchar popcount(uchar x) { return (uchar)__ockl_popcount_u32((uint)x); } UEXPATTR short popcount(short x) { return (short)__ockl_popcount_u32((uint)(ushort)x); } UEXPATTR ushort popcount(ushort x) { return (ushort)__ockl_popcount_u32((uint)x); } UEXPATTR int popcount(int x) { return (int)__ockl_popcount_u32((uint)x); } UEXPATTR uint popcount(uint x) { return __ockl_popcount_u32(x); } UEXPATTR long popcount(long x) { return (long)__ockl_popcount_u64((ulong)x); } UEXPATTR ulong popcount(ulong x) { return __ockl_popcount_u64(x); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/rhadd.cl000066400000000000000000000020741415221260100224410ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define ATTR __attribute__((overloadable, const)) #define GENN(N,T) \ ATTR T##N \ rhadd(T##N x, T##N y) \ { \ return convert_##T##N((convert_int##N(x) + convert_int##N(y) + 1) >> 1); \ } \ \ ATTR u##T##N \ rhadd(u##T##N x, u##T##N y) \ { \ return convert_u##T##N((convert_uint##N(x) + convert_uint##N(y) + 1U) >> 1); \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GENN(,T) GEN(char) GEN(short) #define LGENN(N,T) \ ATTR T##N \ rhadd(T##N x, T##N y) \ { \ T##N c = (x | y) & (T)1; \ return (x >> 1) + (y >> 1) + c; \ } #define LGEN(T) \ LGENN(16,T) \ LGENN(8,T) \ LGENN(4,T) \ LGENN(3,T) \ LGENN(2,T) \ LGENN(,T) LGEN(int) LGEN(uint) LGEN(long) LGEN(ulong) ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/rotate.cl000066400000000000000000000030621415221260100226530ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "int.h" #define ATTR __attribute__((overloadable, const)) #define char_bits 8 #define short_bits 16 #define int_bits 32 #define long_bits 64 #define GENN(N,T) \ ATTR T##N \ rotate(T##N x, T##N y) \ { \ uint##N s = convert_uint##N(as_u##T##N(y)) & (uint)(T##_bits - 1); \ uint##N v = convert_uint##N(as_u##T##N(x)); \ return convert_##T##N((v << s) | (v >> (T##_bits - s))); \ } \ \ ATTR u##T##N \ rotate(u##T##N x, u##T##N y) \ { \ uint##N s = convert_uint##N(y) & (uint)(T##_bits - 1); \ uint##N v = convert_uint##N(x); \ return convert_u##T##N((v << s) | (v >> ((uint)T##_bits - s))); \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GENN(,T) GEN(char) GEN(short) #define LGENN(N,T) \ ATTR T##N \ rotate(T##N x, T##N y) \ { \ u##T##N s = as_u##T##N(y) & (u##T)(T##_bits - 1); \ u##T##N v = as_u##T##N(x); \ return as_##T##N((v << s) | (v >> ((u##T)T##_bits - s))); \ } \ \ ATTR u##T##N \ rotate(u##T##N x, u##T##N y) \ { \ y &= (u##T)(T##_bits - 1); \ return (x << y) | (x >> ((u##T)T##_bits - y)); \ } #define LGEN(T) \ LGENN(16,T) \ LGENN(8,T) \ LGENN(4,T) \ LGENN(3,T) \ LGENN(2,T) \ LGENN(,T) LGEN(int) LGEN(long) ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/sub_sat.cl000066400000000000000000000036001415221260100230130ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "int.h" #define ATTR __attribute__((overloadable, const)) #define char_min CHAR_MIN #define char_max CHAR_MAX #define short_min SHRT_MIN #define short_max SHRT_MAX #define uchar_max UCHAR_MAX #define ushort_max USHRT_MAX #define GENN(T) \ ATTR T \ sub_sat(T x, T y) \ { \ T s; \ bool c = __builtin_sub_overflow(x, y, &s); \ return c ? (x < 0 ? T##_min : T##_max) : s; \ } \ \ ATTR u##T \ sub_sat(u##T x, u##T y) \ { \ u##T s; \ bool c = __builtin_sub_overflow(x, y, &s); \ return c ? 0 : s; \ } GENN(char) GENN(short) #define BEXPATTR __attribute__((overloadable)) BEXP(char,sub_sat) BEXP(uchar,sub_sat) BEXP(short,sub_sat) BEXP(ushort,sub_sat) BEXP(int,sub_sat) BEXP(uint,sub_sat) BEXP(long,sub_sat) BEXP(ulong,sub_sat) BEXPATTR int sub_sat(int x, int y) { return __ockl_sub_sat_i32(x, y); } BEXPATTR uint sub_sat(uint x, uint y) { return __ockl_sub_sat_u32(x, y); } BEXPATTR long sub_sat(long x, long y) { return __ockl_sub_sat_i64(x, y); } BEXPATTR ulong sub_sat(ulong x, ulong y) { return __ockl_sub_sat_u64(x, y); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/integer/upsample.cl000066400000000000000000000024661415221260100232120ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #define _C(X,Y) X##Y #define C(X,Y) _C(X,Y) #define ATTR __attribute__((overloadable, const)) #define char_shift 8 #define short_shift 16 #define char_up short #define short_up int #define GENN(N,T) \ ATTR C(T##_up,N) \ upsample(T##N hi, u##T##N lo) \ { \ return C(convert_,C(T##_up,N))((convert_uint##N(as_u##T##N(hi)) << T##_shift) | convert_uint##N(lo)); \ } \ \ ATTR C(u,C(T##_up,N)) \ upsample(u##T##N hi, u##T##N lo) \ { \ return C(convert_u,C(T##_up,N))((convert_uint##N(hi) << T##_shift) | convert_uint##N(lo)); \ } #define GEN(T) \ GENN(16,T) \ GENN(8,T) \ GENN(4,T) \ GENN(3,T) \ GENN(2,T) \ GENN(,T) GEN(char) GEN(short) #define LGEN(N) \ ATTR long##N \ upsample(int##N hi, uint##N lo) \ { \ return as_long##N((convert_ulong##N(as_uint##N(hi)) << 32) | convert_ulong##N(lo)); \ } \ \ ATTR ulong##N \ upsample(uint##N hi, uint##N lo) \ { \ return (convert_ulong##N(hi) << 32) | convert_ulong##N(lo); \ } LGEN(16) LGEN(8) LGEN(4) LGEN(3) LGEN(2) LGEN() ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/000077500000000000000000000000001415221260100203305ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/halfmath.cl000066400000000000000000000100021415221260100224250ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ struct redret { int i; float r; }; // For trigs extern struct redret __half_red(float); extern float2 __half_scr(float); extern float __half_tr(float, int); #define IATTR __attribute__((overloadable)) #define CATTR __attribute__((overloadable, const)) #if !defined USE_CLP #define LISTU2(F) F(x.s0), F(x.s1) #define LISTU3(F) F(x.s0), F(x.s1), F(x.s2) #define LISTU4(F) LISTU2(F), F(x.s2), F(x.s3) #define LISTU8(F) LISTU4(F), F(x.s4), F(x.s5), F(x.s6), F(x.s7) #define LISTU16(F) LISTU8(F), F(x.s8), F(x.s9), F(x.sa), F(x.sb), \ F(x.sc), F(x.sd), F(x.se), F(x.sf) #define EXPUN(N,F) \ IATTR float##N \ F(float##N x) \ { \ return (float##N) ( LISTU##N(F) ); \ } #define EXPU(F) \ EXPUN(16,F) \ EXPUN(8,F) \ EXPUN(4,F) \ EXPUN(3,F) \ EXPUN(2,F) #define LISTB2(F) F(x.s0,y.s0), F(x.s1,y.s1) #define LISTB3(F) F(x.s0,y.s0), F(x.s1,y.s1), F(x.s2,y.s2) #define LISTB4(F) LISTB2(F), F(x.s2,y.s2), F(x.s3,y.s3) #define LISTB8(F) LISTB4(F), F(x.s4,y.s4), F(x.s5,y.s5), F(x.s6,y.s6), F(x.s7,y.s7) #define LISTB16(F) LISTB8(F), F(x.s8,y.s8), F(x.s9,y.s9), F(x.sa,y.sa), F(x.sb,y.sb), \ F(x.sc,y.sc), F(x.sd,y.sd), F(x.se,y.se), F(x.sf,y.sf) #define EXPBN(N,F) \ IATTR float##N \ F(float##N x, float##N y) \ { \ return (float##N) ( LISTB##N(F) ); \ } #define EXPB(F) \ EXPBN(16,F) \ EXPBN(8,F) \ EXPBN(4,F) \ EXPBN(3,F) \ EXPBN(2,F) EXPB(half_divide) EXPB(half_powr) EXPU(half_cos) EXPU(half_exp2) EXPU(half_exp) EXPU(half_exp10) EXPU(half_log2) EXPU(half_log) EXPU(half_log10) EXPU(half_recip) EXPU(half_rsqrt) EXPU(half_sin) EXPU(half_sqrt) EXPU(half_tan) #endif // !USE_CLP CATTR float half_divide(float x, float y) { return x / y; } IATTR float half_powr(float x, float y) { return powr(x, y); } IATTR float half_cos(float x) { float dx = fabs(x); int ax = as_int(dx); struct redret red =__half_red(dx); float r0 = red.r; int regn = red.i; float2 scr = __half_scr(r0); float cc = scr.y; float ss = -scr.x; float c = (regn & 1) 
!= 0 ? ss : cc; c = as_float(as_int(c) ^ ((regn > 1) << 31)); c = ax > 0x47800000 ? 1.0f : c; c = ax >= 0x7f800000 ? as_float(0x7fc00000) : c; return c; } CATTR float half_exp2(float x) { return native_exp2(x); } CATTR float half_exp(float x) { return native_exp(x); } CATTR float half_exp10(float x) { return native_exp10(x); } CATTR float half_log2(float x) { return native_log2(x); } CATTR float half_log(float x) { return native_log(x); } CATTR float half_log10(float x) { return native_log10(x); } CATTR float half_recip(float x) { return native_recip(x); } CATTR float half_rsqrt(float x) { return native_rsqrt(x); } IATTR float half_sin(float x) { int ix = as_int(x); float dx = fabs(x); int ax = as_int(dx); struct redret red = __half_red(dx); float r0 = red.r; int regn = red.i; float2 scr = __half_scr(r0); float ss = scr.x; float cc = scr.y; float s = (regn & 1) != 0 ? cc : ss; s = as_float(as_int(s) ^ ((regn > 1) << 31)); s = ax > 0x47800000 ? 1.0f : s; s = as_float(as_int(s) ^ (ix ^ ax)); s = x == 0.0f ? x : s; s = ax >= 0x7f800000 ? as_float(0x7fc00000) : s; return s; } CATTR float half_sqrt(float x) { return native_sqrt(x); } IATTR float half_tan(float x) { int ix = as_int(x); float dx = fabs(x); int ax = as_int(dx); struct redret red = __half_red(dx); float r0 = red.r; int regn = red.i; float t = __half_tr(r0, regn); t = as_float(as_int(t) ^ (ix ^ ax)); t = x == 0.0f ? x : t; t = ax > 0x47800000 ? 0.0f : t; t = ax >= 0x7f800000 ? as_float(0x7fc00000) : t; return t; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/halfred.cl000066400000000000000000000017221415221260100222570ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ // Trigonometric reduction for half_cos,sin,tan struct redret { int i; float r; }; struct redret __half_red(float x) { const float twobypi = 0x1.45f306p-1f; const float pb2_a = 0x1.92p+0f; const float pb2_b = 0x1.fap-12f; const float pb2_c = 0x1.54p-20f; const float pb2_d = 0x1.10p-30f; const float pb2_e = 0x1.68p-39f; const float pb2_f = 0x1.846988p-48f; float fn = rint(x * twobypi); struct redret ret; ret.i = (int)fn & 0x3; ret.r = mad(fn, -pb2_f, mad(fn, -pb2_e, mad(fn, -pb2_d, mad(fn, -pb2_c, mad(fn, -pb2_b, mad(fn, -pb2_a, x)))))); return ret; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/halfscr.cl000066400000000000000000000010261415221260100222710ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ float2 __half_scr(float x) { float y = x * 0x1.45f306p-3f; float s = __builtin_amdgcn_sinf(y); float result = fabs(x) < 0x1.0p-20f ? x : s; return (float2)(result, __builtin_amdgcn_cosf(y) ); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/halftr.cl000066400000000000000000000014061415221260100221310ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ __attribute__((const)) float __half_tr(float x, int regn) { float r = x * x; float a = mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f); float b = mad(r, mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f), 1.15588821434688393452299f); float t = mad(x*r, a * __builtin_amdgcn_rcpf(b), x); float tr = -__builtin_amdgcn_rcpf(t); return regn & 1 ? tr : t; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/native.cl000066400000000000000000000056171415221260100221470ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "ocml.h" #define ATTR __attribute__((overloadable, const)) #if !defined USE_CLP #define LISTU2(F) F(x.s0), F(x.s1) #define LISTU3(F) F(x.s0), F(x.s1), F(x.s2) #define LISTU4(F) LISTU2(F), F(x.s2), F(x.s3) #define LISTU8(F) LISTU4(F), F(x.s4), F(x.s5), F(x.s6), F(x.s7) #define LISTU16(F) LISTU8(F), F(x.s8), F(x.s9), F(x.sa), F(x.sb), \ F(x.sc), F(x.sd), F(x.se), F(x.sf) #define EXPUN(N,F) \ ATTR float##N \ F(float##N x) \ { \ return (float##N) ( LISTU##N(F) ); \ } #define EXPU(F) \ EXPUN(16,F) \ EXPUN(8,F) \ EXPUN(4,F) \ EXPUN(3,F) \ EXPUN(2,F) #define LISTB2(F) F(x.s0,y.s0), F(x.s1,y.s1) #define LISTB3(F) F(x.s0,y.s0), F(x.s1,y.s1), F(x.s2,y.s2) #define LISTB4(F) LISTB2(F), F(x.s2,y.s2), F(x.s3,y.s3) #define LISTB8(F) LISTB4(F), F(x.s4,y.s4), F(x.s5,y.s5), F(x.s6,y.s6), F(x.s7,y.s7) #define LISTB16(F) LISTB8(F), F(x.s8,y.s8), F(x.s9,y.s9), F(x.sa,y.sa), F(x.sb,y.sb), \ F(x.sc,y.sc), F(x.sd,y.sd), F(x.se,y.se), F(x.sf,y.sf) #define EXPBN(N,F) \ ATTR float##N \ F(float##N x, float##N y) \ { \ return (float##N) ( LISTB##N(F) ); \ } #define EXPB(F) \ EXPBN(16,F) \ EXPBN(8,F) \ 
EXPBN(4,F) \ EXPBN(3,F) \ EXPBN(2,F) EXPB(native_divide) EXPB(native_powr) EXPU(native_tan) EXPU(native_cos) EXPU(native_exp) EXPU(native_exp2) EXPU(native_exp10) EXPU(native_log) EXPU(native_log2) EXPU(native_log10) EXPU(native_recip) EXPU(native_rsqrt) EXPU(native_sin) EXPU(native_sqrt) #endif // !USE_CLP ATTR float native_divide(float x, float y) { return x * native_recip(y); } ATTR float native_powr(float x, float y) { return native_exp2(native_log2(x)*y); } ATTR float native_tan(float x) { x *= 0x1.45f306p-3f; return native_sin(x) * native_recip(native_cos(x)); } ATTR float native_cos(float x) { return __ocml_native_cos_f32(x); } ATTR float native_exp2(float x) { return __ocml_native_exp2_f32(x); } ATTR float native_exp(float f) { return __ocml_native_exp_f32(f); } ATTR float native_exp10(float f) { return __ocml_native_exp10_f32(f); } ATTR float native_log2(float x) { return __ocml_native_log2_f32(x); } ATTR float native_log(float f) { return __ocml_native_log_f32(f); } ATTR float native_log10(float f) { return __ocml_native_log10_f32(f); } ATTR float native_recip(float x) { return __ocml_native_recip_f32(x); } ATTR float native_rsqrt(float x) { return __ocml_native_rsqrt_f32(x); } ATTR float native_sin(float x) { return __ocml_native_sin_f32(x); } ATTR float native_sqrt(float x) { return __ocml_native_sqrt_f32(x); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/wrapb.cl000066400000000000000000000051251415221260100217660ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ocml.h" #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _C(X,Y) X##Y #define C(X,Y) _C(X,Y) #define ATTR __attribute__((overloadable)) #define float_ssuf _f32 #define double_ssuf _f64 #define half_ssuf _f16 #define half_psuf _2f16 #define SNAME(F,T) C(__ocml_,C(F,T##_ssuf)) #define PNAME(F,T) C(__ocml_,C(F,T##_psuf)) #define SLST2(F,T) SNAME(F,T)(x.s0,y.s0), SNAME(F,T)(x.s1,y.s1) #define SLST3(F,T) SNAME(F,T)(x.s0,y.s0), SNAME(F,T)(x.s1,y.s1), SNAME(F,T)(x.s2,y.s2) #define SLST4(F,T) SLST2(F,T), SNAME(F,T)(x.s2,y.s2), SNAME(F,T)(x.s3,y.s3) #define SLST8(F,T) SLST4(F,T), SNAME(F,T)(x.s4,y.s4), SNAME(F,T)(x.s5,y.s5), SNAME(F,T)(x.s6,y.s6), SNAME(F,T)(x.s7,y.s7) #define SLST16(F,T) SLST8(F,T), SNAME(F,T)(x.s8,y.s8), SNAME(F,T)(x.s9,y.s9), SNAME(F,T)(x.sa,y.sa), SNAME(F,T)(x.sb,y.sb), \ SNAME(F,T)(x.sc,y.sc), SNAME(F,T)(x.sd,y.sd), SNAME(F,T)(x.se,y.se), SNAME(F,T)(x.sf,y.sf) #define PLST3(F,T) PNAME(F,T)(x.s01,y.s01), SNAME(F,T)(x.s2,y.s2) #define PLST4(F,T) PNAME(F,T)(x.s01,y.s01), PNAME(F,T)(x.s23,y.s23) #define PLST8(F,T) PLST4(F,T), PNAME(F,T)(x.s45,y.s45),PNAME(F,T)(x.s67,y.s67) #define PLST16(F,T) PLST8(F,T), PNAME(F,T)(x.s89,y.s89),PNAME(F,T)(x.sab,y.sab), PNAME(F,T)(x.scd,y.scd),PNAME(F,T)(x.sef,y.sef) #define SWRAPNT(N,F,T) \ ATTR T##N \ F(T##N x, T##N y) \ { \ return (T##N) ( SLST##N(F,T) ); \ } #define PWRAPNT(N,F,T) \ ATTR T##N \ F(T##N x, T##N y) \ { \ return (T##N) ( PLST##N(F,T) ); \ } #define WRAP1T(F,T) \ ATTR T \ F(T x, T y) \ { \ return SNAME(F,T)(x, y); \ } #define WRAP2T(F,T) \ ATTR T##2 \ F(T##2 x, T##2 y) \ { \ return PNAME(F,T)(x, y); \ } #define SWRAPT(F,T) \ SWRAPNT(16,F,T) \ SWRAPNT(8,F,T) \ SWRAPNT(4,F,T) \ SWRAPNT(3,F,T) \ SWRAPNT(2,F,T) \ WRAP1T(F,T) #define PWRAPT(F,T) \ PWRAPNT(16,F,T) \ PWRAPNT(8,F,T) \ PWRAPNT(4,F,T) \ PWRAPNT(3,F,T) \ WRAP2T(F,T) \ WRAP1T(F,T) #if !defined USE_CLP #define WRAP(F) \ SWRAPT(F,float) \ SWRAPT(F,double) \ 
PWRAPT(F,half) #else #define WRAP(F) \ WRAP1T(F,float) \ WRAP1T(F,double) \ WRAP1T(F,half) \ WRAP2T(F,half) #endif WRAP(atan2) WRAP(atan2pi) WRAP(copysign) WRAP(fdim) WRAP(fmod) WRAP(hypot) WRAP(maxmag) WRAP(minmag) WRAP(nextafter) WRAP(pow) WRAP(powr) WRAP(remainder) ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/wrapbp.cl000066400000000000000000000070041415221260100221440ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "ocml.h" #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _C(X,Y) X##Y #define C(X,Y) _C(X,Y) #define ATTR __attribute__((overloadable)) #define float_ssuf _f32 #define double_ssuf _f64 #define half_ssuf _f16 #define half_psuf _2f16 #define SNAME(F,T) C(__ocml_,C(F,T##_ssuf)) #define PNAME(F,T) C(__ocml_,C(F,T##_psuf)) #define SEVN(N,F,T,P) \ P v##N; \ T r##N = SNAME(F,T)(x.s##N, &v##N) #define PEVN(N,F,T,P) \ P##2 v##N; \ T##2 r##N = PNAME(F,T)(x.s##N, &v##N) #define SEVAL2(F,T,P) SEVN(0,F,T,P); SEVN(1,F,T,P) #define SEVAL3(F,T,P) SEVAL2(F,T,P); SEVN(2,F,T,P) #define SEVAL4(F,T,P) SEVAL2(F,T,P); SEVN(2,F,T,P); SEVN(3,F,T,P) #define SEVAL8(F,T,P) SEVAL4(F,T,P); SEVN(4,F,T,P); SEVN(5,F,T,P); SEVN(6,F,T,P); SEVN(7,F,T,P) #define SEVAL16(F,T,P) SEVAL8(F,T,P); SEVN(8,F,T,P); SEVN(9,F,T,P); SEVN(a,F,T,P); SEVN(b,F,T,P); SEVN(c,F,T,P); SEVN(d,F,T,P); SEVN(e,F,T,P); SEVN(f,F,T,P) #define PEVAL3(F,T,P) PEVN(01,F,T,P); SEVN(2,F,T,P) #define PEVAL4(F,T,P) PEVN(01,F,T,P); PEVN(23,F,T,P) #define PEVAL8(F,T,P) PEVAL4(F,T,P); PEVN(45,F,T,P); PEVN(67,F,T,P) #define PEVAL16(F,T,P) PEVAL8(F,T,P); PEVN(89,F,T,P); PEVN(ab,F,T,P); PEVN(cd,F,T,P); PEVN(ef,F,T,P) #define SLST2(V) V##0, V##1 #define SLST3(V) SLST2(V), V##2 #define SLST4(V) SLST2(V), V##2, V##3 #define SLST8(V) SLST4(V), 
V##4, V##5, V##6, V##7 #define SLST16(V) SLST8(V), V##8, V##9, V##a, V##b, V##c, V##d, V##e, V##f #define PLST3(V) V##01, V##2 #define PLST4(V) V##01, V##23 #define PLST8(V) PLST4(V), V##45, V##67 #define PLST16(V) PLST8(V), V##89, V##ab, V##cd, V##ef #define SWRAPNTAP(N,F,T,A,P) \ ATTR T##N \ F(T##N x, A P##N * v) \ { \ SEVAL##N(F,T,P); \ *v = (P##N)( SLST##N(v) ); \ return (T##N) ( SLST##N(r) ); \ } #define PWRAPNTAP(N,F,T,A,P) \ ATTR T##N \ F(T##N x, A P##N * v) \ { \ PEVAL##N(F,T,P); \ *v = (P##N)( PLST##N(v) ); \ return (T##N) ( PLST##N(r) ); \ } #define WRAP1TAP(F,T,A,P) \ ATTR T \ F(T x, A P * v) \ { \ P v0; \ T r0 = SNAME(F,T)(x, &v0); \ *v = v0; \ return r0; \ } #define WRAP2TAP(F,T,A,P) \ ATTR T##2 \ F(T##2 x, A P##2 * v) \ { \ P##2 v01; \ T##2 r01 = PNAME(F,T)(x, &v01); \ *v = v01; \ return r01; \ } #define SWRAPTAP(F,T,A,P) \ SWRAPNTAP(16,F,T,A,P) \ SWRAPNTAP(8,F,T,A,P) \ SWRAPNTAP(4,F,T,A,P) \ SWRAPNTAP(3,F,T,A,P) \ SWRAPNTAP(2,F,T,A,P) \ WRAP1TAP(F,T,A,P) #define PWRAPTAP(F,T,A,P) \ PWRAPNTAP(16,F,T,A,P) \ PWRAPNTAP(8,F,T,A,P) \ PWRAPNTAP(4,F,T,A,P) \ PWRAPNTAP(3,F,T,A,P) \ WRAP2TAP(F,T,A,P) \ WRAP1TAP(F,T,A,P) #define SWRAPTP(F,T,P) \ SWRAPTAP(F,T,__private,P) \ SWRAPTAP(F,T,__local,P) \ SWRAPTAP(F,T,__global,P) \ SWRAPTAP(F,T,,P) #define PWRAPTP(F,T,P) \ PWRAPTAP(F,T,__private,P) \ PWRAPTAP(F,T,__local,P) \ PWRAPTAP(F,T,__global,P) \ PWRAPTAP(F,T,,P) SWRAPTP(fract,float,float) SWRAPTP(fract,double,double) PWRAPTP(fract,half,half) SWRAPTP(frexp,float,int) SWRAPTP(frexp,double,int) PWRAPTP(frexp,half,int) SWRAPTP(lgamma_r,float,int) SWRAPTP(lgamma_r,double,int) PWRAPTP(lgamma_r,half,int) SWRAPTP(modf,float,float) SWRAPTP(modf,double,double) PWRAPTP(modf,half,half) SWRAPTP(sincos,float,float) SWRAPTP(sincos,double,double) PWRAPTP(sincos,half,half) 
ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/wrapbs.cl000066400000000000000000000111641415221260100221510ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "ocml.h" #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _C(X,Y) X##Y #define C(X,Y) _C(X,Y) #define ATTR __attribute__((overloadable)) #define float_ssuf _f32 #define double_ssuf _f64 #define half_ssuf _f16 #define half_psuf _2f16 #define SNAME(F,T) C(__ocml_,C(F,T##_ssuf)) #define PNAME(F,T) C(__ocml_,C(F,T##_psuf)) #define SLST2(F,T) SNAME(F,T)(x.s0,y.s0), SNAME(F,T)(x.s1,y.s1) #define SLST3(F,T) SNAME(F,T)(x.s0,y.s0), SNAME(F,T)(x.s1,y.s1), SNAME(F,T)(x.s2,y.s2) #define SLST4(F,T) SLST2(F,T), SNAME(F,T)(x.s2,y.s2), SNAME(F,T)(x.s3,y.s3) #define SLST8(F,T) SLST4(F,T), SNAME(F,T)(x.s4,y.s4), SNAME(F,T)(x.s5,y.s5), SNAME(F,T)(x.s6,y.s6), SNAME(F,T)(x.s7,y.s7) #define SLST16(F,T) SLST8(F,T), SNAME(F,T)(x.s8,y.s8), SNAME(F,T)(x.s9,y.s9), SNAME(F,T)(x.sa,y.sa), SNAME(F,T)(x.sb,y.sb), \ SNAME(F,T)(x.sc,y.sc), SNAME(F,T)(x.sd,y.sd), SNAME(F,T)(x.se,y.se), SNAME(F,T)(x.sf,y.sf) #define SLST2S(F,T) SNAME(F,T)(x.s0,y), SNAME(F,T)(x.s1,y) #define SLST3S(F,T) SNAME(F,T)(x.s0,y), SNAME(F,T)(x.s1,y), SNAME(F,T)(x.s2,y) #define SLST4S(F,T) SLST2S(F,T), SNAME(F,T)(x.s2,y), SNAME(F,T)(x.s3,y) #define SLST8S(F,T) SLST4S(F,T), SNAME(F,T)(x.s4,y), SNAME(F,T)(x.s5,y), SNAME(F,T)(x.s6,y), SNAME(F,T)(x.s7,y) #define SLST16S(F,T) SLST8S(F,T), SNAME(F,T)(x.s8,y), SNAME(F,T)(x.s9,y), SNAME(F,T)(x.sa,y), SNAME(F,T)(x.sb,y), \ SNAME(F,T)(x.sc,y), SNAME(F,T)(x.sd,y), SNAME(F,T)(x.se,y), SNAME(F,T)(x.sf,y) #define PLST3(F,T) PNAME(F,T)(x.s01,y.s01), SNAME(F,T)(x.s2,y.s2) #define PLST4(F,T) PNAME(F,T)(x.s01,y.s01), PNAME(F,T)(x.s23,y.s23) #define 
PLST8(F,T) PLST4(F,T), PNAME(F,T)(x.s45,y.s45), PNAME(F,T)(x.s67,y.s67) #define PLST16(F,T) PLST8(F,T), PNAME(F,T)(x.s89,y.s89), PNAME(F,T)(x.sab,y.sab), PNAME(F,T)(x.scd,y.scd), PNAME(F,T)(x.sef,y.sef) #define PLST3S(F,T) PNAME(F,T)(x.s01,yy), SNAME(F,T)(x.s2,y) #define PLST4S(F,T) PNAME(F,T)(x.s01,yy), PNAME(F,T)(x.s23,yy) #define PLST8S(F,T) PLST4S(F,T), PNAME(F,T)(x.s45,yy), PNAME(F,T)(x.s67,yy) #define PLST16S(F,T) PLST8S(F,T), PNAME(F,T)(x.s89,yy), PNAME(F,T)(x.sab,yy), PNAME(F,T)(x.scd,yy), PNAME(F,T)(x.sef,yy) #define SWRAPTN(N,F,TX,TY) \ ATTR TX##N \ F(TX##N x, TY##N y) \ { \ return (TX##N) ( SLST##N(F,TX) ); \ } #define SWRAPSTN(N,F,TX,TY) \ ATTR TX##N \ F(TX##N x, TY y) \ { \ return (TX##N) ( SLST##N##S(F,TX) ); \ } #define PWRAPTN(N,F,TX,TY) \ ATTR TX##N \ F(TX##N x, TY##N y) \ { \ return (TX##N) ( PLST##N(F,TX) ); \ } #define PWRAPSTN(N,F,TX,TY) \ ATTR TX##N \ F(TX##N x, TY y) \ { \ TY##2 yy = (TY##2)y; \ return (TX##N) ( PLST##N##S(F,TX) ); \ } #define WRAPT1(F,TX,TY) \ ATTR TX \ F(TX x, TY y) \ { \ return SNAME(F,TX)(x, y); \ } #define WRAPT2(F,TX,TY) \ ATTR TX##2 \ F(TX##2 x, TY##2 y) \ { \ return PNAME(F,TX)(x, y); \ } #define WRAPT2S(F,TX,TY) \ ATTR TX##2 \ F(TX##2 x, TY y) \ { \ return PNAME(F,TX)(x, (TY##2)y); \ } #define SWRAPT(F,TX,TY) \ SWRAPTN(16,F,TX,TY) \ SWRAPTN(8,F,TX,TY) \ SWRAPTN(4,F,TX,TY) \ SWRAPTN(3,F,TX,TY) \ SWRAPTN(2,F,TX,TY) \ WRAPT1(F,TX,TY) #define SWRAPST(F,TX,TY) \ SWRAPTN(16,F,TX,TY) \ SWRAPSTN(16,F,TX,TY) \ SWRAPTN(8,F,TX,TY) \ SWRAPSTN(8,F,TX,TY) \ SWRAPTN(4,F,TX,TY) \ SWRAPSTN(4,F,TX,TY) \ SWRAPTN(3,F,TX,TY) \ SWRAPSTN(3,F,TX,TY) \ SWRAPTN(2,F,TX,TY) \ SWRAPSTN(2,F,TX,TY) \ WRAPT1(F,TX,TY) #define PWRAPT(F,TX,TY) \ PWRAPTN(16,F,TX,TY) \ PWRAPTN(8,F,TX,TY) \ PWRAPTN(4,F,TX,TY) \ PWRAPTN(3,F,TX,TY) \ WRAPT2(F,TX,TY) \ WRAPT1(F,TX,TY) #define PWRAPST(F,TX,TY) \ PWRAPTN(16,F,TX,TY) \ PWRAPSTN(16,F,TX,TY) \ PWRAPTN(8,F,TX,TY) \ PWRAPSTN(8,F,TX,TY) \ PWRAPTN(4,F,TX,TY) \ PWRAPSTN(4,F,TX,TY) \ PWRAPTN(3,F,TX,TY) \ 
PWRAPSTN(3,F,TX,TY) \ WRAPT2(F,TX,TY) \ WRAPT2S(F,TX,TY) \ WRAPT1(F,TX,TY) SWRAPST(fmax,float,float) SWRAPST(fmax,double,double) PWRAPST(fmax,half,half) SWRAPST(fmin,float,float) SWRAPST(fmin,double,double) PWRAPST(fmin,half,half) SWRAPST(ldexp,float,int) SWRAPST(ldexp,double,int) PWRAPST(ldexp,half,int) SWRAPST(max,float,float) SWRAPST(max,double,double) PWRAPST(max,half,half) SWRAPST(min,float,float) SWRAPST(min,double,double) PWRAPST(min,half,half) SWRAPT(pown,float,int) SWRAPT(pown,double,int) PWRAPT(pown,half,int) SWRAPT(rootn,float,int) SWRAPT(rootn,double,int) PWRAPT(rootn,half,int) ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/wrapt.cl000066400000000000000000000054571415221260100220200ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "ocml.h" #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _C(X,Y) X##Y #define C(X,Y) _C(X,Y) #define ATTR __attribute__((overloadable)) #define float_ssuf _f32 #define float_psuf _2f32 #define double_ssuf _f64 #define half_ssuf _f16 #define half_psuf _2f16 #define SNAME(F,T) C(__ocml_,C(F,T##_ssuf)) #define PNAME(F,T) C(__ocml_,C(F,T##_psuf)) #define SLST2(F,T) SNAME(F,T)(a.s0,b.s0,c.s0), SNAME(F,T)(a.s1,b.s1,c.s1) #define SLST3(F,T) SNAME(F,T)(a.s0,b.s0,c.s0), SNAME(F,T)(a.s1,b.s1,c.s1), SNAME(F,T)(a.s2,b.s2,c.s2) #define SLST4(F,T) SLST2(F,T), SNAME(F,T)(a.s2,b.s2,c.s2), SNAME(F,T)(a.s3,b.s3,c.s3) #define SLST8(F,T) SLST4(F,T), SNAME(F,T)(a.s4,b.s4,c.s4), SNAME(F,T)(a.s5,b.s5,c.s5), \ SNAME(F,T)(a.s6,b.s6,c.s6), SNAME(F,T)(a.s7,b.s7,c.s7) #define SLST16(F,T) SLST8(F,T), SNAME(F,T)(a.s8,b.s8,c.s8), SNAME(F,T)(a.s9,b.s9,c.s9), \ SNAME(F,T)(a.sa,b.sa,c.sa), SNAME(F,T)(a.sb,b.sb,c.sb), \ SNAME(F,T)(a.sc,b.sc,c.sc), SNAME(F,T)(a.sd,b.sd,c.sd), \ 
SNAME(F,T)(a.se,b.se,c.se), SNAME(F,T)(a.sf,b.sf,c.sf) #define PLST3(F,T) PNAME(F,T)(a.s01,b.s01,c.s01), SNAME(F,T)(a.s2,b.s2,c.s2) #define PLST4(F,T) PNAME(F,T)(a.s01,b.s01,c.s01), PNAME(F,T)(a.s23,b.s23,c.s23) #define PLST8(F,T) PLST4(F,T), PNAME(F,T)(a.s45,b.s45,c.s45), PNAME(F,T)(a.s67,b.s67,c.s67) #define PLST16(F,T) PLST8(F,T), PNAME(F,T)(a.s89,b.s89,c.s89), PNAME(F,T)(a.sab,b.sab,c.sab), \ PNAME(F,T)(a.scd,b.scd,c.scd), PNAME(F,T)(a.sef,b.sef,c.sef) #define SWRAPNT(N,F,T) \ ATTR T##N \ F(T##N a, T##N b, T##N c) \ { \ return (T##N) ( SLST##N(F,T) ); \ } #define PWRAPNT(N,F,T) \ ATTR T##N \ F(T##N a, T##N b, T##N c) \ { \ return (T##N) ( PLST##N(F,T) ); \ } #define WRAP1T(F,T) \ ATTR T \ F(T a, T b, T c) \ { \ return SNAME(F,T)(a, b, c); \ } #define WRAP2T(F,T) \ ATTR T##2 \ F(T##2 a, T##2 b, T##2 c) \ { \ return PNAME(F,T)(a, b, c); \ } #define SWRAPT(F,T) \ SWRAPNT(16,F,T) \ SWRAPNT(8,F,T) \ SWRAPNT(4,F,T) \ SWRAPNT(3,F,T) \ SWRAPNT(2,F,T) \ WRAP1T(F,T) #define PWRAPT(F,T) \ PWRAPNT(16,F,T) \ PWRAPNT(8,F,T) \ PWRAPNT(4,F,T) \ PWRAPNT(3,F,T) \ WRAP2T(F,T) \ WRAP1T(F,T) #if !defined USE_CLP #define WRAP(F) \ PWRAPT(F,float) \ SWRAPT(F,double) \ PWRAPT(F,half) #else #define WRAP(F) \ WRAP1T(F,float) \ WRAP1T(F,double) \ WRAP1T(F,half) \ WRAP2T(F,half) #endif WRAP(fma) WRAP(mad) ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/wraptp.cl000066400000000000000000000063571415221260100222000ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ocml.h" #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _C(X,Y) X##Y #define C(X,Y) _C(X,Y) #define ATTR __attribute__((overloadable)) #define float_ssuf _f32 #define double_ssuf _f64 #define half_ssuf _f16 #define half_psuf _2f16 #define SNAME(F,T) C(__ocml_,C(F,T##_ssuf)) #define PNAME(F,T) C(__ocml_,C(F,T##_psuf)) #define SEVN(N,F,T,P) \ P v##N; \ T r##N = SNAME(F,T)(x.s##N, y.s##N, &v##N) #define PEVN(N,F,T,P) \ P##2 v##N; \ T##2 r##N = PNAME(F,T)(x.s##N, y.s##N, &v##N) #define SEVAL2(F,T,P) SEVN(0,F,T,P); SEVN(1,F,T,P) #define SEVAL3(F,T,P) SEVAL2(F,T,P); SEVN(2,F,T,P) #define SEVAL4(F,T,P) SEVAL2(F,T,P); SEVN(2,F,T,P); SEVN(3,F,T,P) #define SEVAL8(F,T,P) SEVAL4(F,T,P); SEVN(4,F,T,P); SEVN(5,F,T,P); SEVN(6,F,T,P); SEVN(7,F,T,P) #define SEVAL16(F,T,P) SEVAL8(F,T,P); SEVN(8,F,T,P); SEVN(9,F,T,P); SEVN(a,F,T,P); SEVN(b,F,T,P); SEVN(c,F,T,P); SEVN(d,F,T,P); SEVN(e,F,T,P); SEVN(f,F,T,P) #define PEVAL3(F,T,P) PEVN(01,F,T,P); SEVN(2,F,T,P) #define PEVAL4(F,T,P) PEVN(01,F,T,P); PEVN(23,F,T,P) #define PEVAL8(F,T,P) PEVAL4(F,T,P); PEVN(45,F,T,P); PEVN(67,F,T,P) #define PEVAL16(F,T,P) PEVAL8(F,T,P); PEVN(89,F,T,P); PEVN(ab,F,T,P); PEVN(cd,F,T,P); PEVN(ef,F,T,P) #define SLST2(V) V##0, V##1 #define SLST3(V) SLST2(V), V##2 #define SLST4(V) SLST2(V), V##2, V##3 #define SLST8(V) SLST4(V), V##4, V##5, V##6, V##7 #define SLST16(V) SLST8(V), V##8, V##9, V##a, V##b, V##c, V##d, V##e, V##f #define PLST3(V) V##01, V##2 #define PLST4(V) V##01, V##23 #define PLST8(V) PLST4(V), V##45, V##67 #define PLST16(V) PLST8(V), V##89, V##ab, V##cd, V##ef #define SWRAPNTAP(N,F,T,A,P) \ ATTR T##N \ F(T##N x, T##N y, A P##N * v) \ { \ SEVAL##N(F,T,P); \ *v = (P##N)( SLST##N(v) ); \ return (T##N) ( SLST##N(r) ); \ } #define PWRAPNTAP(N,F,T,A,P) \ ATTR T##N \ F(T##N x, T##N y, A P##N * v) \ { \ PEVAL##N(F,T,P); \ *v = (P##N)( PLST##N(v) ); \ return (T##N) ( PLST##N(r) ); \ } #define WRAP1TAP(F,T,A,P) \ 
ATTR T \ F(T x, T y, A P * v) \ { \ P v0; \ T r0 = SNAME(F,T)(x, y, &v0); \ *v = v0; \ return r0; \ } #define WRAP2TAP(F,T,A,P) \ ATTR T##2 \ F(T##2 x, T##2 y, A P##2 * v) \ { \ P##2 v01; \ T##2 r01 = PNAME(F,T)(x, y, &v01); \ *v = v01; \ return r01; \ } #define SWRAPTAP(F,T,A,P) \ SWRAPNTAP(16,F,T,A,P) \ SWRAPNTAP(8,F,T,A,P) \ SWRAPNTAP(4,F,T,A,P) \ SWRAPNTAP(3,F,T,A,P) \ SWRAPNTAP(2,F,T,A,P) \ WRAP1TAP(F,T,A,P) #define PWRAPTAP(F,T,A,P) \ PWRAPNTAP(16,F,T,A,P) \ PWRAPNTAP(8,F,T,A,P) \ PWRAPNTAP(4,F,T,A,P) \ PWRAPNTAP(3,F,T,A,P) \ WRAP2TAP(F,T,A,P) \ WRAP1TAP(F,T,A,P) #define SWRAPTP(F,T,P) \ SWRAPTAP(F,T,__private,P) \ SWRAPTAP(F,T,__local,P) \ SWRAPTAP(F,T,__global,P) \ SWRAPTAP(F,T,,P) #define PWRAPTP(F,T,P) \ PWRAPTAP(F,T,__private,P) \ PWRAPTAP(F,T,__local,P) \ PWRAPTAP(F,T,__global,P) \ PWRAPTAP(F,T,,P) SWRAPTP(remquo,float,int) SWRAPTP(remquo,double,int) PWRAPTP(remquo,half,int) ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/wrapu.cl000066400000000000000000000052631415221260100220140ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ocml.h" #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _C(X,Y) X##Y #define C(X,Y) _C(X,Y) #define ATTR __attribute__((overloadable)) #define float_ssuf _f32 #define double_ssuf _f64 #define half_ssuf _f16 #define half_psuf _2f16 #define SNAME(F,T) C(__ocml_,C(F,T##_ssuf)) #define PNAME(F,T) C(__ocml_,C(F,T##_psuf)) #define SLST2(F,T) SNAME(F,T)(x.s0), SNAME(F,T)(x.s1) #define SLST3(F,T) SLST2(F,T), SNAME(F,T)(x.s2) #define SLST4(F,T) SLST2(F,T), SNAME(F,T)(x.s2), SNAME(F,T)(x.s3) #define SLST8(F,T) SLST4(F,T), SNAME(F,T)(x.s4), SNAME(F,T)(x.s5), SNAME(F,T)(x.s6), SNAME(F,T)(x.s7) #define SLST16(F,T) SLST8(F,T), SNAME(F,T)(x.s8), SNAME(F,T)(x.s9), SNAME(F,T)(x.sa), SNAME(F,T)(x.sb), \ SNAME(F,T)(x.sc), SNAME(F,T)(x.sd), SNAME(F,T)(x.se), SNAME(F,T)(x.sf) #define PLST3(F,T) PNAME(F,T)(x.s01), SNAME(F,T)(x.s2) #define PLST4(F,T) PNAME(F,T)(x.s01), PNAME(F,T)(x.s23) #define PLST8(F,T) PLST4(F,T), PNAME(F,T)(x.s45), PNAME(F,T)(x.s67) #define PLST16(F,T) PLST8(F,T), PNAME(F,T)(x.s89), PNAME(F,T)(x.sab), PNAME(F,T)(x.scd), PNAME(F,T)(x.sef) #define SWRAPNT(N,F,T) \ ATTR T##N \ F(T##N x) \ { \ return (T##N) ( SLST##N(F,T) ); \ } #define PWRAPNT(N,F,T) \ ATTR T##N \ F(T##N x) \ { \ return (T##N) ( PLST##N(F,T) ); \ } #define WRAP1T(F,T) \ ATTR T \ F(T x) \ { \ return SNAME(F,T)(x); \ } #define WRAP2T(F,T) \ ATTR T##2 \ F(T##2 x) \ { \ return PNAME(F,T)(x); \ } #define SWRAPT(F,T) \ SWRAPNT(16,F,T) \ SWRAPNT(8,F,T) \ SWRAPNT(4,F,T) \ SWRAPNT(3,F,T) \ SWRAPNT(2,F,T) \ WRAP1T(F,T) #define PWRAPT(F,T) \ PWRAPNT(16,F,T) \ PWRAPNT(8,F,T) \ PWRAPNT(4,F,T) \ PWRAPNT(3,F,T) \ WRAP2T(F,T) \ WRAP1T(F,T) #if !defined USE_CLP #define WRAP(F) \ SWRAPT(F,float) \ SWRAPT(F,double) \ PWRAPT(F,half) #else #define WRAP(F) \ WRAP1T(F,float) \ WRAP1T(F,double) \ WRAP1T(F,half) \ WRAP2T(F,half) #endif WRAP(acos) WRAP(acosh) WRAP(acospi) WRAP(asin) WRAP(asinh) WRAP(asinpi) WRAP(atan) WRAP(atanh) 
WRAP(atanpi) WRAP(cbrt) WRAP(ceil) WRAP(cos) WRAP(cosh) WRAP(cospi) WRAP(erfc) WRAP(erf) WRAP(exp) WRAP(exp2) WRAP(exp10) WRAP(expm1) WRAP(fabs) WRAP(floor) WRAP(lgamma) WRAP(log) WRAP(log2) WRAP(log10) WRAP(log1p) WRAP(logb) WRAP(rint) WRAP(round) WRAP(rsqrt) WRAP(sin) WRAP(sinh) WRAP(sinpi) WRAP(sqrt) WRAP(tan) WRAP(tanh) WRAP(tanpi) WRAP(tgamma) WRAP(trunc) ROCm-Device-Libs-rocm-5.0.0/opencl/src/math/wrapu2.cl000066400000000000000000000045161415221260100220760ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "ocml.h" #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _C(X,Y) X##Y #define C(X,Y) _C(X,Y) #define ATTR __attribute__((overloadable)) #define float_ssuf _f32 #define double_ssuf _f64 #define half_ssuf _f16 #define half_psuf _2f16 #define SNAME(F,T) C(__ocml_,C(F,T##_ssuf)) #define PNAME(F,T) C(__ocml_,C(F,T##_psuf)) #define SLST2(F,T) SNAME(F,T)(x.s0), SNAME(F,T)(x.s1) #define SLST3(F,T) SNAME(F,T)(x.s0), SNAME(F,T)(x.s1), SNAME(F,T)(x.s2) #define SLST4(F,T) SLST2(F,T), SNAME(F,T)(x.s2), SNAME(F,T)(x.s3) #define SLST8(F,T) SLST4(F,T), SNAME(F,T)(x.s4), SNAME(F,T)(x.s5), SNAME(F,T)(x.s6), SNAME(F,T)(x.s7) #define SLST16(F,T) SLST8(F,T), SNAME(F,T)(x.s8), SNAME(F,T)(x.s9), SNAME(F,T)(x.sa), SNAME(F,T)(x.sb), \ SNAME(F,T)(x.sc), SNAME(F,T)(x.sd), SNAME(F,T)(x.se), SNAME(F,T)(x.sf) #define PLST3(F,T) PNAME(F,T)(x.s01), SNAME(F,T)(x.s2) #define PLST4(F,T) PNAME(F,T)(x.s01), PNAME(F,T)(x.s23) #define PLST8(F,T) PLST4(F,T), PNAME(F,T)(x.s45), PNAME(F,T)(x.s67) #define PLST16(F,T) PLST8(F,T), PNAME(F,T)(x.s89), PNAME(F,T)(x.sab), PNAME(F,T)(x.scd), PNAME(F,T)(x.sef) #define SWRAPN(N,F,OT,IT,ST) \ ATTR OT##N \ F(IT##N x) \ { \ return (OT##N) ( SLST##N(F,ST) ); \ } #define 
PWRAPN(N,F,OT,IT,ST) \ ATTR OT##N \ F(IT##N x) \ { \ return (OT##N) ( PLST##N(F,ST) ); \ } #define WRAP1(F,OT,IT,ST) \ ATTR OT \ F(IT x) \ { \ return SNAME(F,ST)(x); \ } #define WRAP2(F,OT,IT,ST) \ ATTR OT##2 \ F(IT##2 x) \ { \ return PNAME(F,ST)(x); \ } #define SWRAP(F,OT,IT,ST) \ SWRAPN(16,F,OT,IT,ST) \ SWRAPN(8,F,OT,IT,ST) \ SWRAPN(4,F,OT,IT,ST) \ SWRAPN(3,F,OT,IT,ST) \ SWRAPN(2,F,OT,IT,ST) \ WRAP1(F,OT,IT,ST) #define PWRAP(F,OT,IT,ST) \ PWRAPN(16,F,OT,IT,ST) \ PWRAPN(8,F,OT,IT,ST) \ PWRAPN(4,F,OT,IT,ST) \ PWRAPN(3,F,OT,IT,ST) \ WRAP2(F,OT,IT,ST) \ WRAP1(F,OT,IT,ST) SWRAP(ilogb,int,float,float) SWRAP(ilogb,int,double,double) PWRAP(ilogb,int,half,half) SWRAP(nan,float,uint,float) SWRAP(nan,double,ulong,double) PWRAP(nan,half,ushort,half) ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/000077500000000000000000000000001415221260100204565ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/bfm.cl000066400000000000000000000016611415221260100215460ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_U32(bfm) #define L2 F(a.s0, b.s0), F(a.s1, b.s1) #define L3 L2, F(a.s2, b.s2) #define L4 L3, F(a.s3, b.s3) #define L8 L4, F(a.s4, b.s4), F(a.s5, b.s5), F(a.s6, b.s6), F(a.s7, b.s7) #define L16 L8, F(a.s8, b.s8), F(a.s9, b.s9), F(a.sa, b.sa), F(a.sb, b.sb), \ F(a.sc, b.sc), F(a.sd, b.sd), F(a.se, b.se), F(a.sf, b.sf) #define GEN(N) \ ATTR uint##N \ amd_bfm(uint##N a, uint##N b) \ { \ return (uint##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR uint amd_bfm(uint a, uint b) { return F(a, b); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/bitalign.cl000066400000000000000000000020661415221260100225730ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_U32(bitalign) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR uint##N \ amd_bitalign(uint##N a, uint##N b, uint##N c) \ { \ return (uint##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR uint amd_bitalign(uint a, uint b, uint c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/bytealign.cl000066400000000000000000000020711415221260100227540ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_U32(bytealign) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR uint##N \ amd_bytealign(uint##N a, uint##N b, uint##N c) \ { \ return (uint##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR uint amd_bytealign(uint a, uint b, uint c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/fmax3.cl000066400000000000000000000020631415221260100220150ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_F32(max3) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR float##N \ amd_max3(float##N a, float##N b, float##N c) \ { \ return (float##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR float amd_max3(float a, float b, float c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/fmed3.cl000066400000000000000000000020741415221260100217770ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_F32(median3) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR float##N \ amd_median3(float##N a, float##N b, float##N c) \ { \ return (float##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR float amd_median3(float a, float b, float c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/fmin3.cl000066400000000000000000000020631415221260100220130ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_F32(min3) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR float##N \ amd_min3(float##N a, float##N b, float##N c) \ { \ return (float##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR float amd_min3(float a, float b, float c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/ibfe.cl000066400000000000000000000020421415221260100217010ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_I32(bfe) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR int##N \ amd_bfe(int##N a, uint##N b, uint##N c) \ { \ return (int##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR int amd_bfe(int a, uint b, uint c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/imax3.cl000066400000000000000000000020411415221260100220140ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_I32(max3) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR int##N \ amd_max3(int##N a, int##N b, int##N c) \ { \ return (int##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR int amd_max3(int a, int b, int c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/imed3.cl000066400000000000000000000020521415221260100217760ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_I32(median3) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR int##N \ amd_median3(int##N a, int##N b, int##N c) \ { \ return (int##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR int amd_median3(int a, int b, int c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/imin3.cl000066400000000000000000000020411415221260100220120ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_I32(min3) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR int##N \ amd_min3(int##N a, int##N b, int##N c) \ { \ return (int##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR int amd_min3(int a, int b, int c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/lerp.cl000066400000000000000000000020521415221260100217370ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_U32(lerp) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR uint##N \ amd_lerp(uint##N a, uint##N b, uint##N c) \ { \ return (uint##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR uint amd_lerp(uint a, uint b, uint c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/mqsad.cl000066400000000000000000000020641415221260100221050ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_U64(mqsad) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR ulong##N \ amd_mqsad(ulong##N a, uint##N b, ulong##N c) \ { \ return (ulong##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR ulong amd_mqsad(ulong a, uint b, ulong c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/msad.cl000066400000000000000000000020521415221260100217210ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_U32(msad) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR uint##N \ amd_msad(uint##N a, uint##N b, uint##N c) \ { \ return (uint##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR uint amd_msad(uint a, uint b, uint c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/pack.cl000066400000000000000000000007121415221260100217140ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) ATTR uint amd_pack(float4 v) { return OCKL_MANGLE_U32(pack)(v); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/qsad.cl000066400000000000000000000020611415221260100217250ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_U64(qsad) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR ulong##N \ amd_qsad(ulong##N a, uint##N b, ulong##N c) \ { \ return (ulong##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR ulong amd_qsad(ulong a, uint b, ulong c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/sad.cl000066400000000000000000000020471415221260100215500ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #include "ockl.h" #define ATTR __attribute__((overloadable, const)) #define F OCKL_MANGLE_U32(sad) #define L2 F(a.s0, b.s0, c.s0), F(a.s1, b.s1, c.s1) #define L3 L2, F(a.s2, b.s2, c.s2) #define L4 L3, F(a.s3, b.s3, c.s3) #define L8 L4, F(a.s4, b.s4, c.s4), F(a.s5, b.s5, c.s5), F(a.s6, b.s6, c.s6), F(a.s7, b.s7, c.s7) #define L16 L8, F(a.s8, b.s8, c.s8), F(a.s9, b.s9, c.s9), F(a.sa, b.sa, c.sa), F(a.sb, b.sb, c.sb), \ F(a.sc, b.sc, c.sc), F(a.sd, b.sd, c.sd), F(a.se, b.se, c.se), F(a.sf, b.sf, c.sf) #define GEN(N) \ ATTR uint##N \ amd_sad(uint##N a, uint##N b, uint##N c) \ { \ return (uint##N)( L##N ); \ } GEN(16) GEN(8) GEN(4) GEN(3) GEN(2) ATTR uint amd_sad(uint a, uint b, uint c) { return F(a, b, c); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/sad4.cl000066400000000000000000000011471415221260100216340ustar00rootroot00000000000000 /*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "ockl.h" __attribute__((overloadable, const)) uint amd_sad4(uint4 x, uint4 y, uint z) { uint a = OCKL_MANGLE_U32(sad)(x.s0,y.s0,z); a = OCKL_MANGLE_U32(sad)(x.s1,y.s1,a); a = OCKL_MANGLE_U32(sad)(x.s2,y.s2,a); return OCKL_MANGLE_U32(sad)(x.s3,y.s3,a); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/sadd.cl000066400000000000000000000020521415221260100217100ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/

#include "ockl.h"

#define ATTR __attribute__((overloadable, const))

// Scalar OCKL routine that every amd_sadd overload forwards to.
#define SCALAR_OP OCKL_MANGLE_U32(sadd)

// Result of the scalar routine for component I of the operands a, b and c.
#define LANE(I) SCALAR_OP(a.s##I, b.s##I, c.s##I)

// Comma-separated per-component results for each supported vector width.
#define LANES2  LANE(0), LANE(1)
#define LANES3  LANES2, LANE(2)
#define LANES4  LANES3, LANE(3)
#define LANES8  LANES4, LANE(4), LANE(5), LANE(6), LANE(7)
#define LANES16 LANES8, LANE(8), LANE(9), LANE(a), LANE(b), \
                LANE(c), LANE(d), LANE(e), LANE(f)

// Defines the N-component overload by assembling a vector literal from the
// per-component scalar results.
#define DEFINE_VECTOR(N) \
ATTR uint##N \
amd_sadd(uint##N a, uint##N b, uint##N c) \
{ \
    return (uint##N)(LANES##N); \
}

DEFINE_VECTOR(16)
DEFINE_VECTOR(8)
DEFINE_VECTOR(4)
DEFINE_VECTOR(3)
DEFINE_VECTOR(2)

// Scalar overload: a direct call to the OCKL routine.
ATTR uint
amd_sadd(uint a, uint b, uint c)
{
    return SCALAR_OP(a, b, c);
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/sadhi.cl000066400000000000000000000020551415221260100220700ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "ockl.h"

#define ATTR __attribute__((overloadable, const))

// Scalar OCKL routine that every amd_sadhi overload forwards to.
#define SCALAR_OP OCKL_MANGLE_U32(sadhi)

// Result of the scalar routine for component I of the operands a, b and c.
#define LANE(I) SCALAR_OP(a.s##I, b.s##I, c.s##I)

// Comma-separated per-component results for each supported vector width.
#define LANES2  LANE(0), LANE(1)
#define LANES3  LANES2, LANE(2)
#define LANES4  LANES3, LANE(3)
#define LANES8  LANES4, LANE(4), LANE(5), LANE(6), LANE(7)
#define LANES16 LANES8, LANE(8), LANE(9), LANE(a), LANE(b), \
                LANE(c), LANE(d), LANE(e), LANE(f)

// Defines the N-component overload by assembling a vector literal from the
// per-component scalar results.
#define DEFINE_VECTOR(N) \
ATTR uint##N \
amd_sadhi(uint##N a, uint##N b, uint##N c) \
{ \
    return (uint##N)(LANES##N); \
}

DEFINE_VECTOR(16)
DEFINE_VECTOR(8)
DEFINE_VECTOR(4)
DEFINE_VECTOR(3)
DEFINE_VECTOR(2)

// Scalar overload: a direct call to the OCKL routine.
ATTR uint
amd_sadhi(uint a, uint b, uint c)
{
    return SCALAR_OP(a, b, c);
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/sadw.cl000066400000000000000000000020521415221260100217330ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "ockl.h"

#define ATTR __attribute__((overloadable, const))

// Scalar OCKL routine that every amd_sadw overload forwards to.
#define SCALAR_OP OCKL_MANGLE_U32(sadw)

// Result of the scalar routine for component I of the operands a, b and c.
#define LANE(I) SCALAR_OP(a.s##I, b.s##I, c.s##I)

// Comma-separated per-component results for each supported vector width.
#define LANES2  LANE(0), LANE(1)
#define LANES3  LANES2, LANE(2)
#define LANES4  LANES3, LANE(3)
#define LANES8  LANES4, LANE(4), LANE(5), LANE(6), LANE(7)
#define LANES16 LANES8, LANE(8), LANE(9), LANE(a), LANE(b), \
                LANE(c), LANE(d), LANE(e), LANE(f)

// Defines the N-component overload by assembling a vector literal from the
// per-component scalar results.
#define DEFINE_VECTOR(N) \
ATTR uint##N \
amd_sadw(uint##N a, uint##N b, uint##N c) \
{ \
    return (uint##N)(LANES##N); \
}

DEFINE_VECTOR(16)
DEFINE_VECTOR(8)
DEFINE_VECTOR(4)
DEFINE_VECTOR(3)
DEFINE_VECTOR(2)

// Scalar overload: a direct call to the OCKL routine.
ATTR uint
amd_sadw(uint a, uint b, uint c)
{
    return SCALAR_OP(a, b, c);
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/ubfe.cl000066400000000000000000000020471415221260100217220ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "ockl.h"

#define ATTR __attribute__((overloadable, const))

// Scalar OCKL routine that every amd_bfe overload forwards to.
#define SCALAR_OP OCKL_MANGLE_U32(bfe)

// Result of the scalar routine for component I of the operands a, b and c.
#define LANE(I) SCALAR_OP(a.s##I, b.s##I, c.s##I)

// Comma-separated per-component results for each supported vector width.
#define LANES2  LANE(0), LANE(1)
#define LANES3  LANES2, LANE(2)
#define LANES4  LANES3, LANE(3)
#define LANES8  LANES4, LANE(4), LANE(5), LANE(6), LANE(7)
#define LANES16 LANES8, LANE(8), LANE(9), LANE(a), LANE(b), \
                LANE(c), LANE(d), LANE(e), LANE(f)

// Defines the N-component overload by assembling a vector literal from the
// per-component scalar results.
#define DEFINE_VECTOR(N) \
ATTR uint##N \
amd_bfe(uint##N a, uint##N b, uint##N c) \
{ \
    return (uint##N)(LANES##N); \
}

DEFINE_VECTOR(16)
DEFINE_VECTOR(8)
DEFINE_VECTOR(4)
DEFINE_VECTOR(3)
DEFINE_VECTOR(2)

// Scalar overload: a direct call to the OCKL routine.
ATTR uint
amd_bfe(uint a, uint b, uint c)
{
    return SCALAR_OP(a, b, c);
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/umax3.cl000066400000000000000000000020521415221260100220320ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "ockl.h"

#define ATTR __attribute__((overloadable, const))

// Scalar OCKL routine that every amd_max3 overload forwards to.
#define SCALAR_OP OCKL_MANGLE_U32(max3)

// Result of the scalar routine for component I of the operands a, b and c.
#define LANE(I) SCALAR_OP(a.s##I, b.s##I, c.s##I)

// Comma-separated per-component results for each supported vector width.
#define LANES2  LANE(0), LANE(1)
#define LANES3  LANES2, LANE(2)
#define LANES4  LANES3, LANE(3)
#define LANES8  LANES4, LANE(4), LANE(5), LANE(6), LANE(7)
#define LANES16 LANES8, LANE(8), LANE(9), LANE(a), LANE(b), \
                LANE(c), LANE(d), LANE(e), LANE(f)

// Defines the N-component overload by assembling a vector literal from the
// per-component scalar results.
#define DEFINE_VECTOR(N) \
ATTR uint##N \
amd_max3(uint##N a, uint##N b, uint##N c) \
{ \
    return (uint##N)(LANES##N); \
}

DEFINE_VECTOR(16)
DEFINE_VECTOR(8)
DEFINE_VECTOR(4)
DEFINE_VECTOR(3)
DEFINE_VECTOR(2)

// Scalar overload: a direct call to the OCKL routine.
ATTR uint
amd_max3(uint a, uint b, uint c)
{
    return SCALAR_OP(a, b, c);
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/umed3.cl000066400000000000000000000020631415221260100220140ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "ockl.h"

#define ATTR __attribute__((overloadable, const))

// Scalar OCKL routine that every amd_median3 overload forwards to.
#define SCALAR_OP OCKL_MANGLE_U32(median3)

// Result of the scalar routine for component I of the operands a, b and c.
#define LANE(I) SCALAR_OP(a.s##I, b.s##I, c.s##I)

// Comma-separated per-component results for each supported vector width.
#define LANES2  LANE(0), LANE(1)
#define LANES3  LANES2, LANE(2)
#define LANES4  LANES3, LANE(3)
#define LANES8  LANES4, LANE(4), LANE(5), LANE(6), LANE(7)
#define LANES16 LANES8, LANE(8), LANE(9), LANE(a), LANE(b), \
                LANE(c), LANE(d), LANE(e), LANE(f)

// Defines the N-component overload by assembling a vector literal from the
// per-component scalar results.
#define DEFINE_VECTOR(N) \
ATTR uint##N \
amd_median3(uint##N a, uint##N b, uint##N c) \
{ \
    return (uint##N)(LANES##N); \
}

DEFINE_VECTOR(16)
DEFINE_VECTOR(8)
DEFINE_VECTOR(4)
DEFINE_VECTOR(3)
DEFINE_VECTOR(2)

// Scalar overload: a direct call to the OCKL routine.
ATTR uint
amd_median3(uint a, uint b, uint c)
{
    return SCALAR_OP(a, b, c);
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/umin3.cl000066400000000000000000000020521415221260100220300ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "ockl.h"

#define ATTR __attribute__((overloadable, const))

// Scalar OCKL routine that every amd_min3 overload forwards to.
#define SCALAR_OP OCKL_MANGLE_U32(min3)

// Result of the scalar routine for component I of the operands a, b and c.
#define LANE(I) SCALAR_OP(a.s##I, b.s##I, c.s##I)

// Comma-separated per-component results for each supported vector width.
#define LANES2  LANE(0), LANE(1)
#define LANES3  LANES2, LANE(2)
#define LANES4  LANES3, LANE(3)
#define LANES8  LANES4, LANE(4), LANE(5), LANE(6), LANE(7)
#define LANES16 LANES8, LANE(8), LANE(9), LANE(a), LANE(b), \
                LANE(c), LANE(d), LANE(e), LANE(f)

// Defines the N-component overload by assembling a vector literal from the
// per-component scalar results.
#define DEFINE_VECTOR(N) \
ATTR uint##N \
amd_min3(uint##N a, uint##N b, uint##N c) \
{ \
    return (uint##N)(LANES##N); \
}

DEFINE_VECTOR(16)
DEFINE_VECTOR(8)
DEFINE_VECTOR(4)
DEFINE_VECTOR(3)
DEFINE_VECTOR(2)

// Scalar overload: a direct call to the OCKL routine.
ATTR uint
amd_min3(uint a, uint b, uint c)
{
    return SCALAR_OP(a, b, c);
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/media/unpack.cl000066400000000000000000000022241415221260100222570ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "ockl.h"

#define ATTR __attribute__((overloadable, const))

// Name of the scalar OCKL routine for selector B (0..3).
#define UNPACK(B) __ockl_unpack##B##_f32

// Result of the selector-B routine for component I of the operand a.
#define LANE(B, I) UNPACK(B)(a.s##I)

// Comma-separated per-component results for each supported vector width.
#define LANES2(B)  LANE(B, 0), LANE(B, 1)
#define LANES3(B)  LANES2(B), LANE(B, 2)
#define LANES4(B)  LANES3(B), LANE(B, 3)
#define LANES8(B)  LANES4(B), LANE(B, 4), LANE(B, 5), LANE(B, 6), LANE(B, 7)
#define LANES16(B) LANES8(B), LANE(B, 8), LANE(B, 9), LANE(B, a), LANE(B, b), \
                   LANE(B, c), LANE(B, d), LANE(B, e), LANE(B, f)

// Defines the N-component amd_unpack<B> overload.
#define DEFINE_VECTOR(N, B) \
ATTR float##N \
amd_unpack##B(uint##N a) \
{ \
    return (float##N)(LANES##N(B)); \
}

// Defines every vector width of amd_unpack<B>.
#define DEFINE_VECTORS(B) \
    DEFINE_VECTOR(16, B) \
    DEFINE_VECTOR(8, B) \
    DEFINE_VECTOR(4, B) \
    DEFINE_VECTOR(3, B) \
    DEFINE_VECTOR(2, B)

DEFINE_VECTORS(0)
DEFINE_VECTORS(1)
DEFINE_VECTORS(2)
DEFINE_VECTORS(3)

// Scalar overloads: direct calls to the matching OCKL routine.
ATTR float
amd_unpack0(uint a)
{
    return UNPACK(0)(a);
}

ATTR float
amd_unpack1(uint a)
{
    return UNPACK(1)(a);
}

ATTR float
amd_unpack2(uint a)
{
    return UNPACK(2)(a);
}

ATTR float
amd_unpack3(uint a)
{
    return UNPACK(3)(a);
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/misc/000077500000000000000000000000001415221260100203325ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/misc/amdblit.cl000066400000000000000000000355151415221260100222770ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

// Everything in this file is compiled out when NO_BLIT is defined.
#if !defined NO_BLIT

#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable

// Upper bound on the number of texels a single work-item writes in
// __amd_fillImage.
static const uint SplitCount = 3;

// Copies one texel from a linear buffer into an image.
//
// The work-item's global id (x, y, z) selects the texel; ids outside `size`
// return without writing.
//   src        source buffer, reinterpreted below as uchar/ushort/uint
//   dst        destination image
//   srcOrigin  only .x is used, as an additive term of the source index
//   dstOrigin  .x/.y/.z are added to the global id to form the image coords
//   size       extent of the copied region, compared against the global id
//   format     .x = number of components (1, 2 or 4 are handled)
//              .y = selects the per-component load width: 1 -> uchar,
//                   2 -> ushort, anything else -> uint
//              .z = multiplier applied to the linear texel index
//   pitch      .x multiplies y and .y multiplies z in the linear index
__attribute__((always_inline)) void
__amd_copyBufferToImage(
    __global uint *src,
    __write_only image2d_array_t dst,
    ulong4 srcOrigin,
    int4 dstOrigin,
    int4 size,
    uint4 format,
    ulong4 pitch)
{
    ulong idxSrc;
    int4 coordsDst;
    // NOTE(review): components not assigned by the selected case below are
    // passed to write_imageui unassigned, and a format.x other than 1/2/4
    // leaves the whole vector unassigned - presumably the image format
    // ignores them; confirm against the caller.
    uint4 pixel;
    __global uint* srcUInt = src;
    __global ushort* srcUShort = (__global ushort*)src;
    __global uchar* srcUChar = (__global uchar*)src;
    ushort tmpUShort;
    uint tmpUInt;

    coordsDst.x = get_global_id(0);
    coordsDst.y = get_global_id(1);
    coordsDst.z = get_global_id(2);
    coordsDst.w = 0;

    if ((coordsDst.x >= size.x) ||
        (coordsDst.y >= size.y) ||
        (coordsDst.z >= size.z)) {
        return;
    }

    // The index is computed from the un-offset coordinates; it is used with
    // whichever element type the format selects below.
    idxSrc = (coordsDst.z * pitch.y +
        coordsDst.y * pitch.x + coordsDst.x) *
        format.z + srcOrigin.x;

    coordsDst.x += dstOrigin.x;
    coordsDst.y += dstOrigin.y;
    coordsDst.z += dstOrigin.z;

    // Check components
    switch (format.x) {
    case 1:
        // Check size
        if (format.y == 1) {
            pixel.x = (uint)srcUChar[idxSrc];
        }
        else if (format.y == 2) {
            pixel.x = (uint)srcUShort[idxSrc];
        }
        else {
            pixel.x = srcUInt[idxSrc];
        }
        break;
    case 2:
        // Check size
        if (format.y == 1) {
            // Two 8-bit components unpacked from one 16-bit load.
            tmpUShort = srcUShort[idxSrc];
            pixel.x = (uint)(tmpUShort & 0xff);
            pixel.y = (uint)(tmpUShort >> 8);
        }
        else if (format.y == 2) {
            // Two 16-bit components unpacked from one 32-bit load.
            tmpUInt = srcUInt[idxSrc];
            pixel.x = (tmpUInt & 0xffff);
            pixel.y = (tmpUInt >> 16);
        }
        else {
            pixel.x = srcUInt[idxSrc++];
            pixel.y = srcUInt[idxSrc];
        }
        break;
    case 4:
        // Check size
        if (format.y == 1) {
            // Four 8-bit components unpacked from one 32-bit load.
            tmpUInt = srcUInt[idxSrc];
            pixel.x = tmpUInt & 0xff;
            pixel.y = (tmpUInt >> 8) & 0xff;
            pixel.z = (tmpUInt >> 16) & 0xff;
            pixel.w = (tmpUInt >> 24) & 0xff;
        }
        else if (format.y == 2) {
            // Four 16-bit components unpacked from two 32-bit loads.
            tmpUInt = srcUInt[idxSrc++];
            pixel.x = tmpUInt & 0xffff;
            pixel.y = (tmpUInt >> 16);
            tmpUInt = srcUInt[idxSrc];
            pixel.z = tmpUInt & 0xffff;
            pixel.w = (tmpUInt >> 16);
        }
        else {
            pixel.x = srcUInt[idxSrc++];
            pixel.y = srcUInt[idxSrc++];
            pixel.z = srcUInt[idxSrc++];
            pixel.w = srcUInt[idxSrc];
        }
        break;
    }
    // Write the final pixel
    write_imageui(dst, coordsDst, pixel);
}

// Copies one texel from an image into a linear buffer; the packing mirrors
// the unpacking done by __amd_copyBufferToImage.
//
// The three dst* pointers are indexed with the same linear index; which one
// is written depends on format.x (component count: 1, 2 or 4) and format.y
// (1, 2 or 4). Other values of either field store nothing.
//   srcOrigin  .x/.y/.z are added to the global id to form the image coords
//   dstOrigin  only .x is used, as an additive term of the destination index
//   size       extent of the copied region, compared against the global id
//   format.z   multiplier applied to the linear texel index
//   pitch      .x multiplies y and .y multiplies z in the linear index
__attribute__((always_inline)) void
__amd_copyImageToBuffer(
    __read_only image2d_array_t src,
    __global uint* dstUInt,
    __global ushort* dstUShort,
    __global uchar* dstUChar,
    int4 srcOrigin,
    ulong4 dstOrigin,
    int4 size,
    uint4 format,
    ulong4 pitch)
{
    ulong idxDst;
    int4 coordsSrc;
    uint4 texel;

    coordsSrc.x = get_global_id(0);
    coordsSrc.y = get_global_id(1);
    coordsSrc.z = get_global_id(2);
    coordsSrc.w = 0;

    if ((coordsSrc.x >= size.x) ||
        (coordsSrc.y >= size.y) ||
        (coordsSrc.z >= size.z)) {
        return;
    }

    idxDst = (coordsSrc.z * pitch.y + coordsSrc.y * pitch.x +
        coordsSrc.x) * format.z + dstOrigin.x;

    coordsSrc.x += srcOrigin.x;
    coordsSrc.y += srcOrigin.y;
    coordsSrc.z += srcOrigin.z;

    texel = read_imageui(src, coordsSrc);

    // Check components
    switch (format.x) {
    case 1:
        // Check size
        switch (format.y) {
        case 1:
            dstUChar[idxDst] = (uchar)texel.x;
            break;
        case 2:
            dstUShort[idxDst] = (ushort)texel.x;
            break;
        case 4:
            dstUInt[idxDst] = texel.x;
            break;
        }
        break;
    case 2:
        // Check size
        switch (format.y) {
        case 1:
            // Two 8-bit components packed into one 16-bit store.
            dstUShort[idxDst] = (ushort)texel.x |
               ((ushort)texel.y << 8);
            break;
        case 2:
            // Two 16-bit components packed into one 32-bit store.
            dstUInt[idxDst] = texel.x | (texel.y << 16);
            break;
        case 4:
            dstUInt[idxDst++] = texel.x;
            dstUInt[idxDst] = texel.y;
            break;
        }
        break;
    case 4:
        // Check size
        switch (format.y) {
        case 1:
            // Four 8-bit components packed into one 32-bit store.
            dstUInt[idxDst] = (uint)texel.x |
               (texel.y << 8) |
               (texel.z << 16) |
               (texel.w << 24);
            break;
        case 2:
            // Four 16-bit components packed into two 32-bit stores.
            dstUInt[idxDst++] = texel.x | (texel.y << 16);
            dstUInt[idxDst] = texel.z | (texel.w << 16);
            break;
        case 4:
            dstUInt[idxDst++] = texel.x;
            dstUInt[idxDst++] = texel.y;
            dstUInt[idxDst++] = texel.z;
            dstUInt[idxDst] = texel.w;
            break;
        }
        break;
    }
}

// Copies one texel from `src` to `dst`. The global id selects the texel
// inside `size`; srcOrigin and dstOrigin are added to it to form the read
// and write coordinates.
__attribute__((always_inline)) void
__amd_copyImage(
    __read_only image2d_array_t src,
    __write_only image2d_array_t dst,
    int4 srcOrigin,
    int4 dstOrigin,
    int4 size)
{
    int4 coordsDst;
    int4 coordsSrc;

    coordsDst.x = get_global_id(0);
    coordsDst.y = get_global_id(1);
    coordsDst.z = get_global_id(2);
    coordsDst.w = 0;

    if ((coordsDst.x >= size.x) ||
        (coordsDst.y >= size.y) ||
        (coordsDst.z >= size.z)) {
        return;
    }

    coordsSrc = srcOrigin + coordsDst;
    coordsDst += dstOrigin;

    uint4 texel;
    texel = read_imageui(src, coordsSrc);
    write_imageui(dst, coordsDst, texel);
}

// Same as __amd_copyImage, except that a non-zero srcOrigin.w / dstOrigin.w
// moves the y coordinate of that side into z and sets y to 0 before the
// access. The name suggests this is for 1D array images, where the second
// coordinate is the layer - TODO confirm against the caller.
__attribute__((always_inline)) void
__amd_copyImage1DA(
    __read_only image2d_array_t src,
    __write_only image2d_array_t dst,
    int4 srcOrigin,
    int4 dstOrigin,
    int4 size)
{
    int4 coordsDst;
    int4 coordsSrc;

    coordsDst.x = get_global_id(0);
    coordsDst.y = get_global_id(1);
    coordsDst.z = get_global_id(2);
    coordsDst.w = 0;

    if ((coordsDst.x >= size.x) ||
        (coordsDst.y >= size.y) ||
        (coordsDst.z >= size.z)) {
        return;
    }

    coordsSrc = srcOrigin + coordsDst;
    coordsDst += dstOrigin;
    if (srcOrigin.w != 0) {
       coordsSrc.z = coordsSrc.y;
       coordsSrc.y = 0;
    }
    if (dstOrigin.w != 0) {
       coordsDst.z = coordsDst.y;
       coordsDst.y = 0;
    }

    uint4 texel;
    texel = read_imageui(src, coordsSrc);
    write_imageui(dst, coordsDst, texel);
}

// Copies one byte of a 3D region between two buffers.
//
// For both srcRect and dstRect: .z is the starting offset, .x multiplies y
// and .y multiplies z. `size` is the extent compared against the global id.
__attribute__((always_inline)) void
__amd_copyBufferRect(
    __global uchar* src,
    __global uchar* dst,
    ulong4 srcRect,
    ulong4 dstRect,
    ulong4 size)
{
    ulong x = get_global_id(0);
    ulong y = get_global_id(1);
    ulong z = get_global_id(2);

    if ((x >= size.x) ||
        (y >= size.y) ||
        (z >= size.z)) {
        return;
    }

    ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y;
    ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y;

    dst[offsDst] = src[offsSrc];
}

// Variant of __amd_copyBufferRect that moves one uint per work-item, or one
// uint4 when size.w == 16. The offsets are element indices of whichever
// type is used for the copy.
__attribute__((always_inline)) void
__amd_copyBufferRectAligned(
    __global uint* src,
    __global uint* dst,
    ulong4 srcRect,
    ulong4 dstRect,
    ulong4 size)
{
    ulong x = get_global_id(0);
    ulong y = get_global_id(1);
    ulong z = get_global_id(2);

    if ((x >= size.x) ||
        (y >= size.y) ||
        (z >= size.z)) {
        return;
    }

    ulong offsSrc = srcRect.z + x + y * srcRect.x + z * srcRect.y;
    ulong offsDst = dstRect.z + x + y * dstRect.x + z * dstRect.y;

    if (size.w == 16) {
        __global uint4* src4 = (__global uint4*)src;
        __global uint4* dst4 = (__global uint4*)dst;
        dst4[offsDst] = src4[offsSrc];
    }
    else {
        dst[offsDst] = src[offsSrc];
    }
}

// Linear buffer copy; work-items with id >= size do nothing.
//
// srcOrigin/dstOrigin are byte offsets added to the base pointers.
//   remain == 8  every work-item copies the single byte at index id
//   otherwise    every work-item but the last (id == size - 1) copies one
//                uint at index id; the last one copies `remain` individual
//                bytes starting at byte offset id * 4
__attribute__((always_inline)) void
__amd_copyBuffer(
    __global uchar* srcI,
    __global uchar* dstI,
    ulong srcOrigin,
    ulong dstOrigin,
    ulong size,
    uint remain)
{
    ulong id = get_global_id(0);

    if (id >= size) {
        return;
    }

    __global uchar* src = srcI + srcOrigin;
    __global uchar* dst = dstI + dstOrigin;

    if (remain == 8) {
        dst[id] = src[id];
    }
    else {
        if (id < (size - 1)) {
            __global uint* srcD = (__global uint*)(src);
            __global uint* dstD = (__global uint*)(dst);
            dstD[id] = srcD[id];
        }
        else {
            for (uint i = 0; i < remain; ++i) {
                dst[id * 4 + i] = src[id * 4 + i];
            }
        }
    }
}

// Linear buffer copy of one uint per work-item, or one uint4 when
// alignment == 16. srcOrigin/dstOrigin are added to the element index, so
// they are in units of the copied element type.
__attribute__((always_inline)) void
__amd_copyBufferAligned(
    __global uint* src,
    __global uint* dst,
    ulong srcOrigin,
    ulong dstOrigin,
    ulong size,
    uint alignment)
{
    ulong id = get_global_id(0);

    if (id >= size) {
        return;
    }

    ulong offsSrc = id + srcOrigin;
    ulong offsDst = id + dstOrigin;

    if (alignment == 16) {
        __global uint4* src4 = (__global uint4*)src;
        __global uint4* dst4 = (__global uint4*)dst;
        dst4[offsDst] = src4[offsSrc];
    }
    else {
        dst[offsDst] = src[offsSrc];
    }
}

// Writes one copy of `pattern` per work-item.
//
// When bufUInt is non-null the pattern is read and written as uints,
// otherwise as bytes through bufUChar. patternSize and offset are counted
// in elements of the type used; work-item id writes elements
// [offset + id * patternSize, offset + (id + 1) * patternSize).
__attribute__((always_inline)) void
__amd_fillBuffer(
    __global uchar* bufUChar,
    __global uint* bufUInt,
    __constant uchar* pattern,
    uint patternSize,
    ulong offset,
    ulong size)
{
    ulong id = get_global_id(0);

    if (id >= size) {
        return;
    }

    if (bufUInt) {
        __global uint* element = &bufUInt[offset + id * patternSize];
        __constant uint* pt = (__constant uint*)pattern;

        for (uint i = 0; i < patternSize; ++i) {
            element[i] = pt[i];
        }
    }
    else {
        __global uchar* element = &bufUChar[offset + id * patternSize];

        for (uint i = 0; i < patternSize; ++i) {
            element[i] = pattern[i];
        }
    }
}

// Same as __amd_fillBuffer, with four candidate element widths. The first
// non-null pointer in the order bufULong, bufUInt, bufUShort selects the
// element type; if all three are null, bytes are written through bufUChar.
__attribute__((always_inline)) void
__amd_fillBufferAligned(
    __global uchar* bufUChar,
    __global ushort* bufUShort,
    __global uint* bufUInt,
    __global ulong* bufULong,
    __constant uchar* pattern,
    uint patternSize,
    ulong offset,
    ulong size)
{
    ulong id = get_global_id(0);

    if (id >= size) {
        return;
    }

    if (bufULong) {
        __global ulong* element = &bufULong[offset + id * patternSize];
        __constant ulong* pt = (__constant ulong*)pattern;

        for (uint i = 0; i < patternSize; ++i) {
            element[i] = pt[i];
        }
    }
    else if (bufUInt) {
        __global uint* element = &bufUInt[offset + id * patternSize];
        __constant uint* pt = (__constant uint*)pattern;

        for (uint i = 0; i < patternSize; ++i) {
            element[i] = pt[i];
        }
    }
    else if (bufUShort) {
        __global ushort* element = &bufUShort[offset + id * patternSize];
        __constant ushort* pt = (__constant ushort*)pattern;

        for (uint i = 0; i < patternSize; ++i) {
            element[i] = pt[i];
        }
    }
    else {
        __global uchar* element = &bufUChar[offset + id * patternSize];

        for (uint i = 0; i < patternSize; ++i) {
            element[i] = pattern[i];
        }
    }
}

// Fills a region of an image with a constant texel.
//
// `type` selects the write: 0 -> write_imagef with patternFLOAT4,
// 1 -> write_imagei with patternINT4, 2 -> write_imageui with patternUINT4;
// any other value writes nothing. Each work-item writes up to SplitCount
// texels along x, advancing by the global size in dimension 0 after each
// one and stopping once x reaches size.x + origin.x.
__attribute__((always_inline)) void
__amd_fillImage(
    __write_only image2d_array_t image,
    float4 patternFLOAT4,
    int4 patternINT4,
    uint4 patternUINT4,
    int4 origin,
    int4 size,
    uint type)
{
    int4 coords;

    coords.x = get_global_id(0);
    coords.y = get_global_id(1);
    coords.z = get_global_id(2);
    coords.w = 0;

    if ((coords.x >= size.x) ||
        (coords.y >= size.y) ||
        (coords.z >= size.z)) {
        return;
    }

    coords += origin;

    int SizeX = get_global_size(0);
    int AdjustedSizeX = size.x + origin.x;

    for (uint i = 0; i < SplitCount; ++i) {
        // Check components
        switch (type) {
        case 0:
            write_imagef(image, coords, patternFLOAT4);
            break;
        case 1:
            write_imagei(image, coords, patternINT4);
            break;
        case 2:
            write_imageui(image, coords, patternUINT4);
            break;
        }
        coords.x += SizeX;
        if (coords.x >= AdjustedSizeX) return;
    }
}

// Stores `value` with a relaxed atomic store at all-SVM-devices scope:
// truncated to 32 bits through ptrUint when that pointer is non-null,
// otherwise as 64 bits through ptrUlong.
__attribute__((always_inline)) void
__amd_streamOpsWrite(
    __global atomic_uint* ptrUint,
    __global atomic_ulong* ptrUlong,
    ulong value)
{
    // The launch parameters for this shader is a 1 grid work-item

    // 32-bit write
    if (ptrUint) {
        atomic_store_explicit(ptrUint, (uint)value, memory_order_relaxed, memory_scope_all_svm_devices);
    }
    // 64-bit write
    else {
        atomic_store_explicit(ptrUlong, value, memory_order_relaxed, memory_scope_all_svm_devices);
    }
}

// Spins until the masked memory value satisfies a condition against `value`.
//
// The location is read with relaxed atomic loads at all-SVM-devices scope
// (32-bit through ptrUint when non-null, else 64-bit through ptrUlong), and
// __builtin_amdgcn_s_sleep(1) is called between polls. With m = load & mask,
// compareOp selects the condition that ends the wait:
//   0 (GEQ)  m >= value
//   1 (EQ)   m == value
//   2 (AND)  (m & value) != 0
//   3 (NOR)  (m | value) != all ones
// Any other compareOp returns immediately.
__attribute__((always_inline)) void
__amd_streamOpsWait(
    __global atomic_uint* ptrUint,
    __global atomic_ulong* ptrUlong,
    ulong value, ulong compareOp, ulong mask)
{
    // The launch parameters for this shader is a 1 grid work-item

    switch (compareOp) {
        case 0: //GEQ
            // NOTE(review): the (int)/(long) casts are compared against an
            // unsigned operand, so the usual arithmetic conversions make
            // both comparisons unsigned - confirm whether a signed compare
            // was intended.
            if (ptrUint) {
                while ((int)(atomic_load_explicit(ptrUint, memory_order_relaxed, memory_scope_all_svm_devices) & (uint)mask) < (uint)value) {
                    __builtin_amdgcn_s_sleep(1);
                }
            }
            else {
                while ((long)(atomic_load_explicit(ptrUlong, memory_order_relaxed, memory_scope_all_svm_devices) & mask) < value) {
                    __builtin_amdgcn_s_sleep(1);
                }
            }
            break;
        case 1: // EQ
            if (ptrUint) {
                while ((atomic_load_explicit(ptrUint, memory_order_relaxed, memory_scope_all_svm_devices) & (uint)mask) != (uint)value) {
                    __builtin_amdgcn_s_sleep(1);
                }
            }
            else {
                while ((atomic_load_explicit(ptrUlong, memory_order_relaxed, memory_scope_all_svm_devices) & mask) != value) {
                    __builtin_amdgcn_s_sleep(1);
                }
            }
            break;
        case 2: //AND
            if (ptrUint) {
                while (!((atomic_load_explicit(ptrUint, memory_order_relaxed, memory_scope_all_svm_devices) & (uint)mask) & (uint)value)) {
                    __builtin_amdgcn_s_sleep(1);
                }
            }
            else {
                while (!((atomic_load_explicit(ptrUlong, memory_order_relaxed, memory_scope_all_svm_devices) & mask) & value)) {
                    __builtin_amdgcn_s_sleep(1);
                }
            }
            break;
        case 3: //NOR
            if (ptrUint) {
                while (((atomic_load_explicit(ptrUint, memory_order_relaxed, memory_scope_all_svm_devices) & (uint)mask) | (uint)value) == ~0U) {
                    __builtin_amdgcn_s_sleep(1);
                }
            }
            else {
                while (((atomic_load_explicit(ptrUlong, memory_order_relaxed, memory_scope_all_svm_devices) & mask) | value) == ~0UL) {
                    __builtin_amdgcn_s_sleep(1);
                }
            }
            break;
    }
}
#endif

ROCm-Device-Libs-rocm-5.0.0/opencl/src/misc/asqf.cl000066400000000000000000000020621415221260100216040ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open
Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "ockl.h"

// Returns CLK_LOCAL_MEM_FENCE when the OCKL is_local query reports true for
// the pointer, CLK_GLOBAL_MEM_FENCE otherwise.
__attribute__((overloadable, always_inline, const)) cl_mem_fence_flags
get_fence(void *a)
{
    if (OCKL_MANGLE_T(is_local,addr)(a)) {
        return CLK_LOCAL_MEM_FENCE;
    }
    return CLK_GLOBAL_MEM_FENCE;
}

// Overload of get_fence for pointers to const data; same result rule.
__attribute__((overloadable, always_inline, const)) cl_mem_fence_flags
get_fence(const void *a)
{
    if (OCKL_MANGLE_T(is_local,addr)(a)) {
        return CLK_LOCAL_MEM_FENCE;
    }
    return CLK_GLOBAL_MEM_FENCE;
}

// Forwards to the OCKL to-global conversion and returns its result.
__attribute__((always_inline, const)) __global void *
__to_global(void *a)
{
    __global void *converted = OCKL_MANGLE_T(to,global)(a);
    return converted;
}

// Forwards to the OCKL to-local conversion and returns its result.
__attribute__((always_inline, const)) __local void *
__to_local(void *a)
{
    __local void *converted = OCKL_MANGLE_T(to,local)(a);
    return converted;
}

// Forwards to the OCKL to-private conversion and returns its result.
__attribute__((always_inline, const)) __private void *
__to_private(void *a)
{
    __private void *converted = OCKL_MANGLE_T(to,private)(a);
    return converted;
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/misc/atom.cl000066400000000000000000000256301415221260100216200ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable

#define ATTR __attribute__((overloadable))

// Cast away volatile before calling clang builtin
// (VOLATILE expands to nothing, so every "(VOLATILE A atomic_T *)p" cast
// below yields a non-volatile pointer.)
#define VOLATILE

// Per-type tables used by the 2.0 generators further down. float and double
// are routed through same-sized integer atomics:
//   AC_<T>(X)  converts an argument to the form passed to the builtin
//   RC_<T>(X)  converts the builtin's result back to T
//   AT_<T>     atomic type the object pointer is cast to
//   ET_<T>     element type the "expected" pointer is cast to
#define AC_int(X) X
#define AC_uint(X) X
#define AC_long(X) X
#define AC_ulong(X) X
#define AC_intptr_t(X) X
#define AC_uintptr_t(X) X
#define AC_size_t(X) X
#define AC_ptrdiff_t(X) X
#define AC_float(X) as_int(X)
#define AC_double(X) as_long(X)

#define RC_int(X) X
#define RC_uint(X) X
#define RC_long(X) X
#define RC_ulong(X) X
#define RC_intptr_t(X) X
#define RC_uintptr_t(X) X
#define RC_size_t(X) X
#define RC_ptrdiff_t(X) X
#define RC_float(X) as_float(X)
#define RC_double(X) as_double(X)

#define AT_int atomic_int
#define AT_uint atomic_uint
#define AT_long atomic_long
#define AT_ulong atomic_ulong
#define AT_intptr_t atomic_intptr_t
#define AT_uintptr_t atomic_uintptr_t
#define AT_size_t atomic_size_t
#define AT_ptrdiff_t atomic_ptrdiff_t
#define AT_float atomic_int
#define AT_double atomic_long

#define ET_int int
#define ET_uint uint
#define ET_long long
#define ET_ulong ulong
#define ET_intptr_t intptr_t
#define ET_uintptr_t uintptr_t
#define ET_size_t size_t
#define ET_ptrdiff_t ptrdiff_t
#define ET_float int
#define ET_double long

// Memory order and scope used by every atom_* / atomic_* 1.2-style function
// generated in the first half of this file.
#define OCL12_MEMORY_ORDER memory_order_relaxed
#define OCL12_MEMORY_SCOPE memory_scope_device

// inc/dec are implemented as fetch_add/fetch_sub of 1 (see the second
// GEN1/GEN2 definitions below).
#define F_inc __opencl_atomic_fetch_add
#define F_dec __opencl_atomic_fetch_sub

// extension and 1.2 functions
// GEN1 emits atom_<O> and GEN2 emits atomic_<O> for element type T in
// address space A (A may be empty).
#define GEN1(T,A,O) \
ATTR T \
atom_##O(volatile A T *p, T v) \
{ \
    return __opencl_atomic_fetch_##O((VOLATILE A atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
}

#define GEN2(T,A,O) \
ATTR T \
atomic_##O(volatile A T *p, T v) \
{ \
    return __opencl_atomic_fetch_##O((VOLATILE A atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
}

// OPSA: every operation for one (generator, type, address space).
#define OPSA(F,T,A) \
    F(T,A,add) \
    F(T,A,sub) \
    F(T,A,max) \
    F(T,A,min) \
    F(T,A,and) \
    F(T,A,or) \
    F(T,A,xor)

// OPS: OPSA for the __local, __global and unqualified address spaces.
#define OPS(F,T) \
    OPSA(F,T,__local) \
    OPSA(F,T,__global) \
    OPSA(F,T,)

// ALL: atom_* for int, uint, long and ulong; atomic_* for int and uint only.
// ALL() is reused below after GEN1/GEN2/OPSA/OPS have been redefined.
#define ALL() \
    OPS(GEN1,int) \
    OPS(GEN2,int) \
    OPS(GEN1,uint) \
    OPS(GEN2,uint) \
    OPS(GEN1,long) \
    OPS(GEN1,ulong)

ALL()

// Handle inc and dec
#undef GEN1
#undef GEN2
#undef OPSA

#define OPSA(F,T,A) \
    F(T,A,inc) \
    F(T,A,dec)

#define GEN1(T,A,O) \
ATTR T \
atom_##O(volatile A T *p) \
{ \
    return F_##O((VOLATILE A atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
}

#define GEN2(T,A,O) \
ATTR T \
atomic_##O(volatile A T *p) \
{ \
    return F_##O((VOLATILE A atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
}

ALL()

// Handle xchg
#undef GEN1
#undef GEN2
#undef OPSA
#undef OPS

#define GEN1(T,A) \
ATTR T \
atom_xchg(volatile A T *p, T v) \
{ \
    return __opencl_atomic_exchange((VOLATILE A atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
}

#define GEN2(T,A) \
ATTR T \
atomic_xchg(volatile A T *p, T v) \
{ \
    return __opencl_atomic_exchange((VOLATILE A atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
}

// From here on OPS passes only (type, address space) to the generator.
// NOTE: the last line of this definition ends with a line continuation, so
// the blank line after it is what terminates the macro and must be kept.
#define OPS(F,T) \
    F(T,__local) \
    F(T,__global) \
    F(T,) \

ALL()

// float atomic_xchg, implemented as an int exchange on the bit pattern.
#define G(A) \
ATTR float \
atomic_xchg(volatile A float *p, float v) \
{ \
    return as_float(__opencl_atomic_exchange((VOLATILE A atomic_int *)p, as_int(v), OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE)); \
}

G(__local)
G(__global)
G()

// Handle cmpxchg
#undef GEN1
#undef GEN2
#undef G

// Both generators return `e` after the strong compare-exchange and ignore
// the builtin's boolean result.
#define GEN1(T,A) \
ATTR T \
atom_cmpxchg(volatile A T *p, T e, T d) \
{ \
    __opencl_atomic_compare_exchange_strong((VOLATILE A atomic_##T *)p, &e, d, OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
    return e; \
}

#define GEN2(T,A) \
ATTR T \
atomic_cmpxchg(volatile A T *p, T e, T d) \
{ \
    __opencl_atomic_compare_exchange_strong((VOLATILE A atomic_##T *)p, &e, d, OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
    return e; \
}

ALL()

#undef GEN1
#undef GEN2
#undef ALL

// 2.0 functions

// EXPLICIT_ASPACES is forced off here, so every "#if defined
// EXPLICIT_ASPACES" below takes its #else branch and only the
// unqualified-address-space overloads are generated.
#undef EXPLICIT_ASPACES

// Each GEN*A generator below takes (address space A, type T). Generators
// with several overloads emit the implicit form (seq_cst order, device
// scope), the explicit-order form (device scope) and the explicit-order,
// explicit-scope form.

// atomic_init
#define GENIA(A,T) \
ATTR void \
atomic_init(volatile A atomic_##T *p, T v) \
{ \
    __opencl_atomic_init((VOLATILE A atomic_##T *)p, v); \
}

// atomic_store / atomic_store_explicit
#define GENSA(A,T) \
ATTR void \
atomic_store(volatile A atomic_##T *p, T v) \
{ \
    __opencl_atomic_store((VOLATILE A atomic_##T *)p, v, memory_order_seq_cst, memory_scope_device); \
} \
 \
ATTR void \
atomic_store_explicit(volatile A atomic_##T *p, T v, memory_order o) \
{ \
    __opencl_atomic_store((VOLATILE A atomic_##T *)p, v, o, memory_scope_device); \
} \
 \
ATTR void \
atomic_store_explicit(volatile A atomic_##T *p, T v, memory_order o, memory_scope s) \
{ \
    __opencl_atomic_store((VOLATILE A atomic_##T *)p, v, o, s); \
}

// atomic_load / atomic_load_explicit
#define GENLA(A,T) \
ATTR T \
atomic_load(volatile A atomic_##T *p) \
{ \
    return __opencl_atomic_load((VOLATILE A atomic_##T *)p, memory_order_seq_cst, memory_scope_device); \
} \
 \
ATTR T \
atomic_load_explicit(volatile A atomic_##T *p, memory_order o) \
{ \
    return __opencl_atomic_load((VOLATILE A atomic_##T *)p, o, memory_scope_device); \
} \
 \
ATTR T \
atomic_load_explicit(volatile A atomic_##T *p, memory_order o, memory_scope s) \
{ \
    return __opencl_atomic_load((VOLATILE A atomic_##T *)p, o, s); \
}

// atomic_exchange / atomic_exchange_explicit (uses the AC_/RC_/AT_ tables,
// so float and double go through integer atomics)
#define GENXA(A,T) \
ATTR T \
atomic_exchange(volatile A atomic_##T *p, T v) \
{ \
    return RC_##T(__opencl_atomic_exchange((VOLATILE A AT_##T *)p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \
} \
 \
ATTR T \
atomic_exchange_explicit(volatile A atomic_##T *p, T v, memory_order o) \
{ \
    return RC_##T(__opencl_atomic_exchange((VOLATILE A AT_##T *)p, AC_##T(v), o, memory_scope_device)); \
} \
 \
ATTR T \
atomic_exchange_explicit(volatile A atomic_##T *p, T v, memory_order o, memory_scope s) \
{ \
    return RC_##T(__opencl_atomic_exchange((VOLATILE A AT_##T *)p, AC_##T(v), o, s)); \
}

// atomic_compare_exchange_<K> / ..._explicit, K = strong or weak.
// AP is the address space of the atomic object, AE that of the expected
// value.
#define GENCXAA(AP,AE,T,K) \
ATTR bool \
atomic_compare_exchange_##K(volatile AP atomic_##T *p, AE T *e, T d) \
{ \
    return __opencl_atomic_compare_exchange_##K((VOLATILE AP AT_##T *) p, (AE ET_##T *) e, AC_##T(d), memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); \
} \
 \
ATTR bool \
atomic_compare_exchange_##K##_explicit(volatile AP atomic_##T *p, AE T *e, T d, memory_order os, memory_order of) \
{ \
    return __opencl_atomic_compare_exchange_##K((VOLATILE AP AT_##T *)p, (AE ET_##T *)e, AC_##T(d), os, of, memory_scope_device); \
} \
 \
ATTR bool \
atomic_compare_exchange_##K##_explicit(volatile AP atomic_##T *p, AE T *e, T d, memory_order os, memory_order of, memory_scope s) \
{ \
    return __opencl_atomic_compare_exchange_##K((VOLATILE AP AT_##T *) p, (AE ET_##T *)e, AC_##T(d), os, of, s); \
}

#if defined EXPLICIT_ASPACES
#define GENCXA(A,T,K) \
    GENCXAA(A,__global,T,K) \
    GENCXAA(A,__local,T,K) \
    GENCXAA(A,__private,T,K) \
    GENCXAA(A,,T,K)
#else
#define GENCXA(A,T,K) GENCXAA(A,,T,K)
#endif

// atomic_fetch_<O> / atomic_fetch_<O>_explicit
#define GENFOA(A,T,O) \
ATTR T \
atomic_fetch_##O(volatile A atomic_##T *p, T v) \
{ \
    return RC_##T(__opencl_atomic_fetch_##O((VOLATILE A AT_##T *)p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \
} \
 \
ATTR T \
atomic_fetch_##O##_explicit(volatile A atomic_##T *p, T v, memory_order o) \
{ \
    return RC_##T(__opencl_atomic_fetch_##O((VOLATILE A AT_##T *)p, AC_##T(v), o, memory_scope_device)); \
} \
 \
ATTR T \
atomic_fetch_##O##_explicit(volatile A atomic_##T *p, T v, memory_order o, memory_scope s) \
{ \
    return RC_##T(__opencl_atomic_fetch_##O((VOLATILE A AT_##T *) p, AC_##T(v), o, s)); \
}

// CXA: both compare-exchange strengths for one (address space, type).
#define CXA(A,T) \
    GENCXA(A,T,strong) \
    GENCXA(A,T,weak)

// FOA: every fetch-op for one (address space, type).
// NOTE: the last line of this definition ends with a line continuation, so
// the blank line after it is what terminates the macro and must be kept.
#define FOA(A,T) \
    GENFOA(A,T,add) \
    GENFOA(A,T,sub) \
    GENFOA(A,T,or) \
    GENFOA(A,T,xor) \
    GENFOA(A,T,and) \
    GENFOA(A,T,min) \
    GENFOA(A,T,max) \

// ALLIA: apply generator F to the four integer types.
#define ALLIA(A,F) \
    F(A,int) \
    F(A,uint) \
    F(A,long) \
    F(A,ulong)

// ALLA: ALLIA plus float and double.
#define ALLA(A,F) \
    ALLIA(A,F) \
    F(A,float) \
    F(A,double)

#if defined EXPLICIT_ASPACES
#define ALLI(F) \
    ALLIA(__global, F) \
    ALLIA(__local, F) \
    ALLIA(, F)
#else
#define ALLI(F) ALLIA(, F)
#endif

#if defined EXPLICIT_ASPACES
#define ALL(F) \
    ALLA(__global,F) \
    ALLA(__local, F) \
    ALLA(, F)
#else
#define ALL(F) ALLA(, F)
#endif

// Instantiate init, load and store for all six types.
ALL(GENIA)
ALL(GENLA)
ALL(GENSA)
ALL(GENXA) ALL(CXA) ALLI(FOA) // These are needed for uintptr_t #define UIP(A) \ ATTR ulong \ atomic_fetch_add(volatile A atomic_ulong *p, long v) \ { \ return __opencl_atomic_fetch_add((VOLATILE A atomic_ulong *)p, (ulong)v, memory_order_seq_cst, memory_scope_device); \ } \ \ ATTR ulong \ atomic_fetch_add_explicit(volatile A atomic_ulong *p, long v, memory_order o) \ { \ return __opencl_atomic_fetch_add((VOLATILE A atomic_ulong *)p, (ulong)v, o, memory_scope_device); \ } \ \ ATTR ulong \ atomic_fetch_add_explicit(volatile A atomic_ulong *p, long v, memory_order o, memory_scope s) \ { \ return __opencl_atomic_fetch_add((VOLATILE A atomic_ulong *)p, (ulong)v, o, s); \ } \ \ ATTR ulong \ atomic_fetch_sub(volatile A atomic_ulong *p, long v) \ { \ return __opencl_atomic_fetch_sub((VOLATILE A atomic_ulong *)p, (ulong)v, memory_order_seq_cst, memory_scope_device); \ } \ \ ATTR ulong \ atomic_fetch_sub_explicit(volatile A atomic_ulong *p, long v, memory_order o) \ { \ return __opencl_atomic_fetch_sub((VOLATILE A atomic_ulong *)p, (ulong)v, o, memory_scope_device); \ } \ \ ATTR ulong \ atomic_fetch_sub_explicit(volatile A atomic_ulong *p, long v, memory_order o, memory_scope s) \ { \ return __opencl_atomic_fetch_sub((VOLATILE A atomic_ulong *)p, (ulong)v, o, s); \ } #if defined EXPLICIT_ASPACES UIP(__global) UIP(__local) #endif UIP() // flag functions #define FLG(A) \ ATTR bool \ atomic_flag_test_and_set(volatile A atomic_flag *p) \ { \ return __opencl_atomic_exchange((VOLATILE A atomic_int *)p, 1, memory_order_seq_cst, memory_scope_device); \ } \ \ ATTR bool \ atomic_flag_test_and_set_explicit(volatile A atomic_flag *p, memory_order o) \ { \ return __opencl_atomic_exchange((VOLATILE A atomic_int *)p, 1, o, memory_scope_device); \ } \ \ ATTR bool \ atomic_flag_test_and_set_explicit(volatile A atomic_flag *p, memory_order o, memory_scope s) \ { \ return __opencl_atomic_exchange((VOLATILE A atomic_int *)p, 1, o, s); \ } \ \ ATTR void \ atomic_flag_clear(volatile A 
atomic_flag *p) \ { \ __opencl_atomic_store((VOLATILE A atomic_int *)p, 0, memory_order_seq_cst, memory_scope_device); \ } \ \ ATTR void \ atomic_flag_clear_explicit(volatile A atomic_flag *p, memory_order o) \ { \ __opencl_atomic_store((VOLATILE A atomic_int *)p, 0, o, memory_scope_device); \ } \ \ ATTR void \ atomic_flag_clear_explicit(volatile A atomic_flag *p, memory_order o, memory_scope s) \ { \ __opencl_atomic_store((VOLATILE A atomic_int *)p, 0, o, s); \ } \ #if defined EXPLICIT_ASPACES FLG(__global) FLG(__local) #endif FLG() ROCm-Device-Libs-rocm-5.0.0/opencl/src/misc/awif.cl000066400000000000000000000123341415221260100216030ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #include "irif.h" #pragma OPENCL EXTENSION cl_khr_subgroups : enable __attribute__((overloadable)) void mem_fence(cl_mem_fence_flags flags) { atomic_work_item_fence(flags, memory_order_acq_rel, memory_scope_work_group); } __attribute__((overloadable)) void read_mem_fence(cl_mem_fence_flags flags) { atomic_work_item_fence(flags, memory_order_acquire, memory_scope_work_group); } __attribute__((overloadable)) void write_mem_fence(cl_mem_fence_flags flags) { atomic_work_item_fence(flags, memory_order_release, memory_scope_work_group); } #if !defined LOW_LEVEL_APPROACH __attribute__((overloadable)) void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope) { // We're tying global-happens-before and local-happens-before together as does HSA if (order != memory_order_relaxed) { switch (scope) { case memory_scope_work_item: break; case memory_scope_sub_group: switch (order) { case memory_order_relaxed: break; case memory_order_acquire: __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront"); 
break; case memory_order_release: __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront"); break; case memory_order_acq_rel: __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "wavefront"); break; case memory_order_seq_cst: __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "wavefront"); break; } break; case memory_scope_work_group: switch (order) { case memory_order_relaxed: break; case memory_order_acquire: __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); break; case memory_order_release: __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); break; case memory_order_acq_rel: __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup"); break; case memory_order_seq_cst: __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); break; } break; case memory_scope_device: switch (order) { case memory_order_relaxed: break; case memory_order_acquire: __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); break; case memory_order_release: __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); break; case memory_order_acq_rel: __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); break; case memory_order_seq_cst: __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); break; } break; case memory_scope_all_svm_devices: switch (order) { case memory_order_relaxed: break; case memory_order_acquire: __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, ""); break; case memory_order_release: __builtin_amdgcn_fence(__ATOMIC_RELEASE, ""); break; case memory_order_acq_rel: __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, ""); break; case memory_order_seq_cst: __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); break; } break; } } } #else // LGKMC (LDS, GDS, Konstant, Message) is 4 bits // EXPC (Export) is 3 bits // VMC (VMem) is 4 bits #define LGKMC_MAX 0xf #define EXPC_MAX 0x7 #define VMC_MAX 0xf #define WAITCNT_IMM(LGKMC, EXPC, VMC) ((LGKMC << 8) | (EXPC << 4) | VMC) __attribute__((target("vi-insts,ci-insts"))) __attribute__((overloadable)) void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope) { if (order 
!= memory_order_relaxed) { // Strip CLK_IMAGE_MEM_FENCE flags &= CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE; if (flags == CLK_LOCAL_MEM_FENCE) { __builtin_amdgcn_s_waitcnt(WAITCNT_IMM(0, EXPC_MAX, VMC_MAX)); } else if (flags == CLK_GLOBAL_MEM_FENCE) { if (order != memory_order_acquire) { __builtin_amdgcn_s_waitcnt(WAITCNT_IMM(LGKMC_MAX, EXPC_MAX, 0)); __builtin_amdgcn_s_dcache_wb(); } if ((scope == memory_scope_device) | (scope == memory_scope_all_svm_devices)) { if (order != memory_order_release) { __builtin_amdgcn_buffer_wbinvl1_vol(); __builtin_amdgcn_s_dcache_inv_vol(); } } } else if (flags == (CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE)) { __builtin_amdgcn_s_waitcnt(order == memory_order_acquire ? WAITCNT_IMM(0, EXPC_MAX, VMC_MAX) : WAITCNT_IMM(0, EXPC_MAX, 0)); if (order != memory_order_acquire) __builtin_amdgcn_s_dcache_wb(); if ((scope == memory_scope_device) | (scope == memory_scope_all_svm_devices)) { if (order != memory_order_release) { __builtin_amdgcn_buffer_wbinvl1_vol(); __builtin_amdgcn_s_dcache_inv_vol(); } } } } } #endif // LOW_LEVEL_APPROACH ROCm-Device-Libs-rocm-5.0.0/opencl/src/misc/cdhx.cl000066400000000000000000000104321415221260100216000ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #define ATTR __attribute__((const)) // TODO - remove these when these conversions are ordinary LLVM conversions ATTR uint __cvt_f16_rtn_f32(float a) { uint u = as_uint(a); uint um = u & 0x7fffffU; int e = (int)((u >> 23) & 0xff) - 127 + 15; int ds = max(0, min(19, 1 - e)); uint t = (um | (e > -112 ? 0x800000 : 0)) << (19 - ds); uint s = (u >> 16) & 0x8000; uint m = (u >> 13) & 0x3ff; uint i = 0x7c00 | m | (um ? 
0x0200 : 0); uint n = ((uint)e << 10) | m; uint d = (0x400 | m) >> ds; uint v = e < 1 ? d : n; v += (s >> 15) & (t > 0U); uint j = 0x7bff + (s >> 15); v = e > 30 ? j : v; v = e == 143 ? i : v; return s | v; } ATTR uint __cvt_f16_rtp_f32(float a) { uint u = as_uint(a); uint um = u & 0x7fffffU; int e = (int)((u >> 23) & 0xff) - 127 + 15; int ds = max(0, min(19, 1 - e)); uint t = (um | (e > -112 ? 0x800000 : 0)) << (19 - ds); uint s = (u >> 16) & 0x8000; uint m = (u >> 13) & 0x3ff; uint i = 0x7c00 | m | (um ? 0x0200 : 0); uint n = ((uint)e << 10) | m; uint d = (0x400 | m) >> ds; uint v = e < 1 ? d : n; v += ~(s >> 15) & (t > 0U); uint j = 0x7c00 - (s >> 15); v = e > 30 ? j : v; v = e == 143 ? i : v; return s | v; } ATTR uint __cvt_f16_rtz_f32(float a) { uint u = as_uint(a); uint um = u & 0x7fffffU; int e = (int)((u >> 23) & 0xff) - 127 + 15; uint s = (u >> 16) & 0x8000; uint m = (u >> 13) & 0x3ff; uint i = 0x7c00 | m | (um ? 0x0200 : 0); uint n = ((uint)e << 10) | m; uint d = (0x400 | m) >> (1 - e); uint v = e > 30 ? 0x7bff : n; v = e == 143 ? i : v; v = e < 1 ? d : v; v = e < -10 ? 0 : v; return s | v; } ATTR uint __cvt_f16_rte_f64(double a) { ulong u = as_ulong(a); uint uh = u >> 32; int e = (int)((uh >> 20) & 0x7ff) - 1023 + 15; uint m = ((uh >> 8) & 0xffe) | (((uh & 0x1ff) | (uint)u) != 0); uint i = 0x7c00 | (m != 0 ? 0x0200 : 0); uint n = ((uint)e << 12) | m; uint s = (uh >> 16) & 0x8000; int b = clamp(1-e, 0, 13); uint d = (0x1000 | m) >> b; d |= (d << b) != (0x1000 | m); uint v = e < 1 ? d : n; v = (v >> 2) + ((v & 0x7) == 3 | (v & 0x7) > 5); v = e > 30 ? 0x7c00 : v; v = e == 1039 ? i : v; return s | v; } ATTR uint __cvt_f16_rtn_f64(double a) { ulong u = as_ulong(a); uint uh = u >> 32; int e = (int)((uh >> 20) & 0x7ff) - 1023 + 15; uint m = ((uh >> 9) & 0x7fe) | (((uh & 0x3ff) | (uint)u) != 0); uint i = 0x7c00 | (m != 0 ? 
0x0200 : 0); uint n = ((uint)e << 11) | m; uint s = (uh >> 16) & 0x8000; uint vp = 0x7bff + (s >> 15); int b = clamp(1-e, 0, 12); uint d = (0x800 | m) >> b; d |= (d << b) != (0x800 | m); uint v = e < 1 ? d : n; v = (v >> 1) + (v & 1 & (s >> 15)); v = e > 30 ? vp : v; v = e == 1039 ? i : v; v = (e == -1008 & m == 0) ? 0 : v; return s | v; } ATTR uint __cvt_f16_rtp_f64(double a) { ulong u = as_ulong(a); uint uh = u >> 32; int e = (int)((uh >> 20) & 0x7ff) - 1023 + 15; uint m = ((uh >> 9) & 0x7fe) | (((uh & 0x3ff) | (uint)u) != 0); uint i = 0x7c00 | (m != 0 ? 0x0200 : 0); uint n = ((uint)e << 11) | m; uint s = (uh >> 16) & 0x8000; uint vp = 0x7c00 - (s >> 15); int b = clamp(1-e, 0, 12); uint d = (0x800 | m) >> b; d |= (d << b) != (0x800 | m); uint v = e < 1 ? d : n; v = (v >> 1) + (v & 1 & ((s >> 15) ^ 1)); v = e > 30 ? vp : v; v = e == 1039 ? i : v; v = (e == -1008 & m == 0) ? 0 : v; return s | v; } ATTR uint __cvt_f16_rtz_f64(double a) { ulong u = as_ulong(a); uint uh = u >> 32; uint m = ((uh >> 9) & 0x7fe) | (((uh & 0x3ff) | (uint)u) != 0); int e = (int)((uh >> 20) & 0x7ff) - 1023 + 15; uint i = 0x7c00 | (m != 0 ? 0x0200 : 0); m >>= 1; uint d = (0x400 | m) >> (1 - e); uint n = ((uint)e << 10) | m; uint v = e > 30 ? 0x7bff : n; v = e == 1039 ? i : v; v = e < 1 ? d : v; v = e < -10 ? 0 : v; return ((uh >> 16) & 0x8000) | v; } ROCm-Device-Libs-rocm-5.0.0/opencl/src/misc/conversions.cl000066400000000000000000002416631415221260100232360ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable extern __attribute__((const)) uint __cvt_f16_rtn_f32(float); extern __attribute__((const)) uint __cvt_f16_rtp_f32(float); extern __attribute__((const)) uint __cvt_f16_rtz_f32(float); extern __attribute__((const)) uint __cvt_f16_rtn_f64(double); extern __attribute__((const)) uint __cvt_f16_rtp_f64(double); extern __attribute__((const)) uint __cvt_f16_rtz_f64(double); #define ATTR __attribute__((overloadable, const)) #define IATTR __attribute__((const)) #define AATTR(S) __attribute__((overloadable, const, alias(S))) #define _C(A,B) A##B #define C(A,B) _C(A,B) #if !defined USE_CLP #define NOPN(N,TO,TI,S,R) ATTR TO##N convert_##TO##N##S##R(TO##N x) { return x; } #define NOP(TO,TI,S,R) \ NOPN(16,TO,TI,S,R) \ NOPN(8,TO,TI,S,R) \ NOPN(4,TO,TI,S,R) \ NOPN(3,TO,TI,S,R) \ NOPN(2,TO,TI,S,R) \ NOPN(,TO,TI,S,R) #define XLIST x #define XLIST2 x.s0, x.s1 #define XLIST3 XLIST2, x.s2 #define XLIST4 XLIST3, x.s3 #define XLIST8 XLIST4, x.s4, x.s5, x.s6, x.s7 #define XLIST16 XLIST8, x.s8, x.s9, x.sa, x.sb, x.sc, x.sd, x.se, x.sf #define YLIST y #define YLIST2 y.s0, y.s1 #define YLIST3 YLIST2, y.s2 #define YLIST4 YLIST3, y.s3 #define YLIST8 YLIST4, y.s4, y.s5, y.s6, y.s7 #define YLIST16 YLIST8, y.s8, y.s9, y.sa, y.sb, y.sc, y.sd, y.se, y.sf #define CASTN(N,TO,TI,S,R) ATTR TO##N convert_##TO##N##S##R(TI##N x) { return (TO##N)(XLIST##N); } #define CAST(TO,TI,S,R) \ CASTN(16,TO,TI,S,R) \ CASTN(8,TO,TI,S,R) \ CASTN(4,TO,TI,S,R) \ CASTN(3,TO,TI,S,R) \ CASTN(2,TO,TI,S,R) \ CASTN(,TO,TI,S,R) #else #define NOP(TO,TI,S,R) #define CAST(TO,TI,S,R) #endif #define char_short_lb CHAR_MIN #define char_short_ub CHAR_MAX #define char_int_lb CHAR_MIN #define char_int_ub CHAR_MAX #define char_long_lb CHAR_MIN #define char_long_ub CHAR_MAX #define char_float_lb CHAR_MIN #define char_float_ub CHAR_MAX #define char_double_lb CHAR_MIN #define char_double_ub CHAR_MAX #define 
char_half_lb CHAR_MIN #define char_half_ub CHAR_MAX #define uchar_short_lb 0 #define uchar_short_ub UCHAR_MAX #define uchar_int_lb 0 #define uchar_int_ub UCHAR_MAX #define uchar_long_lb 0 #define uchar_long_ub UCHAR_MAX #define uchar_float_lb 0 #define uchar_float_ub UCHAR_MAX #define uchar_double_lb 0 #define uchar_double_ub UCHAR_MAX #define uchar_half_lb 0 #define uchar_half_ub UCHAR_MAX #define short_int_lb SHRT_MIN #define short_int_ub SHRT_MAX #define short_long_lb SHRT_MIN #define short_long_ub SHRT_MAX #define short_float_lb SHRT_MIN #define short_float_ub SHRT_MAX #define short_double_lb SHRT_MIN #define short_double_ub SHRT_MAX #define short_half_lb -HALF_MAX #define short_half_ub HALF_MAX #define ushort_int_lb 0 #define ushort_int_ub USHRT_MAX #define ushort_long_lb 0 #define ushort_long_ub USHRT_MAX #define ushort_float_lb 0 #define ushort_float_ub USHRT_MAX #define ushort_double_lb 0 #define ushort_double_ub USHRT_MAX #define ushort_half_lb 0 #define ushort_half_ub HALF_MAX #define int_long_lb INT_MIN #define int_long_ub INT_MAX #define int_float_lb INT_MIN #define int_float_ub 0x7fffff80 #define int_double_lb INT_MIN #define int_double_ub INT_MAX #define int_half_lb -HALF_MAX #define int_half_ub HALF_MAX #define uint_long_lb 0 #define uint_long_ub UINT_MAX #define uint_float_lb 0 #define uint_float_ub 0xffffff00U #define uint_double_lb 0 #define uint_double_ub UINT_MAX #define uint_half_lb 0 #define uint_half_ub HALF_MAX #define long_float_lb LONG_MIN #define long_float_ub 0x7fffff8000000000L #define long_double_lb LONG_MIN #define long_double_ub 0x7ffffffffffffc00L #define long_half_lb -HALF_MAX #define long_half_ub HALF_MAX #define ulong_float_lb 0 #define ulong_float_ub 0xffffff0000000000UL #define ulong_double_lb 0 #define ulong_double_ub 0xfffffffffffff800UL #define ulong_half_lb 0 #define ulong_half_ub HALF_MAX #define char_minbnd CHAR_MAX #define uchar_minbnd UCHAR_MAX #define short_minbnd SHRT_MAX #define ushort_minbnd USHRT_MAX #define 
int_minbnd INT_MAX #define uint_minbnd UINT_MAX #define long_minbnd LONG_MAX #define ulong_minbnd ULONG_MAX #define char_maxbnd CHAR_MIN #define uchar_maxbnd 0 #define short_maxbnd SHRT_MIN #define ushort_maxbnd 0 #define int_maxbnd INT_MIN #define uint_maxbnd 0 #define long_maxbnd LONG_MIN #define ulong_maxbnd 0 #define MMN(F,N,TO,TI,S,R) \ ATTR TO##N \ convert_##TO##N##S##R(TI##N x) \ { \ return convert_##TO##N(F(x, (TI##N) TO##_##F##bnd)); \ } #define MIN(TO,TI,S,R) \ MMN(min,16,TO,TI,S,R) \ MMN(min,8,TO,TI,S,R) \ MMN(min,4,TO,TI,S,R) \ MMN(min,3,TO,TI,S,R) \ MMN(min,2,TO,TI,S,R) \ MMN(min,,TO,TI,S,R) #define MAX(TO,TI,S,R) \ MMN(max,16,TO,TI,S,R) \ MMN(max,8,TO,TI,S,R) \ MMN(max,4,TO,TI,S,R) \ MMN(max,3,TO,TI,S,R) \ MMN(max,2,TO,TI,S,R) \ MMN(max,,TO,TI,S,R) #define CLAMPN(N,TO,TI,S,R) \ ATTR TO##N \ convert_##TO##N##S##R(TI##N x) \ { \ return convert_##TO##N(min(max(x, (TI##N) TO##_##TI##_lb), (TI##N) TO##_##TI##_ub)); \ } #define CLAMP(TO,TI,S,R) \ CLAMPN(16,TO,TI,S,R) \ CLAMPN(8,TO,TI,S,R) \ CLAMPN(4,TO,TI,S,R) \ CLAMPN(3,TO,TI,S,R) \ CLAMPN(2,TO,TI,S,R) \ CLAMPN(,TO,TI,S,R) #define F2IEN(E,N,TO,TI,S,R) \ ATTR TO##N \ convert_##TO##N##S##R(TI##N x) \ { \ return convert_##TO##N##_sat##E(x); \ } #define F2IE(E,TO,TI,S,R) \ F2IEN(E,16,TO,TI,S,R) \ F2IEN(E,8,TO,TI,S,R) \ F2IEN(E,4,TO,TI,S,R) \ F2IEN(E,3,TO,TI,S,R) \ F2IEN(E,2,TO,TI,S,R) \ F2IEN(E,,TO,TI,S,R) #define EF2I(TO,TI,S,R) F2IE(_rte,TO,TI,S,R) #define NF2I(TO,TI,S,R) F2IE(_rtn,TO,TI,S,R) #define PF2I(TO,TI,S,R) F2IE(_rtp,TO,TI,S,R) #define ZF2I(TO,TI,S,R) F2IE(_rtz,TO,TI,S,R) #define CLAMPFN(F,N,TO,TI,S,R) \ ATTR TO##N \ convert_##TO##N##S##R(TI##N x) \ { \ x = min(max(F(x), (TI##N) TO##_##TI##_lb), (TI##N) TO##_##TI##_ub); \ return (TO##N)(XLIST##N); \ } #define CLAMPF(F,TO,TI,S,R) \ CLAMPFN(F,16,TO,TI,S,R) \ CLAMPFN(F,8,TO,TI,S,R) \ CLAMPFN(F,4,TO,TI,S,R) \ CLAMPFN(F,3,TO,TI,S,R) \ CLAMPFN(F,2,TO,TI,S,R) \ CLAMPFN(F,,TO,TI,S,R) #define ECLAMP(TO,TI,S,R) CLAMPF(rint,TO,TI,S,R) #define NCLAMP(TO,TI,S,R) 
CLAMPF(floor,TO,TI,S,R) #define PCLAMP(TO,TI,S,R) CLAMPF(ceil,TO,TI,S,R) #define ZCLAMP(TO,TI,S,R) CLAMPF(,TO,TI,S,R) #define SEL_(A,B,C) C ? B : A #define SEL_2(A,B,C) select(A,B,C) #define SEL_3(A,B,C) select(A,B,C) #define SEL_4(A,B,C) select(A,B,C) #define SEL_8(A,B,C) select(A,B,C) #define SEL_16(A,B,C) select(A,B,C) #define nou_short short #define nou_ushort short #define nou_int int #define nou_uint int #define nou_long long #define nou_ulong long #define CMP(N,TO,TI,X,OP,B) \ C(convert_,C(nou_##TO, N))(X OP (TI##N) TO##_##TI##_##B) #define CMP_(TO,TI,X,OP,B) (X OP (TI) TO##_##TI##_##B) #define CMP_2(TO,TI,X,OP,B) CMP(2,TO,TI,X,OP,B) #define CMP_3(TO,TI,X,OP,B) CMP(3,TO,TI,X,OP,B) #define CMP_4(TO,TI,X,OP,B) CMP(4,TO,TI,X,OP,B) #define CMP_8(TO,TI,X,OP,B) CMP(8,TO,TI,X,OP,B) #define CMP_16(TO,TI,X,OP,B) CMP(16,TO,TI,X,OP,B) #define CLAMP2FN(F,N,TO,TI,S,R) \ ATTR TO##N \ convert_##TO##N##S##R(TI##N x) \ { \ TI##N y = min(max(F(x), (TI##N) TO##_##TI##_lb), (TI##N) TO##_##TI##_ub); \ TO##N z = (TO##N)(YLIST##N); \ z = SEL_##N(z, (TO##N) TO##_minbnd, CMP_##N(TO,TI,x,>,ub)); \ return SEL_##N(z, (TO##N) TO##_maxbnd, CMP_##N(TO,TI,x,<,lb)); \ } #define CLAMP2F(F,TO,TI,S,R) \ CLAMP2FN(F,16,TO,TI,S,R) \ CLAMP2FN(F,8,TO,TI,S,R) \ CLAMP2FN(F,4,TO,TI,S,R) \ CLAMP2FN(F,3,TO,TI,S,R) \ CLAMP2FN(F,2,TO,TI,S,R) \ CLAMP2FN(F,,TO,TI,S,R) #define ECLAMP2(TO,TI,S,R) CLAMP2F(rint,TO,TI,S,R) #define NCLAMP2(TO,TI,S,R) CLAMP2F(floor,TO,TI,S,R) #define PCLAMP2(TO,TI,S,R) CLAMP2F(ceil,TO,TI,S,R) #define ZCLAMP2(TO,TI,S,R) CLAMP2F(,TO,TI,S,R) #define EXPAND2(TO,TI,S,R) \ ATTR TO##2 \ convert_##TO##2##S##R(TI##2 x) \ { \ return (TO##2)(convert_##TO##S##R(x.lo), \ convert_##TO##S##R(x.hi)); \ } #define EXPAND3(TO,TI,S,R) \ ATTR TO##3 \ convert_##TO##3##S##R(TI##3 x) \ { \ return (TO##3)(convert_##TO##2##S##R(x.s01), \ convert_##TO##S##R(x.s2)); \ } #define EXPAND4(TO,TI,S,R) \ ATTR TO##4 \ convert_##TO##4##S##R(TI##4 x) \ { \ return (TO##4)(convert_##TO##2##S##R(x.lo), \ 
convert_##TO##2##S##R(x.hi)); \ } #define EXPAND8(TO,TI,S,R) \ ATTR TO##8 \ convert_##TO##8##S##R(TI##8 x) \ { \ return (TO##8)(convert_##TO##4##S##R(x.lo), \ convert_##TO##4##S##R(x.hi)); \ } #define EXPAND16(TO,TI,S,R) \ ATTR TO##16 \ convert_##TO##16##S##R(TI##16 x) \ { \ return (TO##16)(convert_##TO##8##S##R(x.lo), \ convert_##TO##8##S##R(x.hi)); \ } #define EXPAND(TO,TI,S,R) \ EXPAND16(TO,TI,S,R) \ EXPAND8(TO,TI,S,R) \ EXPAND4(TO,TI,S,R) \ EXPAND3(TO,TI,S,R) \ EXPAND2(TO,TI,S,R) #define G_char_char(TO,TI,S,R) NOP(TO,TI,S,R) #define G_char_sat_char(TO,TI,S,R) NOP(TO,TI,S,R) #define G_char_sat_rte_char(TO,TI,S,R) NOP(TO,TI,S,R) #define G_char_sat_rtn_char(TO,TI,S,R) NOP(TO,TI,S,R) #define G_char_sat_rtp_char(TO,TI,S,R) NOP(TO,TI,S,R) #define G_char_sat_rtz_char(TO,TI,S,R) NOP(TO,TI,S,R) #define G_char_rte_char(TO,TI,S,R) NOP(TO,TI,S,R) #define G_char_rtn_char(TO,TI,S,R) NOP(TO,TI,S,R) #define G_char_rtp_char(TO,TI,S,R) NOP(TO,TI,S,R) #define G_char_rtz_char(TO,TI,S,R) NOP(TO,TI,S,R) #define G_char_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_sat_uchar(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rte_uchar(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtn_uchar(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtp_uchar(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtz_uchar(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_sat_short(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rte_short(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rtn_short(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rtp_short(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rtz_short(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_rte_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtp_short(TO,TI,S,R) 
CAST(TO,TI,S,R) #define G_char_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_sat_ushort(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rte_ushort(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtn_ushort(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtp_ushort(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtz_ushort(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_sat_int(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rte_int(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rtn_int(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rtp_int(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rtz_int(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_rte_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtn_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtp_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtz_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_sat_uint(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rte_uint(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtn_uint(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtp_uint(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtz_uint(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtn_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtp_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtz_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_sat_long(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rte_long(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rtn_long(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rtp_long(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_sat_rtz_long(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_char_rte_long(TO,TI,S,R) 
CAST(TO,TI,S,R) #define G_char_rtn_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtp_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtz_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_sat_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rte_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtn_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtp_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_sat_rtz_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_char_rte_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtn_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtp_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_rtz_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_char_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_char_sat_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_char_sat_rte_float(TO,TI,S,R) ECLAMP(TO,TI,S,R) #define G_char_sat_rtn_float(TO,TI,S,R) NCLAMP(TO,TI,S,R) #define G_char_sat_rtp_float(TO,TI,S,R) PCLAMP(TO,TI,S,R) #define G_char_sat_rtz_float(TO,TI,S,R) ZCLAMP(TO,TI,S,R) #define G_char_rte_float(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_char_rtn_float(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_char_rtp_float(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_char_rtz_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_char_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_char_sat_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_char_sat_rte_double(TO,TI,S,R) ECLAMP(TO,TI,S,R) #define G_char_sat_rtn_double(TO,TI,S,R) NCLAMP(TO,TI,S,R) #define G_char_sat_rtp_double(TO,TI,S,R) PCLAMP(TO,TI,S,R) #define G_char_sat_rtz_double(TO,TI,S,R) ZCLAMP(TO,TI,S,R) #define G_char_rte_double(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_char_rtn_double(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_char_rtp_double(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_char_rtz_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_char_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_char_sat_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_char_sat_rte_half(TO,TI,S,R) ECLAMP(TO,TI,S,R) #define G_char_sat_rtn_half(TO,TI,S,R) NCLAMP(TO,TI,S,R) 
#define G_char_sat_rtp_half(TO,TI,S,R) PCLAMP(TO,TI,S,R) #define G_char_sat_rtz_half(TO,TI,S,R) ZCLAMP(TO,TI,S,R) #define G_char_rte_half(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_char_rtn_half(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_char_rtp_half(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_char_rtz_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_uchar_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_sat_char(TO,TI,S,R) MAX(TO,TI,S,R) #define G_uchar_sat_rte_char(TO,TI,S,R) MAX(TO,TI,S,R) #define G_uchar_sat_rtn_char(TO,TI,S,R) MAX(TO,TI,S,R) #define G_uchar_sat_rtp_char(TO,TI,S,R) MAX(TO,TI,S,R) #define G_uchar_sat_rtz_char(TO,TI,S,R) MAX(TO,TI,S,R) #define G_uchar_rte_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_uchar(TO,TI,S,R) NOP(TO,TI,S,R) #define G_uchar_sat_uchar(TO,TI,S,R) NOP(TO,TI,S,R) #define G_uchar_sat_rte_uchar(TO,TI,S,R) NOP(TO,TI,S,R) #define G_uchar_sat_rtn_uchar(TO,TI,S,R) NOP(TO,TI,S,R) #define G_uchar_sat_rtp_uchar(TO,TI,S,R) NOP(TO,TI,S,R) #define G_uchar_sat_rtz_uchar(TO,TI,S,R) NOP(TO,TI,S,R) #define G_uchar_rte_uchar(TO,TI,S,R) NOP(TO,TI,S,R) #define G_uchar_rtn_uchar(TO,TI,S,R) NOP(TO,TI,S,R) #define G_uchar_rtp_uchar(TO,TI,S,R) NOP(TO,TI,S,R) #define G_uchar_rtz_uchar(TO,TI,S,R) NOP(TO,TI,S,R) #define G_uchar_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_sat_short(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rte_short(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rtn_short(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rtp_short(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rtz_short(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_rte_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtp_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define 
G_uchar_sat_ushort(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rte_ushort(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rtn_ushort(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rtp_ushort(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rtz_ushort(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_sat_int(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rte_int(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rtn_int(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rtp_int(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rtz_int(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_rte_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtn_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtp_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtz_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_sat_uint(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rte_uint(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rtn_uint(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rtp_uint(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rtz_uint(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtn_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtp_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtz_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_sat_long(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rte_long(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rtn_long(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rtp_long(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_sat_rtz_long(TO,TI,S,R) CLAMP(TO,TI,S,R) #define G_uchar_rte_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtn_long(TO,TI,S,R) CAST(TO,TI,S,R) #define 
G_uchar_rtp_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtz_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_sat_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rte_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rtn_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rtp_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_sat_rtz_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_uchar_rte_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtn_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtp_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_rtz_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_uchar_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_uchar_sat_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_uchar_sat_rte_float(TO,TI,S,R) ECLAMP(TO,TI,S,R) #define G_uchar_sat_rtn_float(TO,TI,S,R) NCLAMP(TO,TI,S,R) #define G_uchar_sat_rtp_float(TO,TI,S,R) PCLAMP(TO,TI,S,R) #define G_uchar_sat_rtz_float(TO,TI,S,R) ZCLAMP(TO,TI,S,R) #define G_uchar_rte_float(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_uchar_rtn_float(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_uchar_rtp_float(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_uchar_rtz_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_uchar_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_uchar_sat_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_uchar_sat_rte_double(TO,TI,S,R) ECLAMP(TO,TI,S,R) #define G_uchar_sat_rtn_double(TO,TI,S,R) NCLAMP(TO,TI,S,R) #define G_uchar_sat_rtp_double(TO,TI,S,R) PCLAMP(TO,TI,S,R) #define G_uchar_sat_rtz_double(TO,TI,S,R) ZCLAMP(TO,TI,S,R) #define G_uchar_rte_double(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_uchar_rtn_double(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_uchar_rtp_double(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_uchar_rtz_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_uchar_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_uchar_sat_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_uchar_sat_rte_half(TO,TI,S,R) ECLAMP(TO,TI,S,R) #define G_uchar_sat_rtn_half(TO,TI,S,R) NCLAMP(TO,TI,S,R) #define G_uchar_sat_rtp_half(TO,TI,S,R) 
PCLAMP(TO,TI,S,R) #define G_uchar_sat_rtz_half(TO,TI,S,R) ZCLAMP(TO,TI,S,R) #define G_uchar_rte_half(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_uchar_rtn_half(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_uchar_rtp_half(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_uchar_rtz_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_short_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_rte_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_rte_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_short(TO,TI,S,R) NOP(TO,TI,S,R) #define G_short_sat_short(TO,TI,S,R) NOP(TO,TI,S,R) #define G_short_sat_rte_short(TO,TI,S,R) NOP(TO,TI,S,R) #define G_short_sat_rtn_short(TO,TI,S,R) NOP(TO,TI,S,R) #define G_short_sat_rtp_short(TO,TI,S,R) NOP(TO,TI,S,R) #define G_short_sat_rtz_short(TO,TI,S,R) NOP(TO,TI,S,R) #define G_short_rte_short(TO,TI,S,R) NOP(TO,TI,S,R) #define G_short_rtn_short(TO,TI,S,R) NOP(TO,TI,S,R) #define G_short_rtp_short(TO,TI,S,R) NOP(TO,TI,S,R) #define G_short_rtz_short(TO,TI,S,R) NOP(TO,TI,S,R) #define G_short_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_short_sat_ushort(TO,TI,S,R) MIN(TO,TI,S,R) 
/*
 * Conversion dispatch table (continued). Every entry takes (TO,TI,S,R) and
 * forwards the four arguments unchanged to one helper macro (CAST, NOP, MIN,
 * MAX, CLAMP, ?CLAMP, ?CLAMP2, ?F2I) that is defined outside this span.
 * Names follow G_<first>[_sat][_rte|_rtn|_rtp|_rtz]_<last>; presumably
 * <first> is the destination type and <last> the source type - confirm
 * against the macro that pastes these names together.
 */

/* G_short_*_ushort (remaining entries) */
#define G_short_sat_rte_ushort(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rtn_ushort(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rtp_ushort(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rtz_ushort(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_short_*_int */
#define G_short_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_sat_int(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_short_sat_rte_int(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_short_sat_rtn_int(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_short_sat_rtp_int(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_short_sat_rtz_int(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_short_rte_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtn_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtp_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtz_int(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_short_*_uint */
#define G_short_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_sat_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rte_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rtn_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rtp_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rtz_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtn_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtp_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtz_uint(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_short_*_long */
#define G_short_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_sat_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_short_sat_rte_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_short_sat_rtn_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_short_sat_rtp_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_short_sat_rtz_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_short_rte_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtn_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtp_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtz_long(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_short_*_ulong */
#define G_short_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_sat_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rte_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rtn_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rtp_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_sat_rtz_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_short_rte_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtn_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtp_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_short_rtz_ulong(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_short_*_float */
#define G_short_float(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_short_sat_float(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_short_sat_rte_float(TO,TI,S,R) ECLAMP(TO,TI,S,R)
#define G_short_sat_rtn_float(TO,TI,S,R) NCLAMP(TO,TI,S,R)
#define G_short_sat_rtp_float(TO,TI,S,R) PCLAMP(TO,TI,S,R)
#define G_short_sat_rtz_float(TO,TI,S,R) ZCLAMP(TO,TI,S,R)
#define G_short_rte_float(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_short_rtn_float(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_short_rtp_float(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_short_rtz_float(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_short_*_double */
#define G_short_double(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_short_sat_double(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_short_sat_rte_double(TO,TI,S,R) ECLAMP(TO,TI,S,R)
#define G_short_sat_rtn_double(TO,TI,S,R) NCLAMP(TO,TI,S,R)
#define G_short_sat_rtp_double(TO,TI,S,R) PCLAMP(TO,TI,S,R)
#define G_short_sat_rtz_double(TO,TI,S,R) ZCLAMP(TO,TI,S,R)
#define G_short_rte_double(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_short_rtn_double(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_short_rtp_double(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_short_rtz_double(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_short_*_half (saturating forms use the *CLAMP2 helpers) */
#define G_short_half(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_short_sat_half(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_short_sat_rte_half(TO,TI,S,R) ECLAMP2(TO,TI,S,R)
#define G_short_sat_rtn_half(TO,TI,S,R) NCLAMP2(TO,TI,S,R)
#define G_short_sat_rtp_half(TO,TI,S,R) PCLAMP2(TO,TI,S,R)
#define G_short_sat_rtz_half(TO,TI,S,R) ZCLAMP2(TO,TI,S,R)
#define G_short_rte_half(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_short_rtn_half(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_short_rtp_half(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_short_rtz_half(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_ushort_*_char */
#define G_ushort_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_char(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_ushort_sat_rte_char(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_ushort_sat_rtn_char(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_ushort_sat_rtp_char(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_ushort_sat_rtz_char(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_ushort_rte_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_ushort_*_uchar */
#define G_ushort_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_ushort_*_short */
#define G_ushort_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_short(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_ushort_sat_rte_short(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_ushort_sat_rtn_short(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_ushort_sat_rtp_short(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_ushort_sat_rtz_short(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_ushort_rte_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtp_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_ushort_*_ushort (first two entries; the rest follow this span) */
#define G_ushort_ushort(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_ushort_sat_ushort(TO,TI,S,R) NOP(TO,TI,S,R)
/*
 * Conversion dispatch table (continued). Every entry takes (TO,TI,S,R) and
 * forwards the four arguments unchanged to one helper macro (CAST, NOP, MIN,
 * MAX, CLAMP, ?CLAMP, ?CLAMP2, ?F2I) that is defined outside this span.
 * Names follow G_<first>[_sat][_rte|_rtn|_rtp|_rtz]_<last>; presumably
 * <first> is the destination type and <last> the source type - confirm
 * against the macro that pastes these names together.
 */

/* G_ushort_*_ushort (remaining entries) */
#define G_ushort_sat_rte_ushort(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_ushort_sat_rtn_ushort(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_ushort_sat_rtp_ushort(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_ushort_sat_rtz_ushort(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_ushort_rte_ushort(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_ushort_rtn_ushort(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_ushort_rtp_ushort(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_ushort_rtz_ushort(TO,TI,S,R) NOP(TO,TI,S,R)

/* G_ushort_*_int */
#define G_ushort_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_int(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_ushort_sat_rte_int(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_ushort_sat_rtn_int(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_ushort_sat_rtp_int(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_ushort_sat_rtz_int(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_ushort_rte_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtn_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtp_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtz_int(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_ushort_*_uint */
#define G_ushort_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_ushort_sat_rte_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_ushort_sat_rtn_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_ushort_sat_rtp_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_ushort_sat_rtz_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_ushort_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtn_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtp_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtz_uint(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_ushort_*_long */
#define G_ushort_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_ushort_sat_rte_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_ushort_sat_rtn_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_ushort_sat_rtp_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_ushort_sat_rtz_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_ushort_rte_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtn_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtp_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtz_long(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_ushort_*_ulong */
#define G_ushort_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_sat_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_ushort_sat_rte_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_ushort_sat_rtn_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_ushort_sat_rtp_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_ushort_sat_rtz_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_ushort_rte_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtn_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtp_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_ushort_rtz_ulong(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_ushort_*_float */
#define G_ushort_float(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_ushort_sat_float(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_ushort_sat_rte_float(TO,TI,S,R) ECLAMP(TO,TI,S,R)
#define G_ushort_sat_rtn_float(TO,TI,S,R) NCLAMP(TO,TI,S,R)
#define G_ushort_sat_rtp_float(TO,TI,S,R) PCLAMP(TO,TI,S,R)
#define G_ushort_sat_rtz_float(TO,TI,S,R) ZCLAMP(TO,TI,S,R)
#define G_ushort_rte_float(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_ushort_rtn_float(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_ushort_rtp_float(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_ushort_rtz_float(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_ushort_*_double */
#define G_ushort_double(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_ushort_sat_double(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_ushort_sat_rte_double(TO,TI,S,R) ECLAMP(TO,TI,S,R)
#define G_ushort_sat_rtn_double(TO,TI,S,R) NCLAMP(TO,TI,S,R)
#define G_ushort_sat_rtp_double(TO,TI,S,R) PCLAMP(TO,TI,S,R)
#define G_ushort_sat_rtz_double(TO,TI,S,R) ZCLAMP(TO,TI,S,R)
#define G_ushort_rte_double(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_ushort_rtn_double(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_ushort_rtp_double(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_ushort_rtz_double(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_ushort_*_half (saturating forms use the *CLAMP2 helpers) */
#define G_ushort_half(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_ushort_sat_half(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_ushort_sat_rte_half(TO,TI,S,R) ECLAMP2(TO,TI,S,R)
#define G_ushort_sat_rtn_half(TO,TI,S,R) NCLAMP2(TO,TI,S,R)
#define G_ushort_sat_rtp_half(TO,TI,S,R) PCLAMP2(TO,TI,S,R)
#define G_ushort_sat_rtz_half(TO,TI,S,R) ZCLAMP2(TO,TI,S,R)
#define G_ushort_rte_half(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_ushort_rtn_half(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_ushort_rtp_half(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_ushort_rtz_half(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_int_*_char */
#define G_int_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rte_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rte_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_int_*_uchar */
#define G_int_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_int_*_short */
#define G_int_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rte_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtp_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rte_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtp_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_int_*_ushort */
#define G_int_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_int_*_int */
#define G_int_int(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_int_sat_int(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_int_sat_rte_int(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_int_sat_rtn_int(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_int_sat_rtp_int(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_int_sat_rtz_int(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_int_rte_int(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_int_rtn_int(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_int_rtp_int(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_int_rtz_int(TO,TI,S,R) NOP(TO,TI,S,R)

/* G_int_*_uint */
#define G_int_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_int_sat_rte_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_int_sat_rtn_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_int_sat_rtp_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_int_sat_rtz_uint(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_int_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtn_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtp_uint(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtz_uint(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_int_*_long */
#define G_int_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_int_sat_rte_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_int_sat_rtn_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_int_sat_rtp_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_int_sat_rtz_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_int_rte_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtn_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtp_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtz_long(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_int_*_ulong */
#define G_int_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_sat_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_int_sat_rte_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_int_sat_rtn_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_int_sat_rtp_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_int_sat_rtz_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_int_rte_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtn_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtp_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_int_rtz_ulong(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_int_*_float (saturating forms use the *CLAMP2 helpers) */
#define G_int_float(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_int_sat_float(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_int_sat_rte_float(TO,TI,S,R) ECLAMP2(TO,TI,S,R)
#define G_int_sat_rtn_float(TO,TI,S,R) NCLAMP2(TO,TI,S,R)
#define G_int_sat_rtp_float(TO,TI,S,R) PCLAMP2(TO,TI,S,R)
#define G_int_sat_rtz_float(TO,TI,S,R) ZCLAMP2(TO,TI,S,R)
#define G_int_rte_float(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_int_rtn_float(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_int_rtp_float(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_int_rtz_float(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_int_*_double */
#define G_int_double(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_int_sat_double(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_int_sat_rte_double(TO,TI,S,R) ECLAMP(TO,TI,S,R)
#define G_int_sat_rtn_double(TO,TI,S,R) NCLAMP(TO,TI,S,R)
#define G_int_sat_rtp_double(TO,TI,S,R) PCLAMP(TO,TI,S,R)
#define G_int_sat_rtz_double(TO,TI,S,R) ZCLAMP(TO,TI,S,R)
#define G_int_rte_double(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_int_rtn_double(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_int_rtp_double(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_int_rtz_double(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_int_*_half (saturating forms use the *CLAMP2 helpers) */
#define G_int_half(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_int_sat_half(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_int_sat_rte_half(TO,TI,S,R) ECLAMP2(TO,TI,S,R)
#define G_int_sat_rtn_half(TO,TI,S,R) NCLAMP2(TO,TI,S,R)
#define G_int_sat_rtp_half(TO,TI,S,R) PCLAMP2(TO,TI,S,R)
#define G_int_sat_rtz_half(TO,TI,S,R) ZCLAMP2(TO,TI,S,R)
#define G_int_rte_half(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_int_rtn_half(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_int_rtp_half(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_int_rtz_half(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_uint_*_char */
#define G_uint_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_char(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rte_char(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rtn_char(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rtp_char(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rtz_char(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_rte_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_uint_*_uchar */
#define G_uint_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_uint_*_short */
#define G_uint_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_short(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rte_short(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rtn_short(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rtp_short(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rtz_short(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_rte_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtp_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_uint_*_ushort */
#define G_uint_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_uint_*_int */
#define G_uint_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_int(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rte_int(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rtn_int(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rtp_int(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_sat_rtz_int(TO,TI,S,R) MAX(TO,TI,S,R)
#define G_uint_rte_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtn_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtp_int(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtz_int(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_uint_*_uint */
#define G_uint_uint(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_uint_sat_uint(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_uint_sat_rte_uint(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_uint_sat_rtn_uint(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_uint_sat_rtp_uint(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_uint_sat_rtz_uint(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_uint_rte_uint(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_uint_rtn_uint(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_uint_rtp_uint(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_uint_rtz_uint(TO,TI,S,R) NOP(TO,TI,S,R)

/* G_uint_*_long */
#define G_uint_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_uint_sat_rte_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_uint_sat_rtn_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_uint_sat_rtp_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_uint_sat_rtz_long(TO,TI,S,R) CLAMP(TO,TI,S,R)
#define G_uint_rte_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtn_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtp_long(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtz_long(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_uint_*_ulong */
#define G_uint_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_sat_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_uint_sat_rte_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_uint_sat_rtn_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_uint_sat_rtp_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_uint_sat_rtz_ulong(TO,TI,S,R) MIN(TO,TI,S,R)
#define G_uint_rte_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtn_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtp_ulong(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_uint_rtz_ulong(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_uint_*_float (saturating forms use the *CLAMP2 helpers) */
#define G_uint_float(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_uint_sat_float(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_uint_sat_rte_float(TO,TI,S,R) ECLAMP2(TO,TI,S,R)
#define G_uint_sat_rtn_float(TO,TI,S,R) NCLAMP2(TO,TI,S,R)
#define G_uint_sat_rtp_float(TO,TI,S,R) PCLAMP2(TO,TI,S,R)
#define G_uint_sat_rtz_float(TO,TI,S,R) ZCLAMP2(TO,TI,S,R)
#define G_uint_rte_float(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_uint_rtn_float(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_uint_rtp_float(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_uint_rtz_float(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_uint_*_double */
#define G_uint_double(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_uint_sat_double(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_uint_sat_rte_double(TO,TI,S,R) ECLAMP(TO,TI,S,R)
#define G_uint_sat_rtn_double(TO,TI,S,R) NCLAMP(TO,TI,S,R)
#define G_uint_sat_rtp_double(TO,TI,S,R) PCLAMP(TO,TI,S,R)
#define G_uint_sat_rtz_double(TO,TI,S,R) ZCLAMP(TO,TI,S,R)
#define G_uint_rte_double(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_uint_rtn_double(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_uint_rtp_double(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_uint_rtz_double(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_uint_*_half (saturating forms use the *CLAMP2 helpers) */
#define G_uint_half(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_uint_sat_half(TO,TI,S,R) ZF2I(TO,TI,S,R)
#define G_uint_sat_rte_half(TO,TI,S,R) ECLAMP2(TO,TI,S,R)
#define G_uint_sat_rtn_half(TO,TI,S,R) NCLAMP2(TO,TI,S,R)
#define G_uint_sat_rtp_half(TO,TI,S,R) PCLAMP2(TO,TI,S,R)
#define G_uint_sat_rtz_half(TO,TI,S,R) ZCLAMP2(TO,TI,S,R)
#define G_uint_rte_half(TO,TI,S,R) EF2I(TO,TI,S,R)
#define G_uint_rtn_half(TO,TI,S,R) NF2I(TO,TI,S,R)
#define G_uint_rtp_half(TO,TI,S,R) PF2I(TO,TI,S,R)
#define G_uint_rtz_half(TO,TI,S,R) ZF2I(TO,TI,S,R)

/* G_long_*_char */
#define G_long_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rte_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rte_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_long_*_uchar */
#define G_long_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_long_*_short */
#define G_long_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rte_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtp_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rte_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtp_short(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R)

/* G_long_*_ushort (all but the final _rtz entry, which follows this span) */
#define G_long_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_sat_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_long_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_rte_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_rtn_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_rtp_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_rtz_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_rte_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_rtn_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_rtp_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_rtz_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_rtn_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_rtp_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_rtz_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_rtn_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_rtp_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_rtz_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_long(TO,TI,S,R) NOP(TO,TI,S,R) #define G_long_sat_long(TO,TI,S,R) NOP(TO,TI,S,R) #define G_long_sat_rte_long(TO,TI,S,R) NOP(TO,TI,S,R) #define G_long_sat_rtn_long(TO,TI,S,R) NOP(TO,TI,S,R) #define G_long_sat_rtp_long(TO,TI,S,R) NOP(TO,TI,S,R) #define G_long_sat_rtz_long(TO,TI,S,R) NOP(TO,TI,S,R) #define G_long_rte_long(TO,TI,S,R) NOP(TO,TI,S,R) #define G_long_rtn_long(TO,TI,S,R) NOP(TO,TI,S,R) #define G_long_rtp_long(TO,TI,S,R) NOP(TO,TI,S,R) #define G_long_rtz_long(TO,TI,S,R) NOP(TO,TI,S,R) #define G_long_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_sat_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_long_sat_rte_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_long_sat_rtn_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_long_sat_rtp_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_long_sat_rtz_ulong(TO,TI,S,R) MIN(TO,TI,S,R) #define G_long_rte_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define 
G_long_rtn_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_rtp_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_rtz_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_long_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_long_sat_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_long_sat_rte_float(TO,TI,S,R) ECLAMP2(TO,TI,S,R) #define G_long_sat_rtn_float(TO,TI,S,R) NCLAMP2(TO,TI,S,R) #define G_long_sat_rtp_float(TO,TI,S,R) PCLAMP2(TO,TI,S,R) #define G_long_sat_rtz_float(TO,TI,S,R) ZCLAMP2(TO,TI,S,R) #define G_long_rte_float(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_long_rtn_float(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_long_rtp_float(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_long_rtz_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_long_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_long_sat_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_long_sat_rte_double(TO,TI,S,R) ECLAMP2(TO,TI,S,R) #define G_long_sat_rtn_double(TO,TI,S,R) NCLAMP2(TO,TI,S,R) #define G_long_sat_rtp_double(TO,TI,S,R) PCLAMP2(TO,TI,S,R) #define G_long_sat_rtz_double(TO,TI,S,R) ZCLAMP2(TO,TI,S,R) #define G_long_rte_double(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_long_rtn_double(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_long_rtp_double(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_long_rtz_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_long_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_long_sat_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_long_sat_rte_half(TO,TI,S,R) ECLAMP2(TO,TI,S,R) #define G_long_sat_rtn_half(TO,TI,S,R) NCLAMP2(TO,TI,S,R) #define G_long_sat_rtp_half(TO,TI,S,R) PCLAMP2(TO,TI,S,R) #define G_long_sat_rtz_half(TO,TI,S,R) ZCLAMP2(TO,TI,S,R) #define G_long_rte_half(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_long_rtn_half(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_long_rtp_half(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_long_rtz_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_ulong_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_char(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rte_char(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtn_char(TO,TI,S,R) MAX(TO,TI,S,R) #define 
G_ulong_sat_rtp_char(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtz_char(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_rte_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_short(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rte_short(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtn_short(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtp_short(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtz_short(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_rte_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtp_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_int(TO,TI,S,R) CAST(TO,TI,S,R) #define 
G_ulong_sat_int(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rte_int(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtn_int(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtp_int(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtz_int(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_rte_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtn_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtp_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtz_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rtn_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rtp_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_rtz_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtn_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtp_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtz_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_sat_long(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rte_long(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtn_long(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtp_long(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_sat_rtz_long(TO,TI,S,R) MAX(TO,TI,S,R) #define G_ulong_rte_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtn_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtp_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_rtz_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_ulong_ulong(TO,TI,S,R) NOP(TO,TI,S,R) #define G_ulong_sat_ulong(TO,TI,S,R) NOP(TO,TI,S,R) #define G_ulong_sat_rte_ulong(TO,TI,S,R) NOP(TO,TI,S,R) #define G_ulong_sat_rtn_ulong(TO,TI,S,R) NOP(TO,TI,S,R) #define G_ulong_sat_rtp_ulong(TO,TI,S,R) NOP(TO,TI,S,R) #define G_ulong_sat_rtz_ulong(TO,TI,S,R) NOP(TO,TI,S,R) #define G_ulong_rte_ulong(TO,TI,S,R) NOP(TO,TI,S,R) #define G_ulong_rtn_ulong(TO,TI,S,R) NOP(TO,TI,S,R) #define G_ulong_rtp_ulong(TO,TI,S,R) 
NOP(TO,TI,S,R) #define G_ulong_rtz_ulong(TO,TI,S,R) NOP(TO,TI,S,R) #define G_ulong_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_ulong_sat_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_ulong_sat_rte_float(TO,TI,S,R) ECLAMP2(TO,TI,S,R) #define G_ulong_sat_rtn_float(TO,TI,S,R) NCLAMP2(TO,TI,S,R) #define G_ulong_sat_rtp_float(TO,TI,S,R) PCLAMP2(TO,TI,S,R) #define G_ulong_sat_rtz_float(TO,TI,S,R) ZCLAMP2(TO,TI,S,R) #define G_ulong_rte_float(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_ulong_rtn_float(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_ulong_rtp_float(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_ulong_rtz_float(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_ulong_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_ulong_sat_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_ulong_sat_rte_double(TO,TI,S,R) ECLAMP2(TO,TI,S,R) #define G_ulong_sat_rtn_double(TO,TI,S,R) NCLAMP2(TO,TI,S,R) #define G_ulong_sat_rtp_double(TO,TI,S,R) PCLAMP2(TO,TI,S,R) #define G_ulong_sat_rtz_double(TO,TI,S,R) ZCLAMP2(TO,TI,S,R) #define G_ulong_rte_double(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_ulong_rtn_double(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_ulong_rtp_double(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_ulong_rtz_double(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_ulong_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_ulong_sat_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_ulong_sat_rte_half(TO,TI,S,R) ECLAMP2(TO,TI,S,R) #define G_ulong_sat_rtn_half(TO,TI,S,R) NCLAMP2(TO,TI,S,R) #define G_ulong_sat_rtp_half(TO,TI,S,R) PCLAMP2(TO,TI,S,R) #define G_ulong_sat_rtz_half(TO,TI,S,R) ZCLAMP2(TO,TI,S,R) #define G_ulong_rte_half(TO,TI,S,R) EF2I(TO,TI,S,R) #define G_ulong_rtn_half(TO,TI,S,R) NF2I(TO,TI,S,R) #define G_ulong_rtp_half(TO,TI,S,R) PF2I(TO,TI,S,R) #define G_ulong_rtz_half(TO,TI,S,R) ZF2I(TO,TI,S,R) #define G_float_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_sat_char(TO,TI,S,R) #define G_float_sat_rte_char(TO,TI,S,R) #define G_float_sat_rtn_char(TO,TI,S,R) #define G_float_sat_rtp_char(TO,TI,S,R) #define G_float_sat_rtz_char(TO,TI,S,R) #define 
G_float_rte_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_sat_uchar(TO,TI,S,R) #define G_float_sat_rte_uchar(TO,TI,S,R) #define G_float_sat_rtn_uchar(TO,TI,S,R) #define G_float_sat_rtp_uchar(TO,TI,S,R) #define G_float_sat_rtz_uchar(TO,TI,S,R) #define G_float_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_sat_short(TO,TI,S,R) #define G_float_sat_rte_short(TO,TI,S,R) #define G_float_sat_rtn_short(TO,TI,S,R) #define G_float_sat_rtp_short(TO,TI,S,R) #define G_float_sat_rtz_short(TO,TI,S,R) #define G_float_rte_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtp_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_sat_ushort(TO,TI,S,R) #define G_float_sat_rte_ushort(TO,TI,S,R) #define G_float_sat_rtn_ushort(TO,TI,S,R) #define G_float_sat_rtp_ushort(TO,TI,S,R) #define G_float_sat_rtz_ushort(TO,TI,S,R) #define G_float_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_sat_int(TO,TI,S,R) #define G_float_sat_rte_int(TO,TI,S,R) #define G_float_sat_rtn_int(TO,TI,S,R) #define G_float_sat_rtp_int(TO,TI,S,R) #define G_float_sat_rtz_int(TO,TI,S,R) #define G_float_rte_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtn_int(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_rtp_int(TO,TI,S,R) EXPAND(TO,TI,S,R) #define 
G_float_rtz_int(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_sat_uint(TO,TI,S,R) #define G_float_sat_rte_uint(TO,TI,S,R) #define G_float_sat_rtn_uint(TO,TI,S,R) #define G_float_sat_rtp_uint(TO,TI,S,R) #define G_float_sat_rtz_uint(TO,TI,S,R) #define G_float_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtn_uint(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_rtp_uint(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_rtz_uint(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_sat_long(TO,TI,S,R) #define G_float_sat_rte_long(TO,TI,S,R) #define G_float_sat_rtn_long(TO,TI,S,R) #define G_float_sat_rtp_long(TO,TI,S,R) #define G_float_sat_rtz_long(TO,TI,S,R) #define G_float_rte_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtn_long(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_rtp_long(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_rtz_long(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_sat_ulong(TO,TI,S,R) #define G_float_sat_rte_ulong(TO,TI,S,R) #define G_float_sat_rtn_ulong(TO,TI,S,R) #define G_float_sat_rtp_ulong(TO,TI,S,R) #define G_float_sat_rtz_ulong(TO,TI,S,R) #define G_float_rte_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtn_ulong(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_rtp_ulong(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_rtz_ulong(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_float(TO,TI,S,R) NOP(TO,TI,S,R) #define G_float_sat_float(TO,TI,S,R) #define G_float_sat_rte_float(TO,TI,S,R) #define G_float_sat_rtn_float(TO,TI,S,R) #define G_float_sat_rtp_float(TO,TI,S,R) #define G_float_sat_rtz_float(TO,TI,S,R) #define G_float_rte_float(TO,TI,S,R) NOP(TO,TI,S,R) #define G_float_rtn_float(TO,TI,S,R) NOP(TO,TI,S,R) #define G_float_rtp_float(TO,TI,S,R) NOP(TO,TI,S,R) #define G_float_rtz_float(TO,TI,S,R) NOP(TO,TI,S,R) #define G_float_double(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_sat_double(TO,TI,S,R) #define 
G_float_sat_rte_double(TO,TI,S,R) #define G_float_sat_rtn_double(TO,TI,S,R) #define G_float_sat_rtp_double(TO,TI,S,R) #define G_float_sat_rtz_double(TO,TI,S,R) #define G_float_rte_double(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtn_double(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_rtp_double(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_rtz_double(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_float_half(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_sat_half(TO,TI,S,R) #define G_float_sat_rte_half(TO,TI,S,R) #define G_float_sat_rtn_half(TO,TI,S,R) #define G_float_sat_rtp_half(TO,TI,S,R) #define G_float_sat_rtz_half(TO,TI,S,R) #define G_float_rte_half(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtn_half(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtp_half(TO,TI,S,R) CAST(TO,TI,S,R) #define G_float_rtz_half(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_sat_char(TO,TI,S,R) #define G_double_sat_rte_char(TO,TI,S,R) #define G_double_sat_rtn_char(TO,TI,S,R) #define G_double_sat_rtp_char(TO,TI,S,R) #define G_double_sat_rtz_char(TO,TI,S,R) #define G_double_rte_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_sat_uchar(TO,TI,S,R) #define G_double_sat_rte_uchar(TO,TI,S,R) #define G_double_sat_rtn_uchar(TO,TI,S,R) #define G_double_sat_rtp_uchar(TO,TI,S,R) #define G_double_sat_rtz_uchar(TO,TI,S,R) #define G_double_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_sat_short(TO,TI,S,R) #define G_double_sat_rte_short(TO,TI,S,R) #define G_double_sat_rtn_short(TO,TI,S,R) #define G_double_sat_rtp_short(TO,TI,S,R) #define 
G_double_sat_rtz_short(TO,TI,S,R) #define G_double_rte_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtn_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtp_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtz_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_sat_ushort(TO,TI,S,R) #define G_double_sat_rte_ushort(TO,TI,S,R) #define G_double_sat_rtn_ushort(TO,TI,S,R) #define G_double_sat_rtp_ushort(TO,TI,S,R) #define G_double_sat_rtz_ushort(TO,TI,S,R) #define G_double_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtn_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtp_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtz_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_sat_int(TO,TI,S,R) #define G_double_sat_rte_int(TO,TI,S,R) #define G_double_sat_rtn_int(TO,TI,S,R) #define G_double_sat_rtp_int(TO,TI,S,R) #define G_double_sat_rtz_int(TO,TI,S,R) #define G_double_rte_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtn_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtp_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtz_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_sat_uint(TO,TI,S,R) #define G_double_sat_rte_uint(TO,TI,S,R) #define G_double_sat_rtn_uint(TO,TI,S,R) #define G_double_sat_rtp_uint(TO,TI,S,R) #define G_double_sat_rtz_uint(TO,TI,S,R) #define G_double_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtn_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtp_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtz_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_sat_long(TO,TI,S,R) #define G_double_sat_rte_long(TO,TI,S,R) #define G_double_sat_rtn_long(TO,TI,S,R) #define G_double_sat_rtp_long(TO,TI,S,R) #define G_double_sat_rtz_long(TO,TI,S,R) #define G_double_rte_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtn_long(TO,TI,S,R) 
EXPAND(TO,TI,S,R) #define G_double_rtp_long(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_double_rtz_long(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_double_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_sat_ulong(TO,TI,S,R) #define G_double_sat_rte_ulong(TO,TI,S,R) #define G_double_sat_rtn_ulong(TO,TI,S,R) #define G_double_sat_rtp_ulong(TO,TI,S,R) #define G_double_sat_rtz_ulong(TO,TI,S,R) #define G_double_rte_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtn_ulong(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_double_rtp_ulong(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_double_rtz_ulong(TO,TI,S,R) EXPAND(TO,TI,S,R) #define G_double_float(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_sat_float(TO,TI,S,R) #define G_double_sat_rte_float(TO,TI,S,R) #define G_double_sat_rtn_float(TO,TI,S,R) #define G_double_sat_rtp_float(TO,TI,S,R) #define G_double_sat_rtz_float(TO,TI,S,R) #define G_double_rte_float(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtn_float(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtp_float(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtz_float(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_double(TO,TI,S,R) NOP(TO,TI,S,R) #define G_double_sat_double(TO,TI,S,R) #define G_double_sat_rte_double(TO,TI,S,R) #define G_double_sat_rtn_double(TO,TI,S,R) #define G_double_sat_rtp_double(TO,TI,S,R) #define G_double_sat_rtz_double(TO,TI,S,R) #define G_double_rte_double(TO,TI,S,R) NOP(TO,TI,S,R) #define G_double_rtn_double(TO,TI,S,R) NOP(TO,TI,S,R) #define G_double_rtp_double(TO,TI,S,R) NOP(TO,TI,S,R) #define G_double_rtz_double(TO,TI,S,R) NOP(TO,TI,S,R) #define G_double_half(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_sat_half(TO,TI,S,R) #define G_double_sat_rte_half(TO,TI,S,R) #define G_double_sat_rtn_half(TO,TI,S,R) #define G_double_sat_rtp_half(TO,TI,S,R) #define G_double_sat_rtz_half(TO,TI,S,R) #define G_double_rte_half(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtn_half(TO,TI,S,R) CAST(TO,TI,S,R) #define G_double_rtp_half(TO,TI,S,R) CAST(TO,TI,S,R) #define 
G_double_rtz_half(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_sat_char(TO,TI,S,R) #define G_half_sat_rte_char(TO,TI,S,R) #define G_half_sat_rtn_char(TO,TI,S,R) #define G_half_sat_rtp_char(TO,TI,S,R) #define G_half_sat_rtz_char(TO,TI,S,R) #define G_half_rte_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtn_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtp_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtz_char(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_sat_uchar(TO,TI,S,R) #define G_half_sat_rte_uchar(TO,TI,S,R) #define G_half_sat_rtn_uchar(TO,TI,S,R) #define G_half_sat_rtp_uchar(TO,TI,S,R) #define G_half_sat_rtz_uchar(TO,TI,S,R) #define G_half_rte_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtn_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtp_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtz_uchar(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_sat_short(TO,TI,S,R) #define G_half_sat_rte_short(TO,TI,S,R) #define G_half_sat_rtn_short(TO,TI,S,R) #define G_half_sat_rtp_short(TO,TI,S,R) #define G_half_sat_rtz_short(TO,TI,S,R) #define G_half_rte_short(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtn_short(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtp_short(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtz_short(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_sat_ushort(TO,TI,S,R) #define G_half_sat_rte_ushort(TO,TI,S,R) #define G_half_sat_rtn_ushort(TO,TI,S,R) #define G_half_sat_rtp_ushort(TO,TI,S,R) #define G_half_sat_rtz_ushort(TO,TI,S,R) #define G_half_rte_ushort(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtn_ushort(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtp_ushort(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtz_ushort(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_sat_int(TO,TI,S,R) #define G_half_sat_rte_int(TO,TI,S,R) #define 
G_half_sat_rtn_int(TO,TI,S,R) #define G_half_sat_rtp_int(TO,TI,S,R) #define G_half_sat_rtz_int(TO,TI,S,R) #define G_half_rte_int(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtn_int(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtp_int(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtz_int(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_sat_uint(TO,TI,S,R) #define G_half_sat_rte_uint(TO,TI,S,R) #define G_half_sat_rtn_uint(TO,TI,S,R) #define G_half_sat_rtp_uint(TO,TI,S,R) #define G_half_sat_rtz_uint(TO,TI,S,R) #define G_half_rte_uint(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtn_uint(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtp_uint(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtz_uint(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_sat_long(TO,TI,S,R) #define G_half_sat_rte_long(TO,TI,S,R) #define G_half_sat_rtn_long(TO,TI,S,R) #define G_half_sat_rtp_long(TO,TI,S,R) #define G_half_sat_rtz_long(TO,TI,S,R) #define G_half_rte_long(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtn_long(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtp_long(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtz_long(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_sat_ulong(TO,TI,S,R) #define G_half_sat_rte_ulong(TO,TI,S,R) #define G_half_sat_rtn_ulong(TO,TI,S,R) #define G_half_sat_rtp_ulong(TO,TI,S,R) #define G_half_sat_rtz_ulong(TO,TI,S,R) #define G_half_rte_ulong(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtn_ulong(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtp_ulong(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_rtz_ulong(TO,TI,S,R) EXPAND(TO,TI,R,S) #define G_half_float(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_sat_float(TO,TI,S,R) #define G_half_sat_rte_float(TO,TI,S,R) #define G_half_sat_rtn_float(TO,TI,S,R) #define G_half_sat_rtp_float(TO,TI,S,R) #define G_half_sat_rtz_float(TO,TI,S,R) #define G_half_rte_float(TO,TI,S,R) CAST(TO,TI,S,R) #define G_half_rtn_float(TO,TI,S,R) 
EXPAND(TO,TI,R,S)
#define G_half_rtp_float(TO,TI,S,R) EXPAND(TO,TI,R,S)
#define G_half_rtz_float(TO,TI,S,R) EXPAND(TO,TI,R,S)
#define G_half_double(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_half_sat_double(TO,TI,S,R)
#define G_half_sat_rte_double(TO,TI,S,R)
#define G_half_sat_rtn_double(TO,TI,S,R)
#define G_half_sat_rtp_double(TO,TI,S,R)
#define G_half_sat_rtz_double(TO,TI,S,R)
#define G_half_rte_double(TO,TI,S,R) CAST(TO,TI,S,R)
#define G_half_rtn_double(TO,TI,S,R) EXPAND(TO,TI,R,S)
#define G_half_rtp_double(TO,TI,S,R) EXPAND(TO,TI,R,S)
#define G_half_rtz_double(TO,TI,S,R) EXPAND(TO,TI,R,S)
#define G_half_half(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_half_sat_half(TO,TI,S,R)
#define G_half_sat_rte_half(TO,TI,S,R)
#define G_half_sat_rtn_half(TO,TI,S,R)
#define G_half_sat_rtp_half(TO,TI,S,R)
#define G_half_sat_rtz_half(TO,TI,S,R)
#define G_half_rte_half(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_half_rtn_half(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_half_rtp_half(TO,TI,S,R) NOP(TO,TI,S,R)
#define G_half_rtz_half(TO,TI,S,R) NOP(TO,TI,S,R)

/*
 * GEN2(TO,TI) invokes the ten dispatch-table entries
 * G_<TO>[_sat][_rte|_rtn|_rtp|_rtz]_<TI> for one destination/source type
 * pair.  Each entry receives (TO, TI, saturation suffix, rounding suffix);
 * an absent suffix is passed as an empty argument.  Table entries whose
 * replacement list is empty (such as the G_half_sat_* rows) expand to
 * nothing, so no code is emitted for those combinations here.
 *
 * NOTE(review): C() is defined outside this view; it is presumably a
 * two-level token-pasting helper - confirm.
 * NOTE(review): the G_half_* rows forward (TO,TI,R,S) to EXPAND whereas the
 * G_float_* and G_double_* rows forward (TO,TI,S,R).  EXPAND is defined
 * outside this view - confirm the argument order is immaterial there.
 */
#define GEN2(TO,TI) \
    C(G_,C(TO,C(_,TI)))(TO,TI,,) \
    C(G_,C(TO,C(_sat_,TI)))(TO,TI,_sat,) \
    C(G_,C(TO,C(_sat_rte_,TI)))(TO,TI,_sat,_rte) \
    C(G_,C(TO,C(_sat_rtn_,TI)))(TO,TI,_sat,_rtn) \
    C(G_,C(TO,C(_sat_rtp_,TI)))(TO,TI,_sat,_rtp) \
    C(G_,C(TO,C(_sat_rtz_,TI)))(TO,TI,_sat,_rtz) \
    C(G_,C(TO,C(_rte_,TI)))(TO,TI,,_rte) \
    C(G_,C(TO,C(_rtn_,TI)))(TO,TI,,_rtn) \
    C(G_,C(TO,C(_rtp_,TI)))(TO,TI,,_rtp) \
    C(G_,C(TO,C(_rtz_,TI)))(TO,TI,,_rtz)

/* GEN(T) runs GEN2 with T as the destination type against every source type. */
#define GEN(T) \
    GEN2(T,char) \
    GEN2(T,uchar) \
    GEN2(T,short) \
    GEN2(T,ushort) \
    GEN2(T,int) \
    GEN2(T,uint) \
    GEN2(T,long) \
    GEN2(T,ulong) \
    GEN2(T,float) \
    GEN2(T,double) \
    GEN2(T,half)

/* Instantiate the table for every destination type. */
GEN(char)
GEN(uchar)
GEN(short)
GEN(ushort)
GEN(int)
GEN(uint)
GEN(long)
GEN(ulong)
GEN(float)
GEN(double)
GEN(half)

/*
 * int -> float, round toward negative infinity.
 * The float bit pattern is assembled by hand from the magnitude of i.
 */
ATTR float
convert_float_rtn(int i)
{
    int s = i >> 31;                    /* sign mask: 0 for i >= 0, -1 for i < 0 */
    uint u = as_uint((i + s) ^ s);      /* magnitude of i */
    uint lz = clz(u);
    uint e = 127U + 31U - lz;           /* biased exponent of the leading one bit */
    e = u ?
e : 0; u = (u << lz) & 0x7fffffffU; uint t = u & 0xffU; u = (e << 23) | (u >> 8); return as_float((u + ((s & t) > 0)) | (s & 0x80000000)); }

/*
 * int -> float, round toward positive infinity.
 * The truncated encoding is incremented by one only when the input is
 * non-negative (~s is all ones) and some discarded bit (t) is set.
 */
ATTR float
convert_float_rtp(int i)
{
    int s = i >> 31;                    /* sign mask: 0 or -1 */
    uint u = as_uint((i + s) ^ s);      /* magnitude of i */
    uint lz = clz(u);
    uint e = 127U + 31U - lz;           /* biased exponent of the leading one bit */
    e = u ? e : 0;                      /* zero input gets exponent field 0 */
    u = (u << lz) & 0x7fffffffU;        /* left-justify, drop the leading one bit */
    uint t = u & 0xffU;                 /* 8 low bits that do not fit the 23-bit fraction */
    u = (e << 23) | (u >> 8);
    return as_float((u + ((~s & t) > 0)) | (s & 0x80000000));
}

/*
 * int -> float, round toward zero: the discarded low bits are simply dropped
 * and the sign bit is re-attached.
 */
ATTR float
convert_float_rtz(int i)
{
    int s = i >> 31;                    /* sign mask: 0 or -1 */
    uint u = as_uint((i + s) ^ s);      /* magnitude of i */
    uint lz = clz(u);
    uint e = 127U + 31U - lz;
    e = u ? e : 0;
    u = (u << lz) & 0x7fffffffU;
    u = (e << 23) | (u >> 8);
    return as_float(u | (s & 0x80000000));
}

/*
 * uint -> float with the discarded bits truncated.  For unsigned input
 * truncation is both round-toward-zero and round-toward-negative-infinity,
 * so the two aliases below share this body.
 */
IATTR static float
cvt1f4_zu4(uint u)
{
    uint lz = clz(u);
    uint e = 127U + 31U - lz;           /* biased exponent of the leading one bit */
    e = u ? e : 0;
    u = (u << lz) & 0x7fffffffU;        /* left-justify, drop the leading one bit */
    return as_float((e << 23) | (u >> 8));
}
extern AATTR("cvt1f4_zu4") float convert_float_rtn(uint);
extern AATTR("cvt1f4_zu4") float convert_float_rtz(uint);

/*
 * uint -> float, round toward positive infinity: increment the truncated
 * encoding whenever any discarded bit is set.
 */
ATTR float
convert_float_rtp(uint u)
{
    uint lz = clz(u);
    uint e = 127U + 31U - lz;
    e = u ? e : 0;
    u = (u << lz) & 0x7fffffffU;
    uint t = u & 0xffU;                 /* discarded low bits */
    u = (e << 23) | (u >> 8);
    return as_float(u + (t > 0));
}

/*
 * long -> float, round toward negative infinity.
 * Same scheme as the 32-bit versions with a 64-bit magnitude: 40 low bits
 * are discarded, and the encoding is incremented when the input is negative
 * and any discarded bit is set.
 */
ATTR float
convert_float_rtn(long l)
{
    long s = l >> 63;                           /* sign mask: 0 or -1 */
    ulong u = as_ulong((l + s) ^ s);            /* magnitude of l */
    uint lz = clz(u);
    uint e = 127U + 63U - lz;                   /* biased exponent of the leading one bit */
    e = u ? e : 0;
    u = (u << lz) & 0x7fffffffffffffffUL;       /* left-justify, drop the leading one bit */
    ulong t = u & 0xffffffffffUL;               /* 40 low bits that do not fit the fraction */
    uint v = (e << 23) | (uint)(u >> 40);
    return as_float((v + ((s & t) > 0)) | ((uint)s & 0x80000000));
}

/*
 * long -> float, round toward positive infinity: incremented only when the
 * input is non-negative and any discarded bit is set.
 */
ATTR float
convert_float_rtp(long l)
{
    long s = l >> 63;
    ulong u = as_ulong((l + s) ^ s);
    uint lz = clz(u);
    uint e = 127U + 63U - lz;
    e = u ? e : 0;
    u = (u << lz) & 0x7fffffffffffffffUL;
    ulong t = u & 0xffffffffffUL;
    uint v = (e << 23) | (uint)(u >> 40);
    return as_float((v + ((~s & t) > 0)) | ((uint)s & 0x80000000));
}

/* long -> float, round toward zero (discarded bits are dropped). */
ATTR float
convert_float_rtz(long l)
{
    long s = l >> 63;                           /* sign mask: 0 or -1 */
    ulong u = as_ulong((l + s) ^ s);            /* magnitude of l */
    uint lz = clz(u);
    uint e = 127U + 63U - lz;
    e = u ?
e : 0; u = (u << lz) & 0x7fffffffffffffffUL; uint v = (e << 23) | (uint)(u >> 40); return as_float(v | ((uint)s & 0x80000000)); }

/*
 * ulong -> float with the 40 discarded bits truncated.  Shared by the rtz
 * and rtn aliases below, which coincide for unsigned input.
 */
IATTR static float
cvt1f4_zu8(ulong u)
{
    uint lz = clz(u);
    uint e = 127U + 63U - lz;                   /* biased exponent of the leading one bit */
    e = u ? e : 0;                              /* zero input gets exponent field 0 */
    u = (u << lz) & 0x7fffffffffffffffUL;       /* left-justify, drop the leading one bit */
    return as_float((e << 23) | (uint)(u >> 40));
}
extern AATTR("cvt1f4_zu8") float convert_float_rtz(ulong);
extern AATTR("cvt1f4_zu8") float convert_float_rtn(ulong);

/*
 * ulong -> float, round toward positive infinity: increment the truncated
 * encoding whenever any of the 40 discarded bits is set.
 */
ATTR float
convert_float_rtp(ulong u)
{
    uint lz = clz(u);
    uint e = 127U + 63U - lz;
    e = u ? e : 0;
    u = (u << lz) & 0x7fffffffffffffffUL;
    ulong t = u & 0xffffffffffUL;               /* discarded low bits */
    uint v = (e << 23) | (uint)(u >> 40);
    return as_float(v + (t > 0));
}

/*
 * double -> float, round toward negative infinity.
 * Works on the raw bit patterns: the exponent is rebiased, the top 23
 * fraction bits are kept, and the result is adjusted for float denormals,
 * overflow and Inf/NaN.
 */
ATTR float
convert_float_rtn(double a)
{
    ulong u = as_ulong(a);
    ulong um = u & 0xfffffffffffffUL;                       /* 52-bit double fraction */
    int e = (int)((u >> 52) & 0x7ff) - 1023 + 127;          /* exponent rebiased for float */
    int ds = max(0, min(31, 1 - e));                        /* extra right shift for a float denormal, capped at 31 */
    /* Bits that are dropped from the result; non-zero means the conversion is
       inexact.  The implicit one bit is included unless the double exponent
       field is zero (e == -896). */
    ulong t = (um | (e > -896 ? 0x0010000000000000UL : 0UL)) << (35 - ds);
    uint s = (uint)(u >> 32) & 0x80000000;                  /* sign bit */
    uint m = (uint)(u >> 29) & 0x7fffff;                    /* top 23 fraction bits */
    uint i = 0x7f800000 | m | (um ? 0x00400000 : 0U);       /* Inf/NaN encoding; bit 22 forced on when the fraction is non-zero */
    uint n = ((uint)(e << 23)) | m;                         /* normal-range encoding */
    uint d = (0x800000 | m) >> ds;                          /* denormal-range encoding */
    uint v = e < 1 ? d : n;
    v += (s >> 31) & (t > 0UL);                             /* negative and inexact: step the magnitude up */
    uint j = 0x7f7fffff + (s >> 31);                        /* overflow: 0x7f7fffff if positive, 0x7f800000 if negative */
    v = e > 254 ? j : v;
    v = e == 1151 ? i : v;                                  /* 1151 == 2047 - 1023 + 127: all-ones double exponent */
    return as_float(s | v);
}

/*
 * double -> float, round toward positive infinity.
 * Mirror image of the rtn version: the magnitude is stepped up for positive
 * inexact inputs, and overflow yields 0x7f800000 for positive inputs and
 * 0x7f7fffff for negative ones.
 */
ATTR float
convert_float_rtp(double a)
{
    ulong u = as_ulong(a);
    ulong um = u & 0xfffffffffffffUL;
    int e = (int)((u >> 52) & 0x7ff) - 1023 + 127;
    int ds = max(0, min(31, 1 - e));
    ulong t = (um | (e > -896 ? 0x0010000000000000UL : 0UL)) << (35 - ds);
    uint s = (uint)(u >> 32) & 0x80000000;
    uint m = (uint)(u >> 29) & 0x7fffff;
    uint i = 0x7f800000 | m | (um ? 0x00400000 : 0U);
    uint n = ((uint)(e << 23)) | m;
    uint d = (0x800000 | m) >> ds;
    uint v = e < 1 ? d : n;
    v += ~(s >> 31) & (t > 0UL);                            /* positive and inexact: step the magnitude up */
    uint j = 0x7f800000 - (s >> 31);
    v = e > 254 ? j : v;
    v = e == 1151 ?
i : v; return as_float(s | v); }

/*
 * double -> float, round toward zero.
 * Dropped fraction bits are discarded; finite overflow gives 0x7f7fffff and
 * inputs with a rebiased exponent below -23 give a signed zero.
 */
ATTR float
convert_float_rtz(double a)
{
    ulong u = as_ulong(a);
    ulong um = u & 0xfffffffffffffUL;                       /* 52-bit double fraction */
    int e = (int)((u >> 52) & 0x7ff) - 1023 + 127;          /* exponent rebiased for float */
    uint s = (uint)(u >> 32) & 0x80000000;                  /* sign bit */
    uint m = (uint)(u >> 29) & 0x7fffff;                    /* top 23 fraction bits */
    uint i = 0x7f800000 | m | (um ? 0x00400000 : 0U);       /* Inf/NaN encoding */
    uint n = ((uint)(e << 23)) | m;                         /* normal-range encoding */
    uint d = (0x800000 | m) >> (1 - e);                     /* denormal-range encoding (used only when e < 1) */
    uint v = e > 254 ? 0x7f7fffff : n;
    v = e == 1151 ? i : v;                                  /* all-ones double exponent: Inf or NaN */
    v = e < 1 ? d : v;
    v = e < -23 ? 0 : v;
    return as_float(s | v);
}

/*
 * long -> double, round toward negative infinity.
 * The double bit pattern is assembled from the magnitude; 11 low bits are
 * discarded and the encoding is incremented when the input is negative and
 * any discarded bit is set.
 */
ATTR double
convert_double_rtn(long l)
{
    long s = l >> 63;                           /* sign mask: 0 or -1 */
    ulong u = as_ulong((l + s) ^ s);            /* magnitude of l */
    uint lz = clz(u);
    uint e = 1023U + 63U - lz;                  /* biased exponent of the leading one bit */
    e = u ? e : 0;                              /* zero input gets exponent field 0 */
    u = (u << lz) & 0x7fffffffffffffffUL;       /* left-justify, drop the leading one bit */
    ulong t = u & 0x7ffUL;                      /* 11 low bits that do not fit the 52-bit fraction */
    u = ((ulong)e << 52) | (u >> 11);
    return as_double((u + ((s & t) > 0)) | ((ulong)s & 0x8000000000000000UL));
}

/*
 * long -> double, round toward positive infinity: incremented only when the
 * input is non-negative and any discarded bit is set.
 */
ATTR double
convert_double_rtp(long l)
{
    long s = l >> 63;
    ulong u = as_ulong((l + s) ^ s);
    uint lz = clz(u);
    uint e = 1023U + 63U - lz;
    e = u ? e : 0;
    u = (u << lz) & 0x7fffffffffffffffUL;
    ulong t = u & 0x7ffUL;
    u = ((ulong)e << 52) | (u >> 11);
    return as_double((u + ((~s & t) > 0)) | ((ulong)s & 0x8000000000000000UL));
}

/* long -> double, round toward zero (discarded bits are dropped). */
ATTR double
convert_double_rtz(long l)
{
    long s = l >> 63;
    ulong u = as_ulong((l + s) ^ s);
    uint lz = clz(u);
    uint e = 1023U + 63U - lz;
    e = u ? e : 0;
    u = (u << lz) & 0x7fffffffffffffffUL;
    u = ((ulong)e << 52) | (u >> 11);
    return as_double(u | ((ulong)s & 0x8000000000000000UL));
}

/*
 * ulong -> double with the 11 discarded bits truncated.  Shared by the rtz
 * and rtn aliases below, which coincide for unsigned input.
 */
IATTR static double
cvt1f8_zu8(ulong u)
{
    uint lz = clz(u);
    uint e = 1023U + 63U - lz;
    e = u ? e : 0;
    u = (u << lz) & 0x7fffffffffffffffUL;
    return as_double(((ulong)e << 52) | (u >> 11));
}
AATTR("cvt1f8_zu8") double convert_double_rtz(ulong);
AATTR("cvt1f8_zu8") double convert_double_rtn(ulong);

/*
 * ulong -> double, round toward positive infinity: increment the truncated
 * encoding whenever any discarded bit is set.
 */
ATTR double
convert_double_rtp(ulong u)
{
    uint lz = clz(u);
    uint e = 1023U + 63U - lz;                  /* biased exponent of the leading one bit */
    e = u ?
e : 0; u = (u << lz) & 0x7fffffffffffffffUL; ulong t = u & 0x7ffUL; u = ((ulong)e << 52) | (u >> 11); return as_double(u + (t > 0UL)); }

/*
 * Integer -> half conversions with an explicit rounding mode.
 *
 * Every variant first converts the integer to float and then lets the
 * matching __cvt_f16_<mode>_f32 helper perform the single rounding step to
 * half.  Wide integers are clamped beforehand so that the int -> float step
 * stays exact.
 */

/*
 * short -> half, round toward negative infinity.
 * short -> float is exact, so the float -> half helper decides the rounding
 * and must be the rtn one: with the rtz helper a negative inexact input such
 * as -2049 would round toward zero (-2048) instead of down (-2050).
 */
ATTR half
convert_half_rtn(short s)
{
    return as_half((ushort)__cvt_f16_rtn_f32((float)s));
}

/* short -> half, round toward positive infinity. */
ATTR half
convert_half_rtp(short s)
{
    return as_half((ushort)__cvt_f16_rtp_f32((float)s));
}

/* short -> half, round toward zero. */
ATTR half
convert_half_rtz(short s)
{
    return as_half((ushort)__cvt_f16_rtz_f32((float)s));
}

/*
 * ushort -> half, truncating.  For unsigned input rounding toward zero and
 * toward negative infinity coincide, so both aliases share this body.
 */
IATTR static half
cvt1f2_zu2(ushort u)
{
    return as_half((ushort)__cvt_f16_rtz_f32((float)u));
}
AATTR("cvt1f2_zu2") half convert_half_rtn(ushort);
AATTR("cvt1f2_zu2") half convert_half_rtz(ushort);

/* ushort -> half, round toward positive infinity. */
ATTR half
convert_half_rtp(ushort u)
{
    return as_half((ushort)__cvt_f16_rtp_f32((float)u));
}

/*
 * int -> half, round toward negative infinity.
 * The input is clamped to [-65536, 65536] before the exact int -> float
 * step.  65536 lies just above the largest finite half (65504), so every
 * in-range value converts correctly and anything beyond it rounds exactly
 * like the unclamped value would.  (Clamping to the short range, as this
 * code used to, mis-converted magnitudes in 32768..65504 and could never
 * produce an infinity.)
 */
ATTR half
convert_half_rtn(int i)
{
    i = clamp(i, -65536, 65536);
    return as_half((ushort)__cvt_f16_rtn_f32((float)i));
}

/* int -> half, round toward positive infinity (same clamp as the rtn variant). */
ATTR half
convert_half_rtp(int i)
{
    i = clamp(i, -65536, 65536);
    return as_half((ushort)__cvt_f16_rtp_f32((float)i));
}

/* int -> half, round toward zero (same clamp as the rtn variant). */
ATTR half
convert_half_rtz(int i)
{
    i = clamp(i, -65536, 65536);
    return as_half((ushort)__cvt_f16_rtz_f32((float)i));
}

/*
 * uint -> half, truncating.  Clamping to USHRT_MAX (65535, above the largest
 * finite half) keeps the uint -> float step exact.  Shared by the rtn and
 * rtz aliases.
 */
IATTR static half
cvt1f2_zu4(uint u)
{
    u = min(u, (uint)USHRT_MAX);
    return as_half((ushort)__cvt_f16_rtz_f32((float)u));
}
AATTR("cvt1f2_zu4") half convert_half_rtn(uint);
AATTR("cvt1f2_zu4") half convert_half_rtz(uint);

/* uint -> half, round toward positive infinity. */
ATTR half
convert_half_rtp(uint u)
{
    u = min(u, (uint)USHRT_MAX);
    return as_half((ushort)__cvt_f16_rtp_f32((float)u));
}

/*
 * long -> half, round toward negative infinity.
 * Clamped to [-65536, 65536] for the same reason as the int variant; the
 * clamped value always fits in an int.
 */
ATTR half
convert_half_rtn(long l)
{
    int i = (int)clamp(l, -65536L, 65536L);
    return as_half((ushort)__cvt_f16_rtn_f32((float)i));
}

/* long -> half, round toward positive infinity. */
ATTR half
convert_half_rtp(long l)
{
    int i = (int)clamp(l, -65536L, 65536L);
    return as_half((ushort)__cvt_f16_rtp_f32((float)i));
}

/* long -> half, round toward zero. */
ATTR half
convert_half_rtz(long l)
{
    int i = (int)clamp(l, -65536L, 65536L);
    return as_half((ushort)__cvt_f16_rtz_f32((float)i));
}

/*
 * ulong -> half, truncating; the input is first limited to USHRT_MAX.
 */
IATTR static half
cvt1f2_zu8(ulong ul)
{
    uint u = (uint)min(ul, (ulong)USHRT_MAX);
    return
as_half((ushort)__cvt_f16_rtz_f32((float)u)); }
AATTR("cvt1f2_zu8") half convert_half_rtn(ulong);
AATTR("cvt1f2_zu8") half convert_half_rtz(ulong);

/* ulong -> half, round toward positive infinity; input limited to USHRT_MAX first. */
ATTR half
convert_half_rtp(ulong ul)
{
    uint u = (uint)min(ul, (ulong)USHRT_MAX);
    return as_half((ushort)__cvt_f16_rtp_f32((float)u));
}

/* float -> half with directed rounding: thin wrappers over __cvt_f16_<mode>_f32. */
ATTR half
convert_half_rtp(float a)
{
    return as_half((ushort)__cvt_f16_rtp_f32(a));
}

ATTR half
convert_half_rtn(float a)
{
    return as_half((ushort)__cvt_f16_rtn_f32(a));
}

ATTR half
convert_half_rtz(float a)
{
    return as_half((ushort)__cvt_f16_rtz_f32(a));
}

/* double -> half with directed rounding: thin wrappers over __cvt_f16_<mode>_f64. */
ATTR half
convert_half_rtp(double a)
{
    return as_half((ushort)__cvt_f16_rtp_f64(a));
}

ATTR half
convert_half_rtn(double a)
{
    return as_half((ushort)__cvt_f16_rtn_f64(a));
}

ATTR half
convert_half_rtz(double a)
{
    return as_half((ushort)__cvt_f16_rtz_f64(a));
}
ROCm-Device-Libs-rocm-5.0.0/opencl/src/misc/printf.cl000066400000000000000000000024751415221260100221640ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#ifndef NULL
#define NULL 0
#endif

// Byte offset added to every reservation inside the printf buffer.
// NOTE(review): presumably skips the two uint header words read below - confirm.
#define OFFSET 8

// Atomically reserves space to the printf data buffer and returns a pointer to it
//
// bytes: number of bytes to reserve.
// Returns a pointer to the reserved region, or NULL when
// OFFSET + current offset + bytes would exceed the buffer size.
__global char *
__printf_alloc(uint bytes)
{
    // The buffer address is taken from slot 3 of the implicit kernel arguments.
    __global char *ptr = (__global char *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3]);
    // Second uint of the buffer holds its size; the first uint is the
    // running offset that is advanced atomically below.
    uint size = ((__global uint *)ptr)[1];
    uint offset = atomic_load_explicit((__global atomic_uint *)ptr, memory_order_relaxed, memory_scope_device);

    for (;;) {
        if (OFFSET + offset + bytes > size)
            return NULL;

        // On failure the compare-exchange refreshes `offset` with the current
        // value, so the bounds check is repeated against it.
        if (atomic_compare_exchange_strong_explicit((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
            break;
    }
    return ptr + OFFSET + offset;
}

// printf stub to resolve link time dependencies.
// Will be replaced by the compiler.
// The body only calls __printf_alloc(0) and returns -1.
// NOTE(review): the call presumably exists to keep __printf_alloc referenced - confirm.
__attribute__((noinline)) __attribute__((optnone))
__attribute__((format(printf, 1, 2)))
int printf(__constant const char* st, ...)
{
    __printf_alloc(0);
    return -1;
}
ROCm-Device-Libs-rocm-5.0.0/opencl/src/misc/shuffle.cl000066400000000000000000000053051415221260100223110ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Stringize (after macro expansion of the argument).
#define _S(X) #X
#define S(X) _S(X)

// Token-paste (after macro expansion of the arguments).
#define _C(X,Y) X##Y
#define C(X,Y) _C(X,Y)

// <type>_utype: unsigned integer type of the same width, used as the mask
// element type for shuffles of <type>.
#define char_utype uchar
#define short_utype ushort
#define int_utype uint
#define long_utype ulong
#define float_utype uint
#define double_utype ulong
#define half_utype ushort

#define ATTR __attribute__((overloadable, const))
#define IATTR __attribute__((const))
// Overloadable declaration that is an alias of the symbol named A.
#define AATTR(A) __attribute__((overloadable, const, alias(A)))

// LIST<N>: the N selected elements t[m.s0] ... for an N-component mask m.
#define LIST2 t[m.s0], t[m.s1]
#define LIST4 LIST2, t[m.s2], t[m.s3]
#define LIST8 LIST4, t[m.s4], t[m.s5], t[m.s6], t[m.s7]
#define LIST16 LIST8, t[m.s8], t[m.s9], t[m.sa], t[m.sb], t[m.sc], t[m.sd], t[m.se], t[m.sf]

// GENIMN(M,N,T): shuffle/shuffle2 for integer element type T, taking an
// M-component source (two of them for shuffle2) and an N-component mask and
// returning an N-component result.  The source is spilled to a private array,
// each mask element is ANDed with M-1 (2*M-1 for shuffle2) and then used as
// an index.  The worker functions sh_/sh2_ are exposed through alias
// declarations for both the signed type T and the unsigned type uT.
#define GENIMN(M,N,T) \
IATTR T##N \
sh_##N##T##M(T##M x, C(T##_utype,N) m) \
{ \
    __attribute__((aligned(sizeof(T##M)))) T t[M]; \
    *(__private T##M *)t = x; \
    m &= (C(T##_utype,N))(M-1); \
    return (T##N) ( LIST##N ); \
} \
extern AATTR(S(sh_##N##T##M)) T##N shuffle(T##M, C(T##_utype,N)); \
extern AATTR(S(sh_##N##T##M)) u##T##N shuffle(u##T##M, C(T##_utype,N)); \
 \
IATTR T##N \
sh2_##N##T##M(T##M x, T##M y, C(T##_utype,N) m) \
{ \
    __attribute__((aligned(sizeof(T##M)))) T t[2*M]; \
    *(__private T##M *)t = x; \
    *(__private T##M *)(t + M) = y; \
    m &= (C(T##_utype,N))(2*M-1); \
    return (T##N) ( LIST##N ); \
} \
extern AATTR(S(sh2_##N##T##M)) T##N shuffle2(T##M, T##M, C(T##_utype,N)); \
extern AATTR(S(sh2_##N##T##M)) u##T##N shuffle2(u##T##M, u##T##M, C(T##_utype,N));

// GENIN(N,T): GENIMN for every source width (16, 8, 4, 2) at result width N.
#define GENIN(N,T) \
    GENIMN(16,N,T) \
    GENIMN(8,N,T) \
    GENIMN(4,N,T) \
    GENIMN(2,N,T)

// GENI(T): GENIN for every result width (16, 8, 4, 2).
#define GENI(T) \
    GENIN(16,T) \
    GENIN(8,T) \
    GENIN(4,T) \
    GENIN(2,T)

// Integer element types; each instantiation also declares the unsigned aliases.
GENI(char)
GENI(short)
GENI(int)
GENI(long)

// GENFMN(M,N,T): shuffle/shuffle2 for floating-point element type T, defined
// directly as overloads (no alias).  The source vector(s) are spilled to a
// private array, each mask element is ANDed with M-1 (2*M-1 for shuffle2) and
// then used as an index into that array.
#define GENFMN(M,N,T) \
ATTR T##N \
shuffle(T##M x, C(T##_utype,N) m) \
{ \
    __attribute__((aligned(sizeof(T##M)))) T t[M]; \
    *(__private T##M *)t = x; \
    m &= (C(T##_utype,N))(M-1); \
    return (T##N) ( LIST##N ); \
} \
 \
ATTR T##N \
shuffle2(T##M x, T##M y, C(T##_utype,N) m) \
{ \
    __attribute__((aligned(sizeof(T##M)))) T t[2*M]; \
    *(__private T##M *)t = x; \
    *(__private T##M *)(t + M) = y; \
    m &= (C(T##_utype,N))(2*M-1); \
    return (T##N) ( LIST##N ); \
}

// GENFN(N,T): GENFMN for every source width (16, 8, 4, 2) at result width N.
#define GENFN(N,T) \
    GENFMN(16,N,T) \
    GENFMN(8,N,T) \
    GENFMN(4,N,T) \
    GENFMN(2,N,T)

// GENF(T): GENFN for every result width (16, 8, 4, 2).
#define GENF(T) \
    GENFN(16,T) \
    GENFN(8,T) \
    GENFN(4,T) \
    GENFN(2,T)

// Floating-point element types.
GENF(float)
GENF(double)
GENF(half)
ROCm-Device-Libs-rocm-5.0.0/opencl/src/misc/workitem.cl000066400000000000000000000024151415221260100225150ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "ockl.h"

#define ATTR __attribute__((overloadable, const))

// OpenCL work-item query built-ins.  Each one forwards its argument (if any)
// unchanged to the __ockl_ function of the same name and returns its result.

ATTR size_t
get_global_offset(uint dim)
{
    return __ockl_get_global_offset(dim);
}

ATTR size_t
get_global_id(uint dim)
{
    return __ockl_get_global_id(dim);
}

ATTR size_t
get_local_id(uint dim)
{
    return __ockl_get_local_id(dim);
}

ATTR size_t
get_group_id(uint dim)
{
    return __ockl_get_group_id(dim);
}

ATTR size_t
get_global_size(uint dim)
{
    return __ockl_get_global_size(dim);
}

ATTR size_t
get_local_size(uint dim)
{
    return __ockl_get_local_size(dim);
}

ATTR size_t
get_num_groups(uint dim)
{
    return __ockl_get_num_groups(dim);
}

ATTR uint
get_work_dim(void)
{
    return __ockl_get_work_dim();
}

ATTR size_t
get_enqueued_local_size(uint dim)
{
    return __ockl_get_enqueued_local_size(dim);
}

ATTR size_t
get_global_linear_id(void)
{
    return __ockl_get_global_linear_id();
}

ATTR size_t
get_local_linear_id(void)
{
    return __ockl_get_local_linear_id();
}
ROCm-Device-Libs-rocm-5.0.0/opencl/src/pipes/000077500000000000000000000000001415221260100205175ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/pipes/commitp.cl000066400000000000000000000042651415221260100225160ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *                   ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "pipes.h"

#define ATTR __attribute__((always_inline))

// Pipe commit entry points.  Every function in this file has an empty body:
// committing a reservation performs no work here.  The *_SIZE macros define
// size-specialised variants with the same empty body; their DO_PIPE_SIZE
// instantiations are commented out, so only the generic functions are emitted.

#define COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \
ATTR void \
__commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
{ \
}

// DO_PIPE_SIZE(COMMIT_READ_PIPE_SIZE)

// Commit a work-item read reservation `rid` on pipe `p` (no-op).
ATTR void
__commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
{
}

#define COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \
ATTR void \
__commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
{ \
}

// DO_PIPE_SIZE(COMMIT_WRITE_PIPE_SIZE)

// Commit a work-item write reservation `rid` on pipe `p` (no-op).
ATTR void
__commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
{
}

// Work group functions

#define WORK_GROUP_COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \
ATTR void \
__work_group_commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
{ \
}

// DO_PIPE_SIZE(WORK_GROUP_COMMIT_READ_PIPE_SIZE)

// Commit a work-group read reservation (no-op).
ATTR void
__work_group_commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
{
}

#define WORK_GROUP_COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \
ATTR void \
__work_group_commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
{ \
}

// DO_PIPE_SIZE(WORK_GROUP_COMMIT_WRITE_PIPE_SIZE)

// Commit a work-group write reservation (no-op).
ATTR void
__work_group_commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
{
}

// sub group functions

#define SUB_GROUP_COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \
ATTR void \
__sub_group_commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
{ \
}

// DO_PIPE_SIZE(SUB_GROUP_COMMIT_READ_PIPE_SIZE)

// Commit a sub-group read reservation (no-op).
ATTR void
__sub_group_commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
{
}

#define SUB_GROUP_COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \
ATTR void \
__sub_group_commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
{ \
}

// DO_PIPE_SIZE(SUB_GROUP_COMMIT_WRITE_PIPE_SIZE)

// Commit a sub-group write reservation (no-op).
ATTR void
__sub_group_commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
{
}
ROCm-Device-Libs-rocm-5.0.0/opencl/src/pipes/getp.cl000066400000000000000000000022011415221260100217710ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "pipes.h"

#define ATTR __attribute__((always_inline, pure))

// Difference between the pipe's write index and read index, truncated to
// uint. Both indices are read with relaxed, device-scope atomic loads, the
// read index first.
static ATTR uint
pipe_fill(__global struct pipeimp* p)
{
    size_t rd = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
    size_t wr = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
    return (uint)(wr - rd);
}

// The pipe's end_idx field, truncated to uint.
static ATTR uint
pipe_capacity(__global struct pipeimp* p)
{
    return (uint)p->end_idx;
}

// The _ro and _wo variants are identical; size and align are not used.

ATTR uint
__get_pipe_num_packets_ro(__global struct pipeimp* p, uint size, uint align)
{
    return pipe_fill(p);
}

ATTR uint
__get_pipe_num_packets_wo(__global struct pipeimp* p, uint size, uint align)
{
    return pipe_fill(p);
}

ATTR uint
__get_pipe_max_packets_ro(__global struct pipeimp* p, uint size, uint align)
{
    return pipe_capacity(p);
}

ATTR uint
__get_pipe_max_packets_wo(__global struct pipeimp* p, uint size, uint align)
{
    return pipe_capacity(p);
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/pipes/memcpyia.cl000066400000000000000000000023761415221260100226530ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

// Element-wise copy of size / sizeof(T) elements of type T from s to d.
// Expands inside __memcpy_internal_aligned and uses its d, s and size.
#define COPY_ELEMENTS(T) \
    do { \
        T *dst = (T *)d; \
        T *src = (T *)s; \
        T *stop = src + size / sizeof(T); \
        while (src < stop) \
            *dst++ = *src++; \
    } while (0)

// Copies size bytes from s to d with an element width chosen from align:
// 2, 4, 8 and 16 select short, int, long and long2 elements; 32, 64 and 128
// all select long4 elements; any other value copies one char at a time.
// The element count is size divided by the element width, so a trailing
// partial element is not copied.
void
__memcpy_internal_aligned(void *d, const void *s, size_t size, size_t align)
{
    switch (align) {
    case 2:
        COPY_ELEMENTS(short);
        break;
    case 4:
        COPY_ELEMENTS(int);
        break;
    case 8:
        COPY_ELEMENTS(long);
        break;
    case 16:
        COPY_ELEMENTS(long2);
        break;
    case 32:
    case 64:
    case 128:
        COPY_ELEMENTS(long4);
        break;
    default:
        COPY_ELEMENTS(char);
        break;
    }
}

#undef COPY_ELEMENTS

ROCm-Device-Libs-rocm-5.0.0/opencl/src/pipes/pipes.h000066400000000000000000000055751415221260100220240ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "irif.h"

#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable

extern size_t __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n);

// Applies F(SIZE, STYPE) once for each listed packet size in bytes, paired
// with an unsigned type name.
#define DO_PIPE_SIZE(F) \
F(1,uchar) \
F(2,ushort) \
F(4,uint) \
F(8,ulong) \
F(16,ulong2) \
F(32,ulong4) \
F(64,ulong8) \
F(128,ulong16)

// Pipe object: running read and write indices, the end index used as the
// wrap modulus, padding that places the packet storage at byte offset 128,
// and the packet storage itself (declared with one element, indexed beyond).
struct pipeimp {
    atomic_size_t read_idx;
    atomic_size_t write_idx;
    size_t end_idx;
    uchar pad[128 - 3*sizeof(size_t)];
    uchar packets[1];
};

extern void __memcpy_internal_aligned(void *, const void *, size_t, size_t);

// Atomically advances *pi by n provided the result does not exceed lim.
// Returns the index value before the advance, or ~(size_t)0 when
// i + n > lim. A failed compare-exchange refreshes i with the current value
// and the limit check is repeated.
static __attribute__((always_inline)) size_t
reserve(volatile __global atomic_size_t *pi, size_t lim, size_t n)
{
    size_t i = __opencl_atomic_load(pi, memory_order_relaxed, memory_scope_device);

    for (;;) {
        if (i + n > lim)
            return ~(size_t)0;

        if (__opencl_atomic_compare_exchange_strong(pi, &i, i + n, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
            break;
    }

    return i;
}

// Reserves one index for each active lane of the wave. The lane whose mbcnt
// value l is 0 performs one reservation of n indices, where n is the number
// of bits set in the exec mask. The result is broadcast and each lane takes
// base + l. If the combined reservation does not fit, each lane falls back to
// reserving a single index with reserve().
static inline size_t
wave_reserve_1(volatile __global atomic_size_t *pi, size_t lim)
{
    ulong n = __builtin_popcountl(__builtin_amdgcn_read_exec());
    uint l = __builtin_amdgcn_mbcnt_hi(__builtin_amdgcn_read_exec_hi(),
                                       __builtin_amdgcn_mbcnt_lo(__builtin_amdgcn_read_exec_lo(), 0u));
    size_t i = 0;

    if (l == 0) {
        i = __opencl_atomic_load(pi, memory_order_relaxed, memory_scope_device);

        for (;;) {
            if (i + n > lim) {
                i = ~(size_t)0;
                break;
            }

            if (__opencl_atomic_compare_exchange_strong(pi, &i, i + n, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
                break;
        }
    }

    __builtin_amdgcn_wave_barrier();

    // Broadcast the result; the ctz tells us which lane has active lane id 0
    uint k = (uint)__llvm_cttz_i64(__builtin_amdgcn_read_exec());
    i = ((size_t)__builtin_amdgcn_readlane((uint)(i >> 32), k) << 32) |
        (size_t)__builtin_amdgcn_readlane((uint)i, k);

    __builtin_amdgcn_wave_barrier();

    if (i != ~(size_t)0)
        i += l;
    else {
        // The entire group didn't fit, have to handle one by one
        i = reserve(pi, lim, (size_t)1);
    }

    return i;
}

// Maps a running index i onto [0, n): i itself when i < n, otherwise i % n.
// When the upper 32 bits of i are zero the comparison and modulo are done in
// 32-bit arithmetic.
static inline size_t
wrap(size_t i, size_t n)
{
    // Assume end_i < 2^32
    size_t ret;
    if (as_uint2(i).y == 0U) {
        uint j = (uint)i;
        uint m = (uint)n;
        if (j < m)
            ret = i;
        else
            ret = (ulong)(j % m);
    } else
        ret = i % n;
    return ret;
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/pipes/readp.cl000066400000000000000000000044371415221260100221420ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "pipes.h"

#define ATTR __attribute__((always_inline))

// Unreserved read of one SIZE-byte packet into *ptr. Returns -1 when no read
// index could be reserved below the write index, 0 otherwise. When the index
// obtained is the last one below the write index, both pipe indices are
// reset to 0.
#define READ_PIPE_SIZE(SIZE, STYPE) \
ATTR int \
__read_pipe_2_##SIZE(__global struct pipeimp* p, STYPE* ptr) \
{ \
    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
    size_t ri = wave_reserve_1(&p->read_idx, wi); \
    if (ri == ~(size_t)0) \
        return -1; \
 \
    size_t pi = wrap(ri, p->end_idx); \
    *ptr = ((__global STYPE *)p->packets)[pi]; \
 \
    if (ri == wi-1) { \
        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
    } \
 \
    return 0; \
}

DO_PIPE_SIZE(READ_PIPE_SIZE)

// Same as the sized variants, for an arbitrary packet size: the packet is
// copied out with __memcpy_internal_aligned.
ATTR int
__read_pipe_2(__global struct pipeimp* p, void* ptr, uint size, uint align)
{
    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
    size_t ri = wave_reserve_1(&p->read_idx, wi);
    if (ri == ~(size_t)0)
        return -1;

    size_t pi = wrap(ri, p->end_idx);
    __memcpy_internal_aligned(ptr, p->packets + pi*size, size, align);

    if (ri == wi-1) {
        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
    }

    return 0;
}

// Reserved read: reads the packet at index (rid + i), wrapped by end_idx.
// Always returns 0.
#define READ_PIPE_RESERVED_SIZE(SIZE, STYPE) \
ATTR int \
__read_pipe_4_##SIZE(__global struct pipeimp* p, reserve_id_t rid, uint i, STYPE* ptr) \
{ \
    size_t rin = __builtin_astype(rid, size_t) + i; \
    size_t pi = wrap(rin, p->end_idx); \
    *ptr = ((__global STYPE *)p->packets)[pi]; \
 \
    return 0; \
}

DO_PIPE_SIZE(READ_PIPE_RESERVED_SIZE)

ATTR int
__read_pipe_4(__global struct pipeimp* p, reserve_id_t rid, uint i, void *ptr, uint size, uint align)
{
    size_t rin = __builtin_astype(rid, size_t) + i;
    size_t pi = wrap(rin, p->end_idx);
    __memcpy_internal_aligned(ptr, p->packets + pi*size, size, align);

    return 0;
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/pipes/reservep.cl000066400000000000000000000170621415221260100227000ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#pragma OPENCL EXTENSION cl_khr_subgroups : enable

#include "pipes.h"
#include "wgscratch.h"

#define ATTR __attribute__((always_inline))

// Resets both pipe indices to 0 when a successful read reservation
// [rid, rid + num_packets) ends exactly at the write index wi.
//
// The sentinel check is required. A failed reservation yields ~(size_t)0, and
// ~(size_t)0 + num_packets wraps to num_packets - 1. Without the check, a
// failed request for exactly wi + 1 packets would compare equal to wi and
// reset a pipe that still holds unread packets.
static ATTR void
reset_if_drained(__global struct pipeimp *p, size_t rid, uint num_packets, size_t wi)
{
    if (rid != ~(size_t)0 && rid + num_packets == wi) {
        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
    }
}

// The *_SIZE generators in this file are defined but not instantiated: every
// DO_PIPE_SIZE(...) line is commented out.

#define RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \
ATTR reserve_id_t \
__reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
{ \
    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
    size_t rid = __amd_wresvn(&p->read_idx, wi, num_packets); \
 \
    reset_if_drained(p, rid, num_packets, wi); \
 \
    return __builtin_astype(rid, reserve_id_t); \
}

// DO_PIPE_SIZE(RESERVE_READ_PIPE_SIZE)

// Reserves num_packets read indices below the current write index. Returns
// the first reserved index as a reserve_id_t, or the bit pattern ~(size_t)0
// when the reservation failed.
ATTR reserve_id_t
__reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
{
    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
    size_t rid = __amd_wresvn(&p->read_idx, wi, num_packets);

    reset_if_drained(p, rid, num_packets, wi);

    return __builtin_astype(rid, reserve_id_t);
}

#define RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \
ATTR reserve_id_t \
__reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
{ \
    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
    size_t ei = p->end_idx; \
    size_t rid = __amd_wresvn(&p->write_idx, ri + ei, num_packets); \
    return __builtin_astype(rid, reserve_id_t); \
}

// DO_PIPE_SIZE(RESERVE_WRITE_PIPE_SIZE)

// Reserves num_packets write indices; the limit is read index + end_idx.
ATTR reserve_id_t
__reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
{
    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
    size_t ei = p->end_idx;
    size_t rid = __amd_wresvn(&p->write_idx, ri + ei, num_packets);
    return __builtin_astype(rid, reserve_id_t);
}

// Work group functions
//
// The work-item whose local linear id is 0 performs the reservation and
// stores the result in the scratch LDS slot. After a work-group barrier every
// work-item returns the value found there.

#define WORK_GROUP_RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \
ATTR reserve_id_t \
__work_group_reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
{ \
    __local size_t *t = (__local size_t *)__get_scratch_lds(); \
 \
    if ((int)get_local_linear_id() == 0) { \
        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
        size_t rid = reserve(&p->read_idx, wi, num_packets); \
 \
        reset_if_drained(p, rid, num_packets, wi); \
 \
        *t = rid; \
    } \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
 \
    return __builtin_astype(*t, reserve_id_t); \
}

// DO_PIPE_SIZE(WORK_GROUP_RESERVE_READ_PIPE_SIZE)

ATTR reserve_id_t
__work_group_reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
{
    __local size_t *t = (__local size_t *)__get_scratch_lds();

    if ((int)get_local_linear_id() == 0) {
        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
        size_t rid = reserve(&p->read_idx, wi, num_packets);

        reset_if_drained(p, rid, num_packets, wi);

        *t = rid;
    }

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    return __builtin_astype(*t, reserve_id_t);
}

#define WORK_GROUP_RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \
ATTR reserve_id_t \
__work_group_reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
{ \
    __local size_t *t = (__local size_t *)__get_scratch_lds(); \
 \
    if ((int)get_local_linear_id() == 0) { \
        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
        size_t ei = p->end_idx; \
        *t = reserve(&p->write_idx, ri + ei, num_packets); \
    } \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
 \
    return __builtin_astype(*t, reserve_id_t); \
}

// DO_PIPE_SIZE(WORK_GROUP_RESERVE_WRITE_PIPE_SIZE)

ATTR reserve_id_t
__work_group_reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
{
    __local size_t *t = (__local size_t *)__get_scratch_lds();

    if ((int)get_local_linear_id() == 0) {
        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
        size_t ei = p->end_idx;
        *t = reserve(&p->write_idx, ri + ei, num_packets);
    }

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    return __builtin_astype(*t, reserve_id_t);
}

// sub group functions
//
// The work-item whose sub-group local id is 0 performs the reservation; the
// result is then broadcast from sub-group lane 0.

#define SUB_GROUP_RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \
ATTR reserve_id_t \
__sub_group_reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
{ \
    size_t rid = ~(size_t)0; \
 \
    if (get_sub_group_local_id() == 0) { \
        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
        rid = reserve(&p->read_idx, wi, num_packets); \
 \
        reset_if_drained(p, rid, num_packets, wi); \
    } \
 \
    return __builtin_astype(sub_group_broadcast(rid, 0), reserve_id_t); \
}

// DO_PIPE_SIZE(SUB_GROUP_RESERVE_READ_PIPE_SIZE)

ATTR reserve_id_t
__sub_group_reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
{
    size_t rid = ~(size_t)0;

    if (get_sub_group_local_id() == 0) {
        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
        rid = reserve(&p->read_idx, wi, num_packets);

        reset_if_drained(p, rid, num_packets, wi);
    }

    return __builtin_astype(sub_group_broadcast(rid, 0), reserve_id_t);
}

#define SUB_GROUP_RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \
ATTR reserve_id_t \
__sub_group_reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
{ \
    size_t rid = ~(size_t)0; \
 \
    if (get_sub_group_local_id() == 0) { \
        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
        size_t ei = p->end_idx; \
        rid = reserve(&p->write_idx, ri + ei, num_packets); \
    } \
 \
    return __builtin_astype(sub_group_broadcast(rid, 0), reserve_id_t); \
}

// DO_PIPE_SIZE(SUB_GROUP_RESERVE_WRITE_PIPE_SIZE)

ATTR reserve_id_t
__sub_group_reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
{
    size_t rid = ~(size_t)0;

    if (get_sub_group_local_id() == 0) {
        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
        size_t ei = p->end_idx;
        rid = reserve(&p->write_idx, ri + ei, num_packets);
    }

    return __builtin_astype(sub_group_broadcast(rid, 0), reserve_id_t);
}
ROCm-Device-Libs-rocm-5.0.0/opencl/src/pipes/validp.cl000066400000000000000000000007071415221260100223220ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

// A reserve id is valid unless its bit pattern is all ones.
__attribute__((overloadable, always_inline)) bool
is_valid_reserve_id(reserve_id_t rid)
{
    return as_ulong(rid) != ~(size_t)0;
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/pipes/wresvnp.cl000066400000000000000000000120661415221260100225500ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "pipes.h"

// Wave-cooperative reservation of n indices per lane from *pidx, limited by
// lim. Each lane's n is combined by an inclusive add scan, one lane reserves
// the scanned total with reserve(), the base index is broadcast, and each lane
// returns base + (its inclusive sum - its own n). If the combined reservation
// fails, every lane retries on its own with reserve(pidx, lim, n).
// Returns the first reserved index, or ~(size_t)0 on failure.
size_t
__amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
{
    // alc: number of bits set in the exec mask. l: mbcnt over an all-ones mask.
    uint alc = (size_t)(__builtin_popcount(__builtin_amdgcn_read_exec_lo()) +
                        __builtin_popcount(__builtin_amdgcn_read_exec_hi()));
    uint l = __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0u));
    size_t rid;

    // NOTE(review): when all 64 lanes are active alc is 64. OpenCL C takes a
    // shift count modulo the operand width, so (1UL << alc) - 1UL would
    // evaluate to 0 and a fully active 64-lane wave would use the general
    // path below - confirm whether that is intended.
    if (__builtin_amdgcn_read_exec() == (1UL << alc) - 1UL) {
        // Handle fully active subgroup
        uint sum = sub_group_scan_inclusive_add((uint)n);
        size_t idx = 0;
        // The lane with l == alc-1 reserves the total for all lanes.
        if (l == alc-1) {
            idx = reserve(pidx, lim, (size_t)sum);
        }
        idx = sub_group_broadcast(idx, alc-1);
        rid = idx + (size_t)(sum - (uint)n);
        rid = idx != ~(size_t)0 ? rid : idx;
    } else {
        // Inclusive add scan with not all lanes active
        //
        // smask holds the exec bits below this lane. Each step takes the
        // highest remaining bit as the source lane (slid is -1 when none is
        // left), fetches that lane's value with ds_bpermute and adds it. Bits
        // are then cleared from the top of smask so that the next step reaches
        // further back. The "& nomsb" keeps the clear a no-op when slid is -1.
        const ulong nomsb = 0x7fffffffffffffffUL;

        // Step 1
        ulong smask = __builtin_amdgcn_read_exec() & ((0x1UL << l) - 0x1UL);
        int slid = 63 - (int)clz(smask);
        uint t = __builtin_amdgcn_ds_bpermute(slid << 2, n);
        uint sum = n + (slid < 0 ? 0 : t);
        smask ^= (0x1UL << slid) & nomsb;

        // Step 2
        slid = 63 - (int)clz(smask);
        t = __builtin_amdgcn_ds_bpermute(slid << 2, sum);
        sum += slid < 0 ? 0 : t;

        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;

        // Step 3
        slid = 63 - (int)clz(smask);
        t = __builtin_amdgcn_ds_bpermute(slid << 2, sum);
        sum += slid < 0 ? 0 : t;

        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;

        // Step 4
        slid = 63 - (int)clz(smask);
        t = __builtin_amdgcn_ds_bpermute(slid << 2, sum);
        sum += slid < 0 ? 0 : t;

        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;

        // Step 5
        slid = 63 - (int)clz(smask);
        t = __builtin_amdgcn_ds_bpermute(slid << 2, sum);
        sum += slid < 0 ? 0 : t;

        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;
        slid = 63 - (int)clz(smask);
        smask ^= (0x1UL << slid) & nomsb;

        // Step 6
        slid = 63 - (int)clz(smask);
        t = __builtin_amdgcn_ds_bpermute(slid << 2, sum);
        sum += slid < 0 ? 0 : t;
        __builtin_amdgcn_wave_barrier();

        size_t idx = 0;
        // The lane at the highest set exec bit reserves its inclusive sum.
        if (l == 63 - (int)clz(__builtin_amdgcn_read_exec())) {
            idx = reserve(pidx, lim, (size_t)sum);
        }
        __builtin_amdgcn_wave_barrier();

        // Broadcast
        uint k = 63u - (uint)clz(__builtin_amdgcn_read_exec());
        idx = ((size_t)__builtin_amdgcn_readlane((uint)(idx >> 32), k) << 32) |
              (size_t)__builtin_amdgcn_readlane((uint)idx, k);
        __builtin_amdgcn_wave_barrier();

        rid = idx + (size_t)(sum - (uint)n);
        rid = idx != ~(size_t)0 ? rid : idx;
    }

    if (rid == ~(size_t)0) {
        // Try again one at a time
        rid = reserve(pidx, lim, n);
    }

    return rid;
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/pipes/writep.cl000066400000000000000000000036711415221260100223600ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License.
See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "pipes.h"

#define ATTR __attribute__((always_inline))

// Unreserved write of one SIZE-byte packet taken from *ptr. A write index is
// reserved below (read index + end_idx). Returns -1 when no index could be
// reserved; otherwise stores the packet at the wrapped index and returns 0.
#define WRITE_PIPE_SIZE(SIZE, STYPE) \
ATTR int \
__write_pipe_2_##SIZE(__global struct pipeimp* p, const STYPE* ptr) \
{ \
    size_t rd = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
    size_t cap = p->end_idx; \
    size_t at = wave_reserve_1(&p->write_idx, rd+cap); \
    int status = -1; \
 \
    if (at != ~(size_t)0) { \
        ((__global STYPE *)p->packets)[wrap(at, cap)] = *ptr; \
        status = 0; \
    } \
 \
    return status; \
}

DO_PIPE_SIZE(WRITE_PIPE_SIZE)

// Same as the sized variants, for an arbitrary packet size: the packet is
// copied in with __memcpy_internal_aligned.
ATTR int
__write_pipe_2(__global struct pipeimp* p, const void* ptr, uint size, uint align)
{
    size_t rd = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
    size_t cap = p->end_idx;
    size_t at = wave_reserve_1(&p->write_idx, rd+cap);
    int status = -1;

    if (at != ~(size_t)0) {
        size_t slot = wrap(at, cap);
        __memcpy_internal_aligned(p->packets + slot*size, ptr, size, align);
        status = 0;
    }

    return status;
}

// Reserved write: stores *ptr at index (rid + i), wrapped by end_idx.
// Always returns 0.
#define WRITE_PIPE_RESERVED_SIZE(SIZE, STYPE) \
ATTR int \
__write_pipe_4_##SIZE(__global struct pipeimp* p, reserve_id_t rid, uint i, const STYPE* ptr) \
{ \
    size_t slot = wrap(__builtin_astype(rid, size_t) + i, p->end_idx); \
    ((__global STYPE *)p->packets)[slot] = *ptr; \
    return 0; \
}

DO_PIPE_SIZE(WRITE_PIPE_RESERVED_SIZE)

ATTR int
__write_pipe_4(__global struct pipeimp* p, reserve_id_t rid, uint i, const void *ptr, uint size, uint align)
{
    size_t slot = wrap(__builtin_astype(rid, size_t) + i, p->end_idx);
    __memcpy_internal_aligned(p->packets + slot*size, ptr, size, align);

    return 0;
}

ROCm-Device-Libs-rocm-5.0.0/opencl/src/relational/000077500000000000000000000000001415221260100215315ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/relational/anyall.cl000066400000000000000000000023331415221260100233320ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 *
ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

// Generates the any() and all() overloads for char, short, int and long in
// widths 16, 8, 4, 3, 2 and scalar. The vector is reduced to one scalar with
// | (any) or & (all); the result is whether that scalar's top bit is set.

#define ATTR __attribute__((overloadable, const))

// Top-bit mask for each element type.
#define char_mask ((char)1 << 7)
#define short_mask ((short)1 << 15)
#define int_mask ((int)1 << 31)
#define long_mask ((long)1 << 63)

// Reduction operator selected by function name (F##_op).
#define any_op |
#define all_op &

// REDn(T,O) leaves the reduced value in a scalar named `a`. The scalar case
// (empty N) expands to nothing because the parameter is already named `a`.
#define RED(T,O)

#define RED2(T,O) \
    T a = a2.lo O a2.hi

#define RED3(T,O) \
    T a = a3.s0 O a3.s1 O a3.s2

// Wider vectors fold their halves and defer to the next narrower reduction.
#define RED4(T,O) \
    T##2 a2 = a4.hi O a4.lo; \
    RED2(T,O)

#define RED8(T,O) \
    T##4 a4 = a8.hi O a8.lo; \
    RED4(T,O)

#define RED16(T,O) \
    T##8 a8 = a16.hi O a16.lo; \
    RED8(T,O)

#define RET(T) return (a & T##_mask) != (T)0

// One overload: function F, vector width N (empty for scalar), element type T.
#define GENNT(F,N,T) \
ATTR int \
F(T##N a##N) \
{ \
    RED##N(T,F##_op); \
    RET(T); \
}

#define GENT(F,T) \
    GENNT(F,16,T) \
    GENNT(F,8,T) \
    GENNT(F,4,T) \
    GENNT(F,3,T) \
    GENNT(F,2,T) \
    GENNT(F,,T)

#define GEN(F) \
    GENT(F,char) \
    GENT(F,short) \
    GENT(F,int) \
    GENT(F,long)

GEN(any)
GEN(all)

ROCm-Device-Libs-rocm-5.0.0/opencl/src/relational/bselect.cl000066400000000000000000000031671415221260100235010ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

// Generates the bitselect() overloads. Floating-point overloads reinterpret
// their arguments as same-width signed integers and call the integer
// overload. Integer overloads are aliases of one static function per signed
// type and width.

// Stringify and token-paste helpers; the two-level form expands arguments
// before stringifying or pasting.
#define _S(X) #X
#define S(X) _S(X)

#define _C(A,B) A##B
#define C(A,B) _C(A,B)

#define ATTR __attribute__((overloadable, const))
#define IATTR __attribute__((const))
#define AATTR(S) __attribute__((overloadable, const, alias(S)))

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Unsigned (utype) and signed (itype) integer type of the same width.
#define char_utype uchar
#define short_utype ushort
#define int_utype uint
#define long_utype ulong

#define float_itype int
#define float_utype uint
#define double_itype long
#define double_utype ulong
#define half_itype short
#define half_utype ushort

// Floating-point overload for type T, width N (empty for scalar).
#define FGENN(N,T) \
ATTR T##N \
bitselect(T##N a, T##N b, T##N c) \
{ \
    return as_##T##N(bitselect(C(as_,C(T##_itype,N))(a), C(as_,C(T##_itype,N))(b), C(as_,C(T##_itype,N))(c))); \
} \

#define FGEN(T) \
    FGENN(16,T) \
    FGENN(8,T) \
    FGENN(4,T) \
    FGENN(3,T) \
    FGENN(2,T) \
    FGENN(,T)

FGEN(float)
FGEN(double)
FGEN(half)

// Integer implementation: a ^ ((a ^ b) & c) yields b's bit where c's bit is
// set and a's bit elsewhere. The signed and unsigned bitselect overloads are
// both declared as aliases of bsel_<T><N>.
#define IGENN(N,T) \
IATTR static T##N \
bsel_##T##N(T##N a, T##N b, T##N c) \
{ \
    return a ^ ((a ^ b) & c); \
} \
extern AATTR(S(bsel_##T##N)) T##N bitselect(T##N, T##N, T##N); \
extern AATTR(S(bsel_##T##N)) C(T##_utype,N) bitselect(C(T##_utype,N), C(T##_utype,N), C(T##_utype,N));

#define IGEN(T) \
    IGENN(16,T) \
    IGENN(8,T) \
    IGENN(4,T) \
    IGENN(3,T) \
    IGENN(2,T) \
    IGENN(,T)

IGEN(char)
IGEN(short)
IGEN(int)
IGEN(long)

ROCm-Device-Libs-rocm-5.0.0/opencl/src/relational/predicates.cl000066400000000000000000000056531415221260100242050ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "ocml.h"

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Generates the unary floating-point predicates by forwarding to the
// matching __ocml_ functions. Scalar overloads return the __ocml_ result
// unchanged as int. Vector overloads built from scalar calls negate each
// element's result.

#define ATTR __attribute__((overloadable, const))

#define _C(A,B) A##B
#define C(A,B) _C(A,B)

// __ocml_ name suffix per type: ssuf for the scalar function, psuf for the
// 2-wide half function.
#define float_ssuf _f32
#define double_ssuf _f64
#define half_ssuf _f16
#define half_psuf _2f16

// Element type of the vector result for each argument type.
#define float_rtype int
#define double_rtype long
#define half_rtype short

#define SNAME(F,T) C(__ocml_,C(F,T##_ssuf))
#define PNAME(F,T) C(__ocml_,C(F,T##_psuf))

// Element lists built from negated scalar calls, one per component of x.
#define USLST2(F,T) -SNAME(F,T)(x.s0), -SNAME(F,T)(x.s1)
#define USLST3(F,T) USLST2(F,T), -SNAME(F,T)(x.s2)
#define USLST4(F,T) USLST2(F,T), -SNAME(F,T)(x.s2), -SNAME(F,T)(x.s3)
#define USLST8(F,T) USLST4(F,T), -SNAME(F,T)(x.s4), -SNAME(F,T)(x.s5), -SNAME(F,T)(x.s6), -SNAME(F,T)(x.s7)
#define USLST16(F,T) USLST8(F,T), -SNAME(F,T)(x.s8), -SNAME(F,T)(x.s9), -SNAME(F,T)(x.sa), -SNAME(F,T)(x.sb), -SNAME(F,T)(x.sc), -SNAME(F,T)(x.sd), -SNAME(F,T)(x.se), -SNAME(F,T)(x.sf)

// Element lists built from 2-wide calls on component pairs. The pair results
// are used without negation (presumably already in vector form - TODO
// confirm against ocml); the odd third component of a 3-vector uses the
// negated scalar call.
#define UPLST3(F,T) PNAME(F,T)(x.s01), -SNAME(F,T)(x.s2)
#define UPLST4(F,T) PNAME(F,T)(x.s01), PNAME(F,T)(x.s23)
#define UPLST8(F,T) UPLST4(F,T), PNAME(F,T)(x.s45), PNAME(F,T)(x.s67)
#define UPLST16(F,T) UPLST8(F,T), PNAME(F,T)(x.s89), PNAME(F,T)(x.sab), PNAME(F,T)(x.scd), PNAME(F,T)(x.sef)

// N-wide overload assembled from scalar calls.
#define USGENTN(N,F,T) \
ATTR C(T##_rtype,N) \
F(T##N x) \
{ \
    return (C(T##_rtype,N)) ( USLST##N(F,T) ); \
}

// N-wide overload assembled from 2-wide calls.
#define UPGENTN(N,F,T) \
ATTR C(T##_rtype,N) \
F(T##N x) \
{ \
    return (C(T##_rtype,N)) ( UPLST##N(F,T) ); \
}

// Scalar overload.
#define UGENT1(F,T) \
ATTR int \
F(T x) \
{ \
    return SNAME(F,T)(x); \
}

// 2-wide overload that forwards directly to the 2-wide function.
#define UGENT2(F,T) \
ATTR C(T##_rtype,2) \
F(T##2 x) \
{ \
    return PNAME(F,T)(x); \
}

#define USGENT(F,T) \
    USGENTN(16,F,T) \
    USGENTN(8,F,T) \
    USGENTN(4,F,T) \
    USGENTN(3,F,T) \
    USGENTN(2,F,T) \
    UGENT1(F,T)

#define UPGENT(F,T) \
    UPGENTN(16,F,T) \
    UPGENTN(8,F,T) \
    UPGENTN(4,F,T) \
    UPGENTN(3,F,T) \
    UGENT2(F,T) \
    UGENT1(F,T)

// float and double use the scalar-built forms; half uses the 2-wide forms.
#define UGEN(F) \
    USGENT(F,float) \
    USGENT(F,double) \
    UPGENT(F,half)

UGEN(isfinite)
UGEN(isinf)
UGEN(isnan)
UGEN(isnormal)
UGEN(signbit) #define BGENTN(N,F,T,E) \ ATTR C(T##_rtype,N) \ F(T##N x, T##N y) \ { \ return E; \ } #define BGENT1(F,T,E) \ ATTR int \ F(T x, T y) \ { \ return E; \ } #define BGENT(F,T,E) \ BGENTN(16,F,T,E) \ BGENTN(8,F,T,E) \ BGENTN(4,F,T,E) \ BGENTN(3,F,T,E) \ BGENTN(2,F,T,E) \ BGENT1(F,T,E) #define BGEN(F,E) \ BGENT(F,float,E) \ BGENT(F,double,E) \ BGENT(F,half,E) BGEN(isequal,x==y) BGEN(isnotequal,x!=y) BGEN(isgreater,x>y) BGEN(isgreaterequal,x>=y) BGEN(isless,x> 6U; else return (wgs + 31U) >> 5U; } CATTR uint get_enqueued_num_sub_groups(void) { uint wgs = mul24((uint)get_enqueued_local_size(2), mul24((uint)get_enqueued_local_size(1), (uint)get_enqueued_local_size(0))); if (__oclc_wavefrontsize64) return (wgs + 63U) >> 6U; else return (wgs + 31U) >> 5U; } CATTR uint get_sub_group_id(void) { if (__oclc_wavefrontsize64) return (uint)get_local_linear_id() >> 6U; else return (uint)get_local_linear_id() >> 5U; } CATTR uint get_sub_group_local_id(void) { return __ockl_lane_u32(); } ROCm-Device-Libs-rocm-5.0.0/opencl/src/subgroup/subredscan.cl000066400000000000000000000026721415221260100237250ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/

#include "ockl.h"

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Generates the sub_group_reduce_<op> and sub_group_scan_{inclusive,exclusive}_<op>
// overloads for add, max and min. Each overload forwards to the
// __ockl_wfred_<op>_<type> or __ockl_wfscan_<op>_<type> function.

#define _C(X,Y) X ## Y
#define C(X,Y) _C(X,Y)

#define red_full reduce
#define scan_full scan

// Common prefix of the forwarded-to functions.
#define PFX __ockl_wf

#define ATTR __attribute__((overloadable))

// OpenCL type name for each type suffix.
#define i32_tn int
#define u32_tn uint
#define i64_tn long
#define u64_tn ulong
#define f32_tn float
#define f64_tn double
#define f16_tn half

// Name fragment selected by the boolean passed as the scan's second argument.
#define true_inc inclusive_
#define false_inc exclusive_

// Reduction overload for operation O and type suffix T.
#define GENROT(O,T) \
ATTR T##_tn \
C(sub_group_reduce_,O)(T##_tn x) \
{ \
    return C(PFX,C(red_,C(O,C(_,T))))(x); \
}

#define GENRO(O) \
    GENROT(O,i32) \
    GENROT(O,u32) \
    GENROT(O,i64) \
    GENROT(O,u64) \
    GENROT(O,f32) \
    GENROT(O,f64) \
    GENROT(O,f16)

GENRO(add)
GENRO(max)
GENRO(min)

// Scan overload; I is `true` for the inclusive form and `false` for the
// exclusive form, and is passed through as the second argument.
#define GENSOTI(O, T, I) \
ATTR T##_tn \
C(sub_group_scan_,C(I##_inc,O))(T##_tn x) \
{ \
    return C(PFX,C(scan_,C(O,C(_,T))))(x, I); \
}

#define GENSOT(O,T) \
    GENSOTI(O,T,false) \
    GENSOTI(O,T,true)

#define GENSO(O) \
    GENSOT(O,i32) \
    GENSOT(O,u32) \
    GENSOT(O,i64) \
    GENSOT(O,u64) \
    GENSOT(O,f32) \
    GENSOT(O,f64) \
    GENSOT(O,f16)

GENSO(add)
GENSO(max)
GENSO(min)

ROCm-Device-Libs-rocm-5.0.0/opencl/src/vldst/000077500000000000000000000000001415221260100205335ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/vldst/vldst_gen.cl000066400000000000000000000042361415221260100230450ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Generates the vloadN and vstoreN overloads for every element type and
// address space listed below.

#define _C(X,Y) X##Y
#define C(X,Y) _C(X,Y)

#define _S(X) #X
#define S(X) _S(X)

#define LATTR __attribute__((overloadable, pure))
#define SATTR __attribute__((overloadable))

// Alignment in bytes of one element of each type; used as the alignment of
// the vector typedef in the load/store bodies.
#define char_align 1
#define uchar_align 1
#define short_align 2
#define ushort_align 2
#define int_align 4
#define uint_align 4
#define long_align 8
#define ulong_align 8
#define float_align 4
#define double_align 8
#define half_align 2

// N-wide load from address space A: advances p by i*N elements and reads one
// N-vector through a vector type that is only element-aligned.
#define LGENAN(N,A,T) \
LATTR T##N \
vload##N(size_t i, const A T *p) \
{ \
    typedef T __attribute__((ext_vector_type(N), aligned(T##_align))) vt; \
    p += i * N; \
    return *(const A vt *)p; \
}

// 3-wide load: three scalar reads starting at element i*3.
#define LGENA3(A,T) \
LATTR T##3 \
vload3(size_t i, const A T *p) \
{ \
    p += i * 3; \
    return (T##3) ( p[0], p[1], p[2] ); \
}

#define LGENA(A,T) \
    LGENAN(16,A,T) \
    LGENAN(8,A,T) \
    LGENAN(4,A,T) \
    LGENA3(A,T) \
    LGENAN(2,A,T)

// The empty address-space argument produces the unqualified-pointer overload.
#define LGEN(T) \
    LGENA(__constant,T) \
    LGENA(__private,T) \
    LGENA(__local,T) \
    LGENA(__global,T) \
    LGENA(,T)

LGEN(char)
LGEN(uchar)
LGEN(short)
LGEN(ushort)
LGEN(int)
LGEN(uint)
LGEN(long)
LGEN(ulong)
LGEN(float)
LGEN(double)
LGEN(half)

// N-wide store to address space A, mirroring LGENAN.
#define SGENAN(N,A,T) \
SATTR void \
vstore##N(T##N v, size_t i, A T *p) \
{ \
    typedef T __attribute__((ext_vector_type(N), aligned(T##_align))) vt; \
    p += i * N; \
    *(A vt *)p = v; \
}

// 3-wide store: three scalar writes starting at element i*3.
#define SGENA3(A,T) \
SATTR void \
vstore3(T##3 v, size_t i, A T *p) \
{ \
    p += i * 3; \
    p[0] = v.s0; \
    p[1] = v.s1; \
    p[2] = v.s2; \
}

#define SGENA(A,T) \
    SGENAN(16,A,T) \
    SGENAN(8,A,T) \
    SGENAN(4,A,T) \
    SGENA3(A,T) \
    SGENAN(2,A,T)

// Stores have no __constant overload.
#define SGEN(T) \
    SGENA(__private,T) \
    SGENA(__local,T) \
    SGENA(__global,T) \
    SGENA(,T)

SGEN(char)
SGEN(uchar)
SGEN(short)
SGEN(ushort)
SGEN(int)
SGEN(uint)
SGEN(long)
SGEN(ulong)
SGEN(float)
SGEN(double)
SGEN(half)
ROCm-Device-Libs-rocm-5.0.0/opencl/src/vldst/vldst_half.cl000066400000000000000000000061601415221260100232040ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Generates the vload_half*, vloada_half*, vstore_half* and vstorea_half*
// overloads for every listed address space, and for stores every listed
// source type and rounding-mode suffix.

#define LATTR __attribute__((overloadable, pure))
#define SATTR __attribute__((overloadable))

// N-wide load: vloadN of halfs followed by conversion to float.
#define LGENAN(N,A) \
LATTR float##N \
vload_half##N(size_t i, const A half *p) \
{ \
    return convert_float##N(vload##N(i, p)); \
}

// Scalar load of element i.
#define LGENA1(A) \
LATTR float \
vload_half(size_t i, const A half *p) \
{ \
    return convert_float(p[i]); \
}

#define LGENA(A) \
    LGENAN(16,A) \
    LGENAN(8,A) \
    LGENAN(4,A) \
    LGENAN(3,A) \
    LGENAN(2,A) \
    LGENA1(A)

LGENA(__constant)
LGENA(__global)
LGENA(__local)
LGENA(__private)
LGENA()

// Aligned N-wide load: reads a halfN directly at element offset i*N.
#define LAGENAN(N,A) \
LATTR float##N \
vloada_half##N(size_t i, const A half *p) \
{ \
    return convert_float##N(*(const A half##N *)(p + i*N)); \
}

// Aligned 3-wide load: reads a half4 at element offset i*4 and converts its
// first three components.
#define LAGENA3(A) \
LATTR float3 \
vloada_half3(size_t i, const A half *p) \
{ \
    half4 v = *(const A half4 *)(p + i*4); \
    return convert_float3(v.s012); \
}

#define LAGENA1(A) \
LATTR float \
vloada_half(size_t i, const A half *p) \
{ \
    return convert_float(p[i]); \
}

#define LAGENA(A) \
    LAGENAN(16,A) \
    LAGENAN(8,A) \
    LAGENAN(4,A) \
    LAGENA3(A) \
    LAGENAN(2,A) \
    LAGENA1(A)

LAGENA(__constant)
LAGENA(__global)
LAGENA(__local)
LAGENA(__private)
LAGENA()

// N-wide store: converts with rounding suffix R (may be empty), then vstoreN.
#define SGENTARN(N,T,A,R) \
SATTR void \
vstore_half##N##R(T##N v, size_t i, A half *p) \
{ \
    vstore##N(convert_half##N##R(v), i, p); \
}

// Scalar store to element i.
#define SGENTAR1(T,A,R) \
SATTR void \
vstore_half##R(T v, size_t i, A half *p) \
{ \
    p[i] = convert_half##R(v); \
}

#define SGENTAR(T,A,R) \
    SGENTARN(16,T,A,R) \
    SGENTARN(8,T,A,R) \
    SGENTARN(4,T,A,R) \
    SGENTARN(3,T,A,R) \
    SGENTARN(2,T,A,R) \
    SGENTAR1(T,A,R)

// One set per rounding suffix; the empty suffix gives the unsuffixed name.
#define SGENTA(T,A) \
    SGENTAR(T,A,) \
    SGENTAR(T,A,_rte) \
    SGENTAR(T,A,_rtn) \
    SGENTAR(T,A,_rtp) \
    SGENTAR(T,A,_rtz)

#define SGENT(T) \
    SGENTA(T,__global) \
    SGENTA(T,__local) \
    SGENTA(T,__private) \
    SGENTA(T,)

SGENT(float)
SGENT(double)

// Aligned N-wide store: writes a halfN directly at element offset i*N.
#define SAGENTARN(N,T,A,R) \
SATTR void \
vstorea_half##N##R(T##N v, size_t i, A half *p) \
{ \
    *(A half##N *)(p + i*N) = convert_half##N##R(v); \
}

// Aligned 3-wide store: writes a whole half4 at element offset i*4. Only the
// first three components of h are assigned before the store.
#define SAGENTAR3(T,A,R) \
SATTR void \
vstorea_half3##R(T##3 v, size_t i, A half *p) \
{ \
    half4 h; \
    h.s012 = convert_half3##R(v); \
    *(A half4 *)(p + i*4) = h; \
}

#define SAGENTAR1(T,A,R) \
SATTR void \
vstorea_half##R(T v, size_t i, A half *p) \
{ \
    p[i] = convert_half##R(v); \
}

#define SAGENTAR(T,A,R) \
    SAGENTARN(16,T,A,R) \
    SAGENTARN(8,T,A,R) \
    SAGENTARN(4,T,A,R) \
    SAGENTAR3(T,A,R) \
    SAGENTARN(2,T,A,R) \
    SAGENTAR1(T,A,R)

#define SAGENTA(T,A) \
    SAGENTAR(T,A,) \
    SAGENTAR(T,A,_rte) \
    SAGENTAR(T,A,_rtn) \
    SAGENTAR(T,A,_rtp) \
    SAGENTAR(T,A,_rtz)

#define SAGENT(T) \
    SAGENTA(T,__global) \
    SAGENTA(T,__local) \
    SAGENTA(T,__private) \
    SAGENTA(T,)

SAGENT(float)
SAGENT(double)

ROCm-Device-Libs-rocm-5.0.0/opencl/src/workgroup/000077500000000000000000000000001415221260100214365ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/opencl/src/workgroup/wganyall.cl000066400000000000000000000024531415221260100236000ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "wgscratch.h"

// Atomic read-modify-write used to merge one sub-group's result into the
// shared word: OR for "any", AND for "all".
#define update_any atomic_fetch_or_explicit
#define update_all atomic_fetch_and_explicit

// GEN_AA(SUF, ID) defines work_group_<SUF>(int predicate).
//
// Each sub-group first combines its own predicates with sub_group_<SUF>.
// With a single sub-group that value is returned directly. Otherwise lane 0
// of sub-group 0 stores its value to the scratch word obtained from
// __get_scratch_lds(), lane 0 of every other sub-group merges its value in
// with update_<SUF>, and all work-items then read the merged word back.
// A barrier separates each of these steps, and one more follows the final
// read (presumably so the scratch word can be reused right away -- confirm
// against wgscratch.h).
//
// ID is accepted for the callers below but is not referenced by the body.
#define GEN_AA(SUF,ID) \
__attribute__((overloadable, always_inline)) int \
work_group_##SUF(int predicate) \
{ \
    uint n = get_num_sub_groups(); \
    int a = sub_group_##SUF(predicate); \
    if (n == 1) \
        return a; \
 \
    __local atomic_uint *p = (__local atomic_uint *)__get_scratch_lds(); \
    uint l = get_sub_group_local_id(); \
    uint i = get_sub_group_id(); \
 \
    if ((i == 0) & (l == 0)) \
        atomic_store_explicit(p, a, memory_order_relaxed, memory_scope_work_group); \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if ((i != 0) & (l == 0)) \
        update_##SUF(p, a, memory_order_relaxed, memory_scope_work_group); \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    a = atomic_load_explicit(p, memory_order_relaxed, memory_scope_work_group); \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
 \
    return a; \
}

// The macro expands to a complete function definition, so the uses below are
// not followed by a semicolon (a file-scope ';' would be an empty declaration).
GEN_AA(all, 1U)
GEN_AA(any, 0U)
ROCm-Device-Libs-rocm-5.0.0/opencl/src/workgroup/wgbarrier.cl000066400000000000000000000022041415221260100237400ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

// barrier(flags): forwards to work_group_barrier(flags).
__attribute__((overloadable)) void
barrier(cl_mem_fence_flags flags)
{
    work_group_barrier(flags);
}

// work_group_barrier(flags): forwards to the two-argument form with
// work-group memory scope.
__attribute__((overloadable)) void
work_group_barrier(cl_mem_fence_flags flags)
{
    work_group_barrier(flags, memory_scope_work_group);
}

// work_group_barrier(flags, scope): execution barrier with optional fences.
//
// With flags == 0 only the hardware barrier is issued. Otherwise the barrier
// is bracketed by a fence before it and a fence after it, both applied to
// `flags` at `scope`. The pair is release/acquire, except when flags is
// exactly CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE, in which case both fences
// are sequentially consistent.
__attribute__((overloadable)) void
work_group_barrier(cl_mem_fence_flags flags, memory_scope scope)
{
    if (flags) {
        atomic_work_item_fence(flags,
            flags == (CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE) ?
                memory_order_seq_cst : memory_order_release,
            scope);
        __builtin_amdgcn_s_barrier();
        atomic_work_item_fence(flags,
            flags == (CLK_GLOBAL_MEM_FENCE|CLK_LOCAL_MEM_FENCE) ?
                memory_order_seq_cst : memory_order_acquire,
            scope);
    } else {
        __builtin_amdgcn_s_barrier();
    }
}
ROCm-Device-Libs-rocm-5.0.0/opencl/src/workgroup/wgbcast.cl000066400000000000000000000034621415221260100234150ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "wgscratch.h"

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// GEN_BROADCAST(T) defines the 1-, 2- and 3-dimensional overloads of
// work_group_broadcast for type T.
//
// In every overload the work-item whose local id matches the requested
// coordinates writes its value to the scratch location returned by
// __get_scratch_lds(); after a barrier all work-items read it back, and a
// second barrier follows the read. The 1-D overload additionally
// short-circuits to sub_group_broadcast when the work-group consists of a
// single sub-group.
#define GEN_BROADCAST(T) \
__attribute__((overloadable, always_inline)) T \
work_group_broadcast(T a, size_t local_id_x) \
{ \
    if (get_num_sub_groups() == 1) \
        return sub_group_broadcast(a, local_id_x); \
 \
    __local T *p = (__local T *)__get_scratch_lds(); \
    if (get_local_id(0) == local_id_x) \
        *p = a; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    a = *p; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return a; \
} \
\
__attribute__((overloadable, always_inline)) T \
work_group_broadcast(T a, size_t local_id_x, size_t local_id_y) \
{ \
    __local T *p = (__local T *)__get_scratch_lds(); \
    if (get_local_id(0) == local_id_x && get_local_id(1) == local_id_y) \
        *p = a; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    a = *p; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return a; \
} \
\
__attribute__((overloadable, always_inline)) T \
work_group_broadcast(T a, size_t local_id_x, size_t local_id_y, size_t local_id_z) \
{ \
    __local T *p = (__local T *)__get_scratch_lds(); \
    if (get_local_id(0) == local_id_x && get_local_id(1) == local_id_y && get_local_id(2) == local_id_z) \
        *p = a; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    a = *p; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return a; \
}

GEN_BROADCAST(uint)
GEN_BROADCAST(int)
GEN_BROADCAST(ulong)
GEN_BROADCAST(long)
GEN_BROADCAST(float)
GEN_BROADCAST(double)
GEN_BROADCAST(half)
ROCm-Device-Libs-rocm-5.0.0/opencl/src/workgroup/wgreduce.cl000066400000000000000000000054501415221260100235670ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *===------------------------------------------------------------------------*/

#include "wgscratch.h"

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable

// Atomic read-modify-write that folds one sub-group result into the
// accumulator, selected by the OP argument of AGEN.
#define reduce_add atomic_fetch_add_explicit
#define reduce_min atomic_fetch_min_explicit
#define reduce_max atomic_fetch_max_explicit

// AGEN(T, OP) defines work_group_reduce_<OP>(T) for an integer type T.
//
// Every sub-group reduces its own values first; with a single sub-group that
// is already the answer. Otherwise the first lane of sub-group 0 stores its
// partial result to the scratch accumulator, the first lane of each other
// sub-group folds its partial result in atomically, and all work-items load
// the final value. Barriers separate the three phases and follow the load.
#define AGEN(T,OP) \
__attribute__((overloadable)) T \
work_group_reduce_##OP(T a) \
{ \
    const uint num_sg = get_num_sub_groups(); \
    a = sub_group_reduce_##OP(a); \
    if (num_sg == 1) \
        return a; \
 \
    __local atomic_##T *acc = (__local atomic_##T *)__get_scratch_lds(); \
    const uint lane = get_sub_group_local_id(); \
    const uint sg = get_sub_group_id(); \
 \
    if ((sg == 0) & (lane == 0)) \
        atomic_store_explicit(acc, a, memory_order_relaxed, memory_scope_work_group); \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if ((sg != 0) & (lane == 0)) \
        reduce_##OP(acc, a, memory_order_relaxed, memory_scope_work_group); \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    const T total = atomic_load_explicit(acc, memory_order_relaxed, memory_scope_work_group); \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return total; \
}

AGEN(int,add)
AGEN(int,max)
AGEN(int,min)

AGEN(uint,add)
AGEN(uint,max)
AGEN(uint,min)

AGEN(long,add)
AGEN(long,max)
AGEN(long,min)

AGEN(ulong,add)
AGEN(ulong,max)
AGEN(ulong,min)

// TODO implement floating point reduction using LDS atomics as above
// (note that ds_add_f32 is not available on GFX7)

// TODO Use a special reduce for per-sub-group results since there
// are fewer of them than work-items in a sub group

#define add(X,Y) (X + Y)

// SGEN(T, OP, ID) defines work_group_reduce_<OP>(T) for a floating-point
// type T; ID is the identity value of the operation.
//
// Every sub-group reduces its own values first; with a single sub-group that
// is already the answer. Otherwise the first lane of each sub-group records
// its partial result in a scratch slot indexed by sub-group id. Sub-group 0
// then reduces those slots (lanes beyond the number of sub-groups contribute
// ID) and publishes the total in slot 0, which every work-item reads back.
// Barriers separate the phases and follow the final read.
#define SGEN(T,OP,ID) \
__attribute__((overloadable)) T \
work_group_reduce_##OP(T a) \
{ \
    const uint num_sg = get_num_sub_groups(); \
    a = sub_group_reduce_##OP(a); \
    if (num_sg == 1) \
        return a; \
 \
    __local T *slots = (__local T *)__get_scratch_lds(); \
    const uint lane = get_sub_group_local_id(); \
    const uint sg = get_sub_group_id(); \
 \
    if (lane == 0) \
        slots[sg] = a; \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if (sg == 0) { \
        T partial = ID; \
        if (lane < num_sg) \
            partial = slots[lane]; \
        partial = sub_group_reduce_##OP(partial); \
        if (lane == 0) \
            slots[0] = partial; \
    } \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    const T total = slots[0]; \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return total; \
}

SGEN(float,add,0.0f)
SGEN(float,max,-INFINITY)
SGEN(float,min,INFINITY)

SGEN(double,add,0.0)
SGEN(double,max,-(double)INFINITY)
SGEN(double,min,(double)INFINITY)

SGEN(half,add,0.0h)
SGEN(half,max,-(half)INFINITY)
SGEN(half,min,(half)INFINITY)
ROCm-Device-Libs-rocm-5.0.0/opencl/src/workgroup/wgscan.cl000066400000000000000000000060161415221260100232430ustar00rootroot00000000000000/*===--------------------------------------------------------------------------
 * ROCm Device Libraries
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "wgscratch.h"

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// TODO Use a special scan for per-sub-group results since there
// are fewer of them than work-items in a sub group

// Binary "add" so that OP(x, y) works uniformly for add, min and max in the
// generators below. Arguments are parenthesised so any expression can be
// passed safely.
#define add(X,Y) ((X) + (Y))

// GENI(TYPE, OP, ID) defines work_group_scan_inclusive_<OP>(TYPE); ID is the
// identity value of the operation.
//
// Each sub-group scans its own values. With several sub-groups, the last lane
// of each sub-group stores its inclusive total in a scratch slot indexed by
// sub-group id; sub-group 0 then replaces those totals by their inclusive
// scan (lanes beyond the number of sub-groups contribute ID). Finally every
// work-item outside sub-group 0 combines its local result with the scanned
// total of the preceding sub-group.
#define GENI(TYPE,OP,ID) \
__attribute__((overloadable)) TYPE \
work_group_scan_inclusive_##OP(TYPE a) \
{ \
    uint n = get_num_sub_groups(); \
    a = sub_group_scan_inclusive_##OP(a); \
    if (n == 1) \
        return a; \
 \
    __local TYPE *p = (__local TYPE *)__get_scratch_lds(); \
    uint l = get_sub_group_local_id(); \
    uint i = get_sub_group_id(); \
 \
    if (l == get_sub_group_size() - 1U) \
        p[i] = a; \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if (i == 0) { \
        TYPE t = l < n ? p[l] : ID; \
        t = sub_group_scan_inclusive_##OP(t); \
        if (l < n) \
            p[l] = t; \
    } \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    TYPE ret = i == 0 ? a : OP(a, p[i-1]); \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return ret; \
}

GENI(int,add,0)
GENI(int,max,INT_MIN)
GENI(int,min,INT_MAX)

GENI(uint,add,0U)
GENI(uint,max,0U)
GENI(uint,min,UINT_MAX)

GENI(long,add,0L)
GENI(long,max,LONG_MIN)
GENI(long,min,LONG_MAX)

GENI(ulong,add,0UL)
GENI(ulong,max,0UL)
GENI(ulong,min,ULONG_MAX)

GENI(float,add,0.0f)
GENI(float,max,-INFINITY)
GENI(float,min,INFINITY)

GENI(double,add,0.0)
GENI(double,max,-(double)INFINITY)
GENI(double,min,(double)INFINITY)

GENI(half,add,0.0h)
GENI(half,max,-(half)INFINITY)
GENI(half,min,(half)INFINITY)

// GENE(TYPE, OP, ID) defines work_group_scan_exclusive_<OP>(TYPE).
//
// Same scheme as GENI, except that the per-sub-group scan is exclusive. The
// value stored by the last lane of each sub-group is OP(a, t), i.e. its own
// input combined with its exclusive result, which is that sub-group's total.
#define GENE(TYPE,OP,ID) \
__attribute__((overloadable)) TYPE \
work_group_scan_exclusive_##OP(TYPE a) \
{ \
    uint n = get_num_sub_groups(); \
    TYPE t = sub_group_scan_exclusive_##OP(a); \
    if (n == 1) \
        return t; \
 \
    __local TYPE *p = (__local TYPE *)__get_scratch_lds(); \
    uint l = get_sub_group_local_id(); \
    uint i = get_sub_group_id(); \
 \
    if (l == get_sub_group_size() - 1U) \
        p[i] = OP(a, t); \
 \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    if (i == 0) { \
        TYPE s = l < n ? p[l] : ID; \
        s = sub_group_scan_inclusive_##OP(s); \
        if (l < n) \
            p[l] = s; \
    } \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    TYPE ret = i == 0 ? t : OP(t, p[i-1]); \
    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
    return ret; \
}

GENE(int,add,0)
GENE(int,max,INT_MIN)
GENE(int,min,INT_MAX)

GENE(uint,add,0U)
GENE(uint,max,0U)
GENE(uint,min,UINT_MAX)

GENE(long,add,0L)
GENE(long,max,LONG_MIN)
GENE(long,min,LONG_MAX)

GENE(ulong,add,0UL)
GENE(ulong,max,0UL)
GENE(ulong,min,ULONG_MAX)

GENE(float,add,0.0f)
GENE(float,max,-INFINITY)
GENE(float,min,INFINITY)

GENE(double,add,0.0)
GENE(double,max,-(double)INFINITY)
GENE(double,min,(double)INFINITY)

GENE(half,add,0.0h)
GENE(half,max,-(half)INFINITY)
GENE(half,min,(half)INFINITY)
ROCm-Device-Libs-rocm-5.0.0/test/000077500000000000000000000000001415221260100163075ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/test/constant_folding/000077500000000000000000000000001415221260100216425ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/test/constant_folding/CMakeLists.txt000066400000000000000000000023611415221260100244040ustar00rootroot00000000000000##===--------------------------------------------------------------------------
##                   ROCm Device Libraries
##
## This file is distributed under the University of Illinois Open Source
## License. See LICENSE.TXT for details.
##===--------------------------------------------------------------------------

# Locate FileCheck. When it is a target of the current build, refer to its
# built binary through a generator expression (resolved when the test command
# is generated); otherwise search for an existing binary.
if(TARGET FileCheck)
  set(FILECHECK_BIN $<TARGET_FILE:FileCheck>)
else()
  # FIXME: Is there a better way to get the binary directory?
  # FileCheck is also not normally installed, so it only really works
  # well with build directories by default.
find_program(FILECHECK_BIN FileCheck
    HINTS ${LLVM_DIR}/../../../bin)
endif()

# Without FileCheck the tests cannot be verified, so none are registered.
if(NOT FILECHECK_BIN)
  message(STATUS "FileCheck not found, not adding constant fold tests")
  return()
endif()

message(STATUS "Running constant fold tests")

# add_constant_fold_test(<name>)
#
# Registers the CTest test constant_fold_<name>. The test runs
# RunConstantFoldTest.cmake in script mode on <name>.cl from this directory,
# targeting gfx900, and writes the compiler output to output.<name>.ll.
# The clang binary is taken from the `clang` target via a generator
# expression, so that target must exist when the build system is generated.
function(add_constant_fold_test name)
  add_test(NAME constant_fold_${name}
    COMMAND ${CMAKE_COMMAND}
      -DCLANG_BIN=$<TARGET_FILE:clang>
      -DBINARY_DIR=${PROJECT_BINARY_DIR}
      -DFILECHECK_BIN=${FILECHECK_BIN}
      -DOUTPUT_FILE=output.${name}.ll
      -DINPUT_FILE=${CMAKE_CURRENT_SOURCE_DIR}/${name}.cl
      -DTEST_CPU=gfx900
      -P ${CMAKE_CURRENT_SOURCE_DIR}/RunConstantFoldTest.cmake)
endfunction()

add_constant_fold_test(lgamma_r)
ROCm-Device-Libs-rocm-5.0.0/test/constant_folding/RunConstantFoldTest.cmake000066400000000000000000000022331415221260100265670ustar00rootroot00000000000000##===--------------------------------------------------------------------------
##                   ROCm Device Libraries
##
## This file is distributed under the University of Illinois Open Source
## License. See LICENSE.TXT for details.
##===--------------------------------------------------------------------------

# Test execution is wrapped here because add_test only allows running
# one command at a time.

# FIXME: It would be better to use llvm-lit and parse RUN lines from
# individual tests.
# Every input of this script arrives as a -D definition on the cmake -P
# command line; fail early with a clear message if one is missing.
foreach(required_var CLANG_BIN FILECHECK_BIN BINARY_DIR INPUT_FILE OUTPUT_FILE TEST_CPU)
  if(NOT DEFINED ${required_var} OR "${${required_var}}" STREQUAL "")
    message(FATAL_ERROR "RunConstantFoldTest: ${required_var} must be passed with -D")
  endif()
endforeach()

# Step 1: compile the OpenCL test source to LLVM IR.
execute_process(COMMAND
  ${CLANG_BIN} -O3 -S -emit-llvm -cl-std=CL2.0
  -target amdgcn-amd-amdhsa -mcpu=${TEST_CPU}
  -Xclang -finclude-default-header
  --rocm-path=${BINARY_DIR}
  -mllvm -amdgpu-simplify-libcall=0
  -o ${OUTPUT_FILE}
  ${INPUT_FILE}
  RESULT_VARIABLE CLANG_RESULT
  ERROR_VARIABLE CLANG_ERR)

if(CLANG_RESULT)
  message(FATAL_ERROR "Error compiling test: ${CLANG_ERR}")
endif()

# Step 2: match the generated IR against the check lines in the test source.
execute_process(COMMAND
  ${FILECHECK_BIN} -v --enable-var-scope ${INPUT_FILE}
  --input-file ${OUTPUT_FILE}
  RESULT_VARIABLE FILECHECK_RESULT
  ERROR_VARIABLE FILECHECK_ERROR)

if(FILECHECK_RESULT)
  message(FATAL_ERROR "Error in test output: ${FILECHECK_ERROR}")
endif()
ROCm-Device-Libs-rocm-5.0.0/test/constant_folding/lgamma_r.cl000066400000000000000000000075101415221260100237440ustar00rootroot00000000000000// Verify lgamma_r function constant folds to correct values.
// Run with filecheck from test cmake

__attribute__((always_inline))
static float test_lgamma_r(float val, volatile global int* sign_out) {
    int tmp;
    float result = lgamma_r(val, &tmp);
    *sign_out = tmp;
    return result;
}

// CHECK-LABEL: @constant_fold_lgamma_r_f32(
kernel void constant_fold_lgamma_r_f32(volatile global float* out,
                                       volatile global int* sign_out) {
    // CHECK: store volatile i32 0,
    // CHECK-NEXT: store volatile float 0x7FF0000000000000
    out[0] = test_lgamma_r(0.0f, sign_out);

    // CHECK-NEXT: store volatile i32 0,
    // CHECK-NEXT: store volatile float 0x7FF0000000000000
    out[0] = test_lgamma_r(-0.0f, sign_out);

    // CHECK-NEXT: store volatile i32 0,
    // CHECK-NEXT: store volatile float 0x7FF8000000000000,
    out[0] = test_lgamma_r(__builtin_nanf(""), sign_out);

    // CHECK-NEXT: store volatile i32 0,
    // CHECK-NEXT: store volatile float 0x7FF4000000000000,
    out[0] = test_lgamma_r(__builtin_nansf(""), sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0x7FF0000000000000,
    out[0] = test_lgamma_r(__builtin_inff(), sign_out);

    // CHECK-NEXT: store volatile i32 0,
    // CHECK-NEXT: store volatile float 0x7FF0000000000000,
    out[0] = test_lgamma_r(-__builtin_inff(), sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0x419DE28020000000,
    out[0] = test_lgamma_r(0x1.0p+23f, sign_out);

    // CHECK-NEXT: store volatile i32 0,
    // CHECK-NEXT: store volatile float 0x7FF0000000000000,
    out[0] = test_lgamma_r(-0x1.0p+23f, sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0.000000e+00,
    out[0] = test_lgamma_r(1.0f, sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0.000000e+00,
    out[0] = test_lgamma_r(2.0f, sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0x3FE62E4300000000,
    out[0] = test_lgamma_r(3.0f, sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0x3FE250D040000000,
    out[0] = test_lgamma_r(0.5f, sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0x405601E680000000,
    out[0] = test_lgamma_r(0x1.0p-127f, sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0x419DE28040000000,
    out[0] = test_lgamma_r(nextafter(0x1.0p+23f, __builtin_inff()), sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0x419DE28000000000,
    out[0] = test_lgamma_r(nextafter(0x1.0p+23f, -__builtin_inff()), sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0xC19DE28040000000,
    out[0] = test_lgamma_r(nextafter(-0x1.0p+23f, __builtin_inff()), sign_out);

    // CHECK-NEXT: store volatile i32 0,
    // CHECK-NEXT: store volatile float 0x7FF0000000000000,
    out[0] = test_lgamma_r(nextafter(-0x1.0p+23f, -__builtin_inff()), sign_out);

    // CHECK-NEXT: store volatile i32 0,
    // CHECK-NEXT: store volatile float 0x7FF0000000000000,
    out[0] = test_lgamma_r(-1.0f, sign_out);

    // CHECK-NEXT: store volatile i32 0,
    // CHECK-NEXT: store volatile float 0x7FF0000000000000,
    out[0] = test_lgamma_r(-2.0f, sign_out);

    // CHECK-NEXT: store volatile i32 0,
    // CHECK-NEXT: store volatile float 0x7FF0000000000000,
    out[0] = test_lgamma_r(-3.0f, sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0xBFF4F1B100000000,
    out[0] = test_lgamma_r(-3.5f, sign_out);

    // CHECK-NEXT: store volatile i32 1,
    // CHECK-NEXT: store volatile float 0xC19DE28040000000,
    out[0] = test_lgamma_r(as_float(0xcaffffff), sign_out);
}
ROCm-Device-Libs-rocm-5.0.0/utils/000077500000000000000000000000001415221260100164705ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/utils/prepare-builtins/000077500000000000000000000000001415221260100217555ustar00rootroot00000000000000ROCm-Device-Libs-rocm-5.0.0/utils/prepare-builtins/CMakeLists.txt000066400000000000000000000020631415221260100245160ustar00rootroot00000000000000##===--------------------------------------------------------------------------
##                   ROCm Device Libraries
##
## This file is distributed under the University of Illinois Open Source
## License. See LICENSE.TXT for details.
##===--------------------------------------------------------------------------

cmake_minimum_required(VERSION 3.13.4)

include(AddLLVM)

# In a standalone build the LLVM definitions, include directories and library
# directory are not inherited from an enclosing project, so add them here.
if (ROCM_DEVICELIB_STANDALONE_BUILD)
  add_definitions(${LLVM_DEFINITIONS})
  include_directories(${LLVM_INCLUDE_DIR})
  include_directories(${LLVM_CONFIG_INCLUDE_DIR})
  include_directories(${LLVM_MAIN_INCLUDE_DIR})
  include_directories(${LLVM_INCLUDE_DIRS})
  link_directories("${LLVM_LIBRARY_DIR}")
endif()

add_executable(prepare-builtins prepare-builtins.cpp)
set_target_properties(prepare-builtins PROPERTIES
  CXX_STANDARD 14
  CXX_STANDARD_REQUIRED Yes
  CXX_EXTENSIONS No)
llvm_update_compile_flags(prepare-builtins)

# Link only the LLVM components the tool uses. PRIVATE because an executable
# has no consumers to propagate the libraries to.
llvm_map_components_to_libnames(llvm_libs support core bitreader bitwriter)
target_link_libraries(prepare-builtins PRIVATE ${llvm_libs})
ROCm-Device-Libs-rocm-5.0.0/utils/prepare-builtins/prepare-builtins.cpp000066400000000000000000000065131415221260100257530ustar00rootroot00000000000000/*===-------------------------------------------------------------------------- * ROCm Device Libraries * * This file is distributed under the University of Illinois Open Source * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ #if !defined(__STDC_LIMIT_MACROS) # define __STDC_LIMIT_MACROS #endif #if !defined(__STDC_CONSTANT_MACROS) # define __STDC_CONSTANT_MACROS #endif #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Config/llvm-config.h" #include using namespace llvm; static cl::opt InputFilename(cl::Positional, cl::desc(""), cl::init("-")); static cl::opt OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename")); int main(int argc, char **argv) { LLVMContext Context; llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. 
cl::ParseCommandLineOptions(argc, argv, "bitcode library builtin preparation tool\n"); std::string ErrorMessage; Module *M = nullptr; { ErrorOr> BufferOrErr = MemoryBuffer::getFile(InputFilename); if (std::error_code ec = BufferOrErr.getError()) ErrorMessage = ec.message(); else { std::unique_ptr &BufferPtr = BufferOrErr.get(); Expected> ModuleOrErr = parseBitcodeFile(BufferPtr.get()->getMemBufferRef(), Context); if (Error Err = ModuleOrErr.takeError()) { ErrorMessage = toString(std::move(Err)); } else M = ModuleOrErr.get().release(); } } if (!M) { errs() << argv[0] << ": "; if (ErrorMessage.size()) errs() << ErrorMessage << "\n"; else errs() << "bitcode didn't read correctly.\n"; return 1; } // Set linkage of every external definition to linkonce_odr. for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { if (!i->isDeclaration() && i->getLinkage() == GlobalValue::ExternalLinkage) { i->setLinkage(GlobalValue::LinkOnceODRLinkage); } } for (Module::global_iterator i = M->global_begin(), e = M->global_end(); i != e; ++i) { if (!i->isDeclaration() && i->getLinkage() == GlobalValue::ExternalLinkage) { i->setLinkage(GlobalValue::LinkOnceODRLinkage); } } for (Module::alias_iterator i = M->alias_begin(), e = M->alias_end(); i != e; ++i) { if (!i->isDeclaration() && i->getLinkage() == GlobalValue::ExternalLinkage) { i->setLinkage(GlobalValue::LinkOnceODRLinkage); } } if (OutputFilename.empty()) { errs() << "no output file\n"; return 1; } std::error_code EC; std::unique_ptr Out (new ToolOutputFile(OutputFilename, EC, sys::fs::OF_None)); if (EC) { errs() << EC.message() << '\n'; exit(1); } WriteBitcodeToFile(*M, Out->os()); // Declare success. Out->keep(); return 0; }