diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -86,6 +86,7 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) add_subdirectory(DeviceRTL) +add_subdirectory(DeviceLib) add_subdirectory(tools) # Add tests. diff --git a/openmp/libomptarget/DeviceLib/CMakeLists.txt b/openmp/libomptarget/DeviceLib/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceLib/CMakeLists.txt @@ -0,0 +1,161 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the DeviceRTL for all toolchains that are available +# +##===----------------------------------------------------------------------===## + +set(LIBOMPTARGET_BUILD_DEVICERTL_BCLIB TRUE CACHE BOOL + "Can be set to false to disable building this library.") + +if (NOT LIBOMPTARGET_BUILD_DEVICERTL_BCLIB) + libomptarget_say("Not building DeviceRTL: Disabled by LIBOMPTARGET_BUILD_DEVICERTL_BCLIB") + return() +endif() + +if (LLVM_DIR) + # Builds that use pre-installed LLVM have LLVM_DIR set. + find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) + find_program(LINK_TOOL llvm-link PATHS ${LLVM_TOOLS_BINARY_DIR} + NO_DEFAULT_PATH) + find_program(OPT_TOOL opt PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) + if ((NOT CLANG_TOOL) OR (NOT LINK_TOOL) OR (NOT OPT_TOOL)) + libomptarget_say("Not building DeviceLib. Missing clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} or opt: ${OPT_TOOL}") + return() + else() + libomptarget_say("Building DeviceRTL. Using clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} and opt: ${OPT_TOOL}") + endif() +elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD) + # LLVM in-tree builds may use CMake target names to discover the tools. + set(CLANG_TOOL $) + set(LINK_TOOL $) + set(OPT_TOOL $) + libomptarget_say("Building DeviceRTL. Using clang from in-tree build") +else() + libomptarget_say("Not building DeviceLib. No appropriate clang found") + return() +endif() + +# TODO: This part needs to be refined when libomptarget is going to support +# Windows! +# TODO: This part can also be removed if we can change the clang driver to make +# it support device only compilation. +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + set(aux_triple x86_64-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(aux_triple powerpc64le-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(aux_triple aarch64-unknown-linux-gnu) +else() + libomptarget_say("Not building DeviceRTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + return() +endif() + +set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR}) +set(include_directory ${devicertl_base_directory}/include) +set(source_directory ${devicertl_base_directory}/src) + +# Set flags for LLVM Bitcode compilation. +set(bc_flags -S -x c++ -std=c++17 -fvisibility=hidden + -fno-exceptions -fno-rtti -O1 + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -I${include_directory} + -I${devicertl_base_directory}/../include + ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL} +) + +function(compileDeviceRTLLibrary src_files target_name output_name) + set(target_bc_flags ${ARGN}) + + set(bc_files "") + foreach(src ${src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-${target_name}.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${CLANG_TOOL} + ${bc_flags} + ${target_bc_flags} + ${infile} -o ${outfile} + DEPENDS ${infile} ${include_files} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + if("${CLANG_TOOL}" STREQUAL "$") + # Add a file-level dependency to ensure that clang is up-to-date. + # By default, add_custom_command only builds clang if the + # executable is missing. + add_custom_command(OUTPUT ${outfile} + DEPENDS clang + APPEND + ) + endif() + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) + endforeach() + + set(bclib_name "libomptarget-${target_name}-${output_name}.bc") + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name} + COMMAND ${LINK_TOOL} + -o ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" + ) + + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${OPT_TOOL} ${link_opt_flags} ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name} + COMMENT "Optimizing LLVM bitcode ${bclib_name}" + ) + + # Add a file-level dependency to ensure that llvm-link and opt are up-to-date. + # By default, add_custom_command only builds the tool if the executable is missing + if("${LINK_TOOL}" STREQUAL "$") + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name} + DEPENDS llvm-link + APPEND) + endif() + if("${OPT_TOOL}" STREQUAL "$") + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + DEPENDS opt + APPEND) + endif() + + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) + + set(bclib_target_name "omptarget-${target_name}-${output_name}-bc") + + add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) + + # Copy library to destination. + add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") +endfunction() + +set(src_files ${source_directory}/OpenMPMath.cpp) + +# Generate a Bitcode library for the OpenMP math function wrappers. +compileDeviceRTLLibrary(${src_files} nvptx "math-wrappers" -target nvptx64-nvidia-cuda -nogpulib) +compileDeviceRTLLibrary(${src_files} amdgpu "math-wrappers" -target amdgcn-amd-amdhsa -D__AMDGCN__ -nogpulib) + +set(src_files ${source_directory}/DeviceLibm.cpp) + +# Generate a Bitcode library for the device math function wrappers. +compileDeviceRTLLibrary(${src_files} nvptx "libm" -target nvptx64-nvidia-cuda -D__CUDA__ -nogpulib) +compileDeviceRTLLibrary(${src_files} amdgpu "libm" -target amdgcn-amd-amdhsa -D__AMDGCN__ -nogpulib) diff --git a/openmp/libomptarget/DeviceLib/include/Headers/__clang_cuda_device_functions.h b/openmp/libomptarget/DeviceLib/include/Headers/__clang_cuda_device_functions.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceLib/include/Headers/__clang_cuda_device_functions.h @@ -0,0 +1,1558 @@ +/*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__ +#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__ + +#ifndef __OPENMP_NVPTX__ +#if CUDA_VERSION < 9000 +#error This file is intended to be used with CUDA-9+ only. +#endif +#endif + +// __DEVICE__ is a helper macro with common set of attributes for the wrappers +// we implement in this file. We need static in order to avoid emitting unused +// functions and __forceinline__ helps inlining these wrappers at -O1. +#pragma push_macro("__DEVICE__") +#ifdef __OPENMP_NVPTX__ +#define __DEVICE__ static __attribute__((always_inline, nothrow)) +#else +#define __DEVICE__ static __device__ __forceinline__ +#endif + +__DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); } +__DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); } +__DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); } +__DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); } +__DEVICE__ unsigned long long __brevll(unsigned long long __a) { + return __nv_brevll(__a); +} +#if defined(__cplusplus) +__DEVICE__ void __brkpt() { __asm__ __volatile__("brkpt;"); } +__DEVICE__ void __brkpt(int __a) { __brkpt(); } +#else +__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { + __asm__ __volatile__("brkpt;"); +} +__DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); } +#endif +__DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b, + unsigned int __c) { + return __nv_byte_perm(__a, __b, __c); +} +__DEVICE__ int __clz(int __a) { return __nv_clz(__a); } +__DEVICE__ int __clzll(long long __a) { return __nv_clzll(__a); } +__DEVICE__ float __cosf(float __a) { return __nv_fast_cosf(__a); } +__DEVICE__ double __dAtomicAdd(double *__p, double __v) { + return __nvvm_atom_add_gen_d(__p, __v); +} +__DEVICE__ double __dAtomicAdd_block(double *__p, double __v) { + return __nvvm_atom_cta_add_gen_d(__p, __v); +} +__DEVICE__ double __dAtomicAdd_system(double *__p, double __v) { + return __nvvm_atom_sys_add_gen_d(__p, __v); +} +__DEVICE__ double __dadd_rd(double __a, double __b) { + return __nv_dadd_rd(__a, __b); +} +__DEVICE__ double __dadd_rn(double __a, double __b) { + return __nv_dadd_rn(__a, __b); +} +__DEVICE__ double __dadd_ru(double __a, double __b) { + return __nv_dadd_ru(__a, __b); +} +__DEVICE__ double __dadd_rz(double __a, double __b) { + return __nv_dadd_rz(__a, __b); +} +__DEVICE__ double __ddiv_rd(double __a, double __b) { + return __nv_ddiv_rd(__a, __b); +} +__DEVICE__ double __ddiv_rn(double __a, double __b) { + return __nv_ddiv_rn(__a, __b); +} +__DEVICE__ double __ddiv_ru(double __a, double __b) { + return __nv_ddiv_ru(__a, __b); +} +__DEVICE__ double __ddiv_rz(double __a, double __b) { + return __nv_ddiv_rz(__a, __b); +} +__DEVICE__ double __dmul_rd(double __a, double __b) { + return __nv_dmul_rd(__a, __b); +} +__DEVICE__ double __dmul_rn(double __a, double __b) { + return __nv_dmul_rn(__a, __b); +} +__DEVICE__ double __dmul_ru(double __a, double __b) { + return __nv_dmul_ru(__a, __b); +} +__DEVICE__ double __dmul_rz(double __a, double __b) { + return __nv_dmul_rz(__a, __b); +} +__DEVICE__ float __double2float_rd(double __a) { + return __nv_double2float_rd(__a); +} +__DEVICE__ float __double2float_rn(double __a) { + return __nv_double2float_rn(__a); +} +__DEVICE__ float __double2float_ru(double __a) { + return __nv_double2float_ru(__a); +} +__DEVICE__ float __double2float_rz(double __a) { + return __nv_double2float_rz(__a); +} +__DEVICE__ int __double2hiint(double __a) { return __nv_double2hiint(__a); } +__DEVICE__ int __double2int_rd(double __a) { return __nv_double2int_rd(__a); } +__DEVICE__ int __double2int_rn(double __a) { return __nv_double2int_rn(__a); } +__DEVICE__ int __double2int_ru(double __a) { return __nv_double2int_ru(__a); } +__DEVICE__ int __double2int_rz(double __a) { return __nv_double2int_rz(__a); } +__DEVICE__ long long __double2ll_rd(double __a) { + return __nv_double2ll_rd(__a); +} +__DEVICE__ long long __double2ll_rn(double __a) { + return __nv_double2ll_rn(__a); +} +__DEVICE__ long long __double2ll_ru(double __a) { + return __nv_double2ll_ru(__a); +} +__DEVICE__ long long __double2ll_rz(double __a) { + return __nv_double2ll_rz(__a); +} +__DEVICE__ int __double2loint(double __a) { return __nv_double2loint(__a); } +__DEVICE__ unsigned int __double2uint_rd(double __a) { + return __nv_double2uint_rd(__a); +} +__DEVICE__ unsigned int __double2uint_rn(double __a) { + return __nv_double2uint_rn(__a); +} +__DEVICE__ unsigned int __double2uint_ru(double __a) { + return __nv_double2uint_ru(__a); +} +__DEVICE__ unsigned int __double2uint_rz(double __a) { + return __nv_double2uint_rz(__a); +} +__DEVICE__ unsigned long long __double2ull_rd(double __a) { + return __nv_double2ull_rd(__a); +} +__DEVICE__ unsigned long long __double2ull_rn(double __a) { + return __nv_double2ull_rn(__a); +} +__DEVICE__ unsigned long long __double2ull_ru(double __a) { + return __nv_double2ull_ru(__a); +} +__DEVICE__ unsigned long long __double2ull_rz(double __a) { + return __nv_double2ull_rz(__a); +} +__DEVICE__ long long __double_as_longlong(double __a) { + return __nv_double_as_longlong(__a); +} +__DEVICE__ double __drcp_rd(double __a) { return __nv_drcp_rd(__a); } +__DEVICE__ double __drcp_rn(double __a) { return __nv_drcp_rn(__a); } +__DEVICE__ double __drcp_ru(double __a) { return __nv_drcp_ru(__a); } +__DEVICE__ double __drcp_rz(double __a) { return __nv_drcp_rz(__a); } +__DEVICE__ double __dsqrt_rd(double __a) { return __nv_dsqrt_rd(__a); } +__DEVICE__ double __dsqrt_rn(double __a) { return __nv_dsqrt_rn(__a); } +__DEVICE__ double __dsqrt_ru(double __a) { return __nv_dsqrt_ru(__a); } +__DEVICE__ double __dsqrt_rz(double __a) { return __nv_dsqrt_rz(__a); } +__DEVICE__ double __dsub_rd(double __a, double __b) { + return __nv_dsub_rd(__a, __b); +} +__DEVICE__ double __dsub_rn(double __a, double __b) { + return __nv_dsub_rn(__a, __b); +} +__DEVICE__ double __dsub_ru(double __a, double __b) { + return __nv_dsub_ru(__a, __b); +} +__DEVICE__ double __dsub_rz(double __a, double __b) { + return __nv_dsub_rz(__a, __b); +} +__DEVICE__ float __exp10f(float __a) { return __nv_fast_exp10f(__a); } +__DEVICE__ float __expf(float __a) { return __nv_fast_expf(__a); } +__DEVICE__ float __fAtomicAdd(float *__p, float __v) { + return __nvvm_atom_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicAdd_block(float *__p, float __v) { + return __nvvm_atom_cta_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicAdd_system(float *__p, float __v) { + return __nvvm_atom_sys_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicExch(float *__p, float __v) { + return __nv_int_as_float( + __nvvm_atom_xchg_gen_i((int *)__p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fAtomicExch_block(float *__p, float __v) { + return __nv_int_as_float( + __nvvm_atom_cta_xchg_gen_i((int *)__p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fAtomicExch_system(float *__p, float __v) { + return __nv_int_as_float( + __nvvm_atom_sys_xchg_gen_i((int *)__p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fadd_rd(float __a, float __b) { + return __nv_fadd_rd(__a, __b); +} +__DEVICE__ float __fadd_rn(float __a, float __b) { + return __nv_fadd_rn(__a, __b); +} +__DEVICE__ float __fadd_ru(float __a, float __b) { + return __nv_fadd_ru(__a, __b); +} +__DEVICE__ float __fadd_rz(float __a, float __b) { + return __nv_fadd_rz(__a, __b); +} +__DEVICE__ float __fdiv_rd(float __a, float __b) { + return __nv_fdiv_rd(__a, __b); +} +__DEVICE__ float __fdiv_rn(float __a, float __b) { + return __nv_fdiv_rn(__a, __b); +} +__DEVICE__ float __fdiv_ru(float __a, float __b) { + return __nv_fdiv_ru(__a, __b); +} +__DEVICE__ float __fdiv_rz(float __a, float __b) { + return __nv_fdiv_rz(__a, __b); +} +__DEVICE__ float __fdividef(float __a, float __b) { + return __nv_fast_fdividef(__a, __b); +} +__DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); } +__DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); } +__DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); } +__DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); } +#ifdef _MSC_VER +__DEVICE__ int __finitel(long double __a); +#endif +__DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); } +__DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); } +__DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); } +__DEVICE__ int __float2int_rz(float __a) { return __nv_float2int_rz(__a); } +__DEVICE__ long long __float2ll_rd(float __a) { return __nv_float2ll_rd(__a); } +__DEVICE__ long long __float2ll_rn(float __a) { return __nv_float2ll_rn(__a); } +__DEVICE__ long long __float2ll_ru(float __a) { return __nv_float2ll_ru(__a); } +__DEVICE__ long long __float2ll_rz(float __a) { return __nv_float2ll_rz(__a); } +__DEVICE__ unsigned int __float2uint_rd(float __a) { + return __nv_float2uint_rd(__a); +} +__DEVICE__ unsigned int __float2uint_rn(float __a) { + return __nv_float2uint_rn(__a); +} +__DEVICE__ unsigned int __float2uint_ru(float __a) { + return __nv_float2uint_ru(__a); +} +__DEVICE__ unsigned int __float2uint_rz(float __a) { + return __nv_float2uint_rz(__a); +} +__DEVICE__ unsigned long long __float2ull_rd(float __a) { + return __nv_float2ull_rd(__a); +} +__DEVICE__ unsigned long long __float2ull_rn(float __a) { + return __nv_float2ull_rn(__a); +} +__DEVICE__ unsigned long long __float2ull_ru(float __a) { + return __nv_float2ull_ru(__a); +} +__DEVICE__ unsigned long long __float2ull_rz(float __a) { + return __nv_float2ull_rz(__a); +} +__DEVICE__ int __float_as_int(float __a) { return __nv_float_as_int(__a); } +__DEVICE__ unsigned int __float_as_uint(float __a) { + return __nv_float_as_uint(__a); +} +__DEVICE__ double __fma_rd(double __a, double __b, double __c) { + return __nv_fma_rd(__a, __b, __c); +} +__DEVICE__ double __fma_rn(double __a, double __b, double __c) { + return __nv_fma_rn(__a, __b, __c); +} +__DEVICE__ double __fma_ru(double __a, double __b, double __c) { + return __nv_fma_ru(__a, __b, __c); +} +__DEVICE__ double __fma_rz(double __a, double __b, double __c) { + return __nv_fma_rz(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rd(float __a, float __b, float __c) { + return __nv_fmaf_ieee_rd(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rn(float __a, float __b, float __c) { + return __nv_fmaf_ieee_rn(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_ru(float __a, float __b, float __c) { + return __nv_fmaf_ieee_ru(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rz(float __a, float __b, float __c) { + return __nv_fmaf_ieee_rz(__a, __b, __c); +} +__DEVICE__ float __fmaf_rd(float __a, float __b, float __c) { + return __nv_fmaf_rd(__a, __b, __c); +} +__DEVICE__ float __fmaf_rn(float __a, float __b, float __c) { + return __nv_fmaf_rn(__a, __b, __c); +} +__DEVICE__ float __fmaf_ru(float __a, float __b, float __c) { + return __nv_fmaf_ru(__a, __b, __c); +} +__DEVICE__ float __fmaf_rz(float __a, float __b, float __c) { + return __nv_fmaf_rz(__a, __b, __c); +} +__DEVICE__ float __fmul_rd(float __a, float __b) { + return __nv_fmul_rd(__a, __b); +} +__DEVICE__ float __fmul_rn(float __a, float __b) { + return __nv_fmul_rn(__a, __b); +} +__DEVICE__ float __fmul_ru(float __a, float __b) { + return __nv_fmul_ru(__a, __b); +} +__DEVICE__ float __fmul_rz(float __a, float __b) { + return __nv_fmul_rz(__a, __b); +} +__DEVICE__ float __frcp_rd(float __a) { return __nv_frcp_rd(__a); } +__DEVICE__ float __frcp_rn(float __a) { return __nv_frcp_rn(__a); } +__DEVICE__ float __frcp_ru(float __a) { return __nv_frcp_ru(__a); } +__DEVICE__ float __frcp_rz(float __a) { return __nv_frcp_rz(__a); } +__DEVICE__ float __frsqrt_rn(float __a) { return __nv_frsqrt_rn(__a); } +__DEVICE__ float __fsqrt_rd(float __a) { return __nv_fsqrt_rd(__a); } +__DEVICE__ float __fsqrt_rn(float __a) { return __nv_fsqrt_rn(__a); } +__DEVICE__ float __fsqrt_ru(float __a) { return __nv_fsqrt_ru(__a); } +__DEVICE__ float __fsqrt_rz(float __a) { return __nv_fsqrt_rz(__a); } +__DEVICE__ float __fsub_rd(float __a, float __b) { + return __nv_fsub_rd(__a, __b); +} +__DEVICE__ float __fsub_rn(float __a, float __b) { + return __nv_fsub_rn(__a, __b); +} +__DEVICE__ float __fsub_ru(float __a, float __b) { + return __nv_fsub_ru(__a, __b); +} +__DEVICE__ float __fsub_rz(float __a, float __b) { + return __nv_fsub_rz(__a, __b); +} +__DEVICE__ int __hadd(int __a, int __b) { return __nv_hadd(__a, __b); } +__DEVICE__ double __hiloint2double(int __a, int __b) { + return __nv_hiloint2double(__a, __b); +} +__DEVICE__ int __iAtomicAdd(int *__p, int __v) { + return __nvvm_atom_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAdd_block(int *__p, int __v) { + return __nvvm_atom_cta_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAdd_system(int *__p, int __v) { + return __nvvm_atom_sys_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd(int *__p, int __v) { + return __nvvm_atom_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd_block(int *__p, int __v) { + return __nvvm_atom_cta_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd_system(int *__p, int __v) { + return __nvvm_atom_sys_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicCAS(int *__p, int __cmp, int __v) { + return __nvvm_atom_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicCAS_block(int *__p, int __cmp, int __v) { + return __nvvm_atom_cta_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicCAS_system(int *__p, int __cmp, int __v) { + return __nvvm_atom_sys_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicExch(int *__p, int __v) { + return __nvvm_atom_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicExch_block(int *__p, int __v) { + return __nvvm_atom_cta_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicExch_system(int *__p, int __v) { + return __nvvm_atom_sys_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax(int *__p, int __v) { + return __nvvm_atom_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax_block(int *__p, int __v) { + return __nvvm_atom_cta_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax_system(int *__p, int __v) { + return __nvvm_atom_sys_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin(int *__p, int __v) { + return __nvvm_atom_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin_block(int *__p, int __v) { + return __nvvm_atom_cta_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin_system(int *__p, int __v) { + return __nvvm_atom_sys_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr(int *__p, int __v) { + return __nvvm_atom_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr_block(int *__p, int __v) { + return __nvvm_atom_cta_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr_system(int *__p, int __v) { + return __nvvm_atom_sys_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor(int *__p, int __v) { + return __nvvm_atom_xor_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor_block(int *__p, int __v) { + return __nvvm_atom_cta_xor_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor_system(int *__p, int __v) { + return __nvvm_atom_sys_xor_gen_i(__p, __v); +} +__DEVICE__ long long __illAtomicMax(long long *__p, long long __v) { + return __nvvm_atom_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMax_block(long long *__p, long long __v) { + return __nvvm_atom_cta_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMax_system(long long *__p, long long __v) { + return __nvvm_atom_sys_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin(long long *__p, long long __v) { + return __nvvm_atom_min_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin_block(long long *__p, long long __v) { + return __nvvm_atom_cta_min_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin_system(long long *__p, long long __v) { + return __nvvm_atom_sys_min_gen_ll(__p, __v); +} +__DEVICE__ double __int2double_rn(int __a) { return __nv_int2double_rn(__a); } +__DEVICE__ float __int2float_rd(int __a) { return __nv_int2float_rd(__a); } +__DEVICE__ float __int2float_rn(int __a) { return __nv_int2float_rn(__a); } +__DEVICE__ float __int2float_ru(int __a) { return __nv_int2float_ru(__a); } +__DEVICE__ float __int2float_rz(int __a) { return __nv_int2float_rz(__a); } +__DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); } +__DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); } +__DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); } +__DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); } +#ifdef _MSC_VER +__DEVICE__ int __isinfl(long double __a); +#endif +__DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); } +__DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); } +#ifdef _MSC_VER +__DEVICE__ int __isnanl(long double __a); +#endif +__DEVICE__ double __ll2double_rd(long long __a) { + return __nv_ll2double_rd(__a); +} +__DEVICE__ double __ll2double_rn(long long __a) { + return __nv_ll2double_rn(__a); +} +__DEVICE__ double __ll2double_ru(long long __a) { + return __nv_ll2double_ru(__a); +} +__DEVICE__ double __ll2double_rz(long long __a) { + return __nv_ll2double_rz(__a); +} +__DEVICE__ float __ll2float_rd(long long __a) { return __nv_ll2float_rd(__a); } +__DEVICE__ float __ll2float_rn(long long __a) { return __nv_ll2float_rn(__a); } +__DEVICE__ float __ll2float_ru(long long __a) { return __nv_ll2float_ru(__a); } +__DEVICE__ float __ll2float_rz(long long __a) { return __nv_ll2float_rz(__a); } +__DEVICE__ long long __llAtomicAnd(long long *__p, long long __v) { + return __nvvm_atom_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicAnd_block(long long *__p, long long __v) { + return __nvvm_atom_cta_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicAnd_system(long long *__p, long long __v) { + return __nvvm_atom_sys_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr(long long *__p, long long __v) { + return __nvvm_atom_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr_block(long long *__p, long long __v) { + return __nvvm_atom_cta_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr_system(long long *__p, long long __v) { + return __nvvm_atom_sys_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor(long long *__p, long long __v) { + return __nvvm_atom_xor_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor_block(long long *__p, long long __v) { + return __nvvm_atom_cta_xor_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor_system(long long *__p, long long __v) { + return __nvvm_atom_sys_xor_gen_ll(__p, __v); +} +__DEVICE__ float __log10f(float __a) { return __nv_fast_log10f(__a); } +__DEVICE__ float __log2f(float __a) { return __nv_fast_log2f(__a); } +__DEVICE__ float __logf(float __a) { return __nv_fast_logf(__a); } +__DEVICE__ double __longlong_as_double(long long __a) { + return __nv_longlong_as_double(__a); +} +__DEVICE__ int __mul24(int __a, int __b) { return __nv_mul24(__a, __b); } +__DEVICE__ long long __mul64hi(long long __a, long long __b) { + return __nv_mul64hi(__a, __b); +} +__DEVICE__ int __mulhi(int __a, int __b) { return __nv_mulhi(__a, __b); } +__DEVICE__ unsigned int __pm0(void) { return __nvvm_read_ptx_sreg_pm0(); } +__DEVICE__ unsigned int __pm1(void) { return __nvvm_read_ptx_sreg_pm1(); } +__DEVICE__ unsigned int __pm2(void) { return __nvvm_read_ptx_sreg_pm2(); } +__DEVICE__ unsigned int __pm3(void) { return __nvvm_read_ptx_sreg_pm3(); } +__DEVICE__ int __popc(int __a) { return __nv_popc(__a); } +__DEVICE__ int __popcll(long long __a) { return __nv_popcll(__a); } +__DEVICE__ float __powf(float __a, float __b) { + return __nv_fast_powf(__a, __b); +} + +// Parameter must have a known integer value. +#define __prof_trigger(__a) __asm__ __volatile__("pmevent \t%0;" ::"i"(__a)) +__DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); } +__DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) { + return __nv_sad(__a, __b, __c); +} +__DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); } +__DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); } +__DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); } +__DEVICE__ void __sincosf(float __a, float *__s, float *__c) { + return __nv_fast_sincosf(__a, __s, __c); +} +__DEVICE__ float __sinf(float __a) { return __nv_fast_sinf(__a); } +__DEVICE__ int __syncthreads_and(int __a) { return __nvvm_bar0_and(__a); } +__DEVICE__ int __syncthreads_count(int __a) { return __nvvm_bar0_popc(__a); } +__DEVICE__ int __syncthreads_or(int __a) { return __nvvm_bar0_or(__a); } +__DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); } +__DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); } +__DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); }; +__DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); }; +__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); } +__DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_add_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicAdd_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_add_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicAdd_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_add_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_and_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_and_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_and_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicCAS(unsigned int *__p, unsigned int __cmp, + unsigned int __v) { + return __nvvm_atom_cas_gen_i((int *)__p, __cmp, __v); +} +__DEVICE__ unsigned int +__uAtomicCAS_block(unsigned int *__p, unsigned int __cmp, unsigned int __v) { + return __nvvm_atom_cta_cas_gen_i((int *)__p, __cmp, __v); +} +__DEVICE__ unsigned int +__uAtomicCAS_system(unsigned int *__p, unsigned int __cmp, unsigned int __v) { + return __nvvm_atom_sys_cas_gen_i((int *)__p, __cmp, __v); +} +__DEVICE__ unsigned int __uAtomicDec(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicDec_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicDec_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicExch(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_xchg_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicExch_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_xchg_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicExch_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_xchg_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicInc(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicInc_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicInc_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicOr(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_or_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicOr_block(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_cta_or_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicOr_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_or_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicXor(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_xor_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicXor_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_xor_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicXor_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_xor_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uhadd(unsigned int __a, unsigned int __b) { + return __nv_uhadd(__a, __b); +} +__DEVICE__ double __uint2double_rn(unsigned int __a) { + return __nv_uint2double_rn(__a); +} +__DEVICE__ float __uint2float_rd(unsigned int __a) { + return __nv_uint2float_rd(__a); +} +__DEVICE__ float __uint2float_rn(unsigned int __a) { + return __nv_uint2float_rn(__a); +} +__DEVICE__ float __uint2float_ru(unsigned int __a) { + return __nv_uint2float_ru(__a); +} +__DEVICE__ float __uint2float_rz(unsigned int __a) { + return __nv_uint2float_rz(__a); +} +__DEVICE__ float __uint_as_float(unsigned int __a) { + return __nv_uint_as_float(__a); +} // +__DEVICE__ double __ull2double_rd(unsigned long long __a) { + return __nv_ull2double_rd(__a); +} +__DEVICE__ double __ull2double_rn(unsigned long long __a) { + return __nv_ull2double_rn(__a); +} +__DEVICE__ double __ull2double_ru(unsigned long long __a) { + return __nv_ull2double_ru(__a); +} +__DEVICE__ double __ull2double_rz(unsigned long long __a) { + return __nv_ull2double_rz(__a); +} +__DEVICE__ float __ull2float_rd(unsigned long long __a) { + return __nv_ull2float_rd(__a); +} +__DEVICE__ float __ull2float_rn(unsigned long long __a) { + return __nv_ull2float_rn(__a); +} +__DEVICE__ float __ull2float_ru(unsigned long long __a) { + return __nv_ull2float_ru(__a); +} +__DEVICE__ float __ull2float_rz(unsigned long long __a) { + return __nv_ull2float_rz(__a); +} +__DEVICE__ unsigned long long __ullAtomicAdd(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_add_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_add_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAdd_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_add_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_and_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_and_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_and_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicCAS(unsigned long long *__p, + unsigned long long __cmp, + unsigned long long __v) { + return __nvvm_atom_cas_gen_ll((long long *)__p, __cmp, __v); +} +__DEVICE__ unsigned long long __ullAtomicCAS_block(unsigned long long *__p, + unsigned long long __cmp, + unsigned long long __v) { + return __nvvm_atom_cta_cas_gen_ll((long long *)__p, __cmp, __v); +} +__DEVICE__ unsigned long long __ullAtomicCAS_system(unsigned long long *__p, + unsigned long long __cmp, + unsigned long long __v) { + return __nvvm_atom_sys_cas_gen_ll((long long *)__p, __cmp, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_xchg_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_xchg_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_xchg_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_or_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_or_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_or_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_xor_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_xor_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_xor_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned int __umul24(unsigned int __a, unsigned int __b) { + return __nv_umul24(__a, __b); +} +__DEVICE__ unsigned long long __umul64hi(unsigned long long __a, + unsigned long long __b) { + return __nv_umul64hi(__a, __b); +} +__DEVICE__ unsigned int __umulhi(unsigned int __a, unsigned int __b) { + return __nv_umulhi(__a, __b); +} +__DEVICE__ unsigned int __urhadd(unsigned int __a, unsigned int __b) { + return __nv_urhadd(__a, __b); +} +__DEVICE__ unsigned int __usad(unsigned int __a, unsigned int __b, + unsigned int __c) { + return __nv_usad(__a, __b, __c); +} + +#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020 +__DEVICE__ unsigned int __vabs2(unsigned int __a) { return __nv_vabs2(__a); } +__DEVICE__ unsigned int __vabs4(unsigned int __a) { return __nv_vabs4(__a); } +__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { + return __nv_vabsdiffs2(__a, __b); +} +__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { + return __nv_vabsdiffs4(__a, __b); +} +__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { + return __nv_vabsdiffu2(__a, __b); +} +__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { + return __nv_vabsdiffu4(__a, __b); +} +__DEVICE__ unsigned int __vabsss2(unsigned int __a) { + return __nv_vabsss2(__a); +} +__DEVICE__ unsigned int __vabsss4(unsigned int __a) { + return __nv_vabsss4(__a); +} +__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { + return __nv_vadd2(__a, __b); +} +__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { + return __nv_vadd4(__a, __b); +} +__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { + return __nv_vaddss2(__a, __b); +} +__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { + return __nv_vaddss4(__a, __b); +} +__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) { + return __nv_vaddus2(__a, __b); +} +__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) { + return __nv_vaddus4(__a, __b); +} +__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) { + return __nv_vavgs2(__a, __b); +} +__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) { + return __nv_vavgs4(__a, __b); +} +__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) { + return __nv_vavgu2(__a, __b); +} +__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) { + return __nv_vavgu4(__a, __b); +} +__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { + return __nv_vcmpeq2(__a, __b); +} +__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { + return __nv_vcmpeq4(__a, __b); +} +__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { + return __nv_vcmpges2(__a, __b); +} +__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { + return __nv_vcmpges4(__a, __b); +} +__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { + return __nv_vcmpgeu2(__a, __b); +} +__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { + return __nv_vcmpgeu4(__a, __b); +} +__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { + return __nv_vcmpgts2(__a, __b); +} +__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { + return __nv_vcmpgts4(__a, __b); +} +__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { + return __nv_vcmpgtu2(__a, __b); +} +__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { + return __nv_vcmpgtu4(__a, __b); +} +__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { + return __nv_vcmples2(__a, __b); +} +__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { + return __nv_vcmples4(__a, __b); +} +__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { + return __nv_vcmpleu2(__a, __b); +} +__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { + return __nv_vcmpleu4(__a, __b); +} +__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { + return __nv_vcmplts2(__a, __b); +} +__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) { + return __nv_vcmplts4(__a, __b); +} +__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { + return __nv_vcmpltu2(__a, __b); +} +__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) { + return __nv_vcmpltu4(__a, __b); +} +__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { + return __nv_vcmpne2(__a, __b); +} +__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) { + return __nv_vcmpne4(__a, __b); +} +__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { + return __nv_vhaddu2(__a, __b); +} +__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { + return __nv_vhaddu4(__a, __b); +} +__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { + return __nv_vmaxs2(__a, __b); +} +__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) { + return __nv_vmaxs4(__a, __b); +} +__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) { + return __nv_vmaxu2(__a, __b); +} +__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) { + return __nv_vmaxu4(__a, __b); +} +__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) { + return __nv_vmins2(__a, __b); +} +__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) { + return __nv_vmins4(__a, __b); +} +__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) { + return __nv_vminu2(__a, __b); +} +__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) { + return __nv_vminu4(__a, __b); +} +__DEVICE__ unsigned int __vneg2(unsigned int __a) { return __nv_vneg2(__a); } +__DEVICE__ unsigned int __vneg4(unsigned int __a) { return __nv_vneg4(__a); } +__DEVICE__ unsigned int __vnegss2(unsigned int __a) { + return __nv_vnegss2(__a); +} +__DEVICE__ unsigned int __vnegss4(unsigned int __a) { + return __nv_vnegss4(__a); +} +__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) { + return __nv_vsads2(__a, __b); +} +__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) { + return __nv_vsads4(__a, __b); +} +__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) { + return __nv_vsadu2(__a, __b); +} +__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { + return __nv_vsadu4(__a, __b); +} +__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) { + return __nv_vseteq2(__a, __b); +} +__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) { + return __nv_vseteq4(__a, __b); +} +__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) { + return __nv_vsetges2(__a, __b); +} +__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) { + return __nv_vsetges4(__a, __b); +} +__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) { + return __nv_vsetgeu2(__a, __b); +} +__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) { + return __nv_vsetgeu4(__a, __b); +} +__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) { + return __nv_vsetgts2(__a, __b); +} +__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) { + return __nv_vsetgts4(__a, __b); +} +__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) { + return __nv_vsetgtu2(__a, __b); +} +__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) { + return __nv_vsetgtu4(__a, __b); +} +__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) { + return __nv_vsetles2(__a, __b); +} +__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) { + return __nv_vsetles4(__a, __b); +} +__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) { + return __nv_vsetleu2(__a, __b); +} +__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) { + return __nv_vsetleu4(__a, __b); +} +__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) { + return __nv_vsetlts2(__a, __b); +} +__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) { + return __nv_vsetlts4(__a, __b); +} +__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) { + return __nv_vsetltu2(__a, __b); +} +__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) { + return __nv_vsetltu4(__a, __b); +} +__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) { + return __nv_vsetne2(__a, __b); +} +__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) { + return __nv_vsetne4(__a, __b); +} +__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { + return __nv_vsub2(__a, __b); +} +__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { + return __nv_vsub4(__a, __b); +} +__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) { + return __nv_vsubss2(__a, __b); +} +__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) { + return __nv_vsubss4(__a, __b); +} +__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) { + return __nv_vsubus2(__a, __b); +} +__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { + return __nv_vsubus4(__a, __b); +} +#else // CUDA_VERSION >= 9020 +// CUDA no longer provides inline assembly (or bitcode) implementation of these +// functions, so we have to reimplment them. The implementation is naive and is +// not optimized for performance. + +// Helper function to convert N-bit boolean subfields into all-0 or all-1. +// E.g. __bool2mask(0x01000100,8) -> 0xff00ff00 +// __bool2mask(0x00010000,16) -> 0xffff0000 +__DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) { + return (__a << shift) - __a; +} +__DEVICE__ unsigned int __vabs2(unsigned int __a) { + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabs4(unsigned int __a) { + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsss2(unsigned int __a) { + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsss4(unsigned int __a) { + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vadd2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vadd4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vavrg2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vavrg4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vavrg2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vavrg4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset2.u32.u32.eq %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vseteq2(__a, __b), 16); +} +__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset4.u32.u32.eq %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vseteq4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset2.s32.s32.ge %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetges2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset4.s32.s32.ge %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetges4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset2.u32.u32.ge %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgeu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset4.u32.u32.ge %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgeu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset2.s32.s32.gt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgts2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset4.s32.s32.gt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgts4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset2.u32.u32.gt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgtu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset4.u32.u32.gt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgtu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset2.s32.s32.le %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetles2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset4.s32.s32.le %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetles4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset2.u32.u32.le %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetleu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset4.u32.u32.le %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetleu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset2.s32.s32.lt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetlts2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset4.s32.s32.lt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetlts4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset2.u32.u32.lt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetltu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset4.u32.u32.lt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetltu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset2.u32.u32.ne %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetne2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vset4.u32.u32.ne %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetne4(__a, __b), 8); +} + +// Based on ITEM 23 in AIM-239: http://dspace.mit.edu/handle/1721.1/6086 +// (a & b) + (a | b) = a + b = (a ^ b) + 2 * (a & b) => +// (a + b) / 2 = ((a ^ b) >> 1) + (a & b) +// To operate on multiple sub-elements we need to make sure to mask out bits +// that crossed over into adjacent elements during the shift. +__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { + return (((__a ^ __b) >> 1) & ~0x80008000u) + (__a & __b); +} +__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { + return (((__a ^ __b) >> 1) & ~0x80808080u) + (__a & __b); +} + +__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { + unsigned int r; + if ((__a & 0x8000) && (__b & 0x8000)) { + // Work around a bug in ptxas which produces invalid result if low element + // is negative. + unsigned mask = __vcmpgts2(__a, __b); + r = (__a & mask) | (__b & ~mask); + } else { + __asm__("vmax2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + } + return r; +} +__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vmax4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vmax2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vmax4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vmin2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vmin4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vmin2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vmin4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); } + +__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); } +__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vsub2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vnegss2(unsigned int __a) { + return __vsubss2(0, __a); +} +__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vnegss4(unsigned int __a) { + return __vsubss4(0, __a); +} +__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vsub2.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { + unsigned int r; + __asm__("vsub4.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +#endif // CUDA_VERSION >= 9020 + +// For OpenMP we require the user to include as we need to know what +// clock_t is on the system. +#ifndef __OPENMP_NVPTX__ +__DEVICE__ /* clock_t= */ int clock() { return __nvvm_read_ptx_sreg_clock(); } +#endif +__DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); } + +// These functions shouldn't be declared when including this header +// for math function resolution purposes. +#ifndef __OPENMP_NVPTX__ +__DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) { + return __builtin_memcpy(__a, __b, __c); +} +__DEVICE__ void *memset(void *__a, int __b, size_t __c) { + return __builtin_memset(__a, __b, __c); +} +#endif + +#pragma pop_macro("__DEVICE__") +#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__ diff --git a/openmp/libomptarget/DeviceLib/include/Headers/__clang_cuda_libdevice_declares.h b/openmp/libomptarget/DeviceLib/include/Headers/__clang_cuda_libdevice_declares.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceLib/include/Headers/__clang_cuda_libdevice_declares.h @@ -0,0 +1,468 @@ +/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__ +#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__ + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__OPENMP_NVPTX__) +#define __DEVICE__ +#pragma omp begin assumes ext_spmd_amenable no_openmp +#elif defined(__CUDA__) +#define __DEVICE__ __device__ +#endif + +__DEVICE__ int __nv_abs(int __a); +__DEVICE__ double __nv_acos(double __a); +__DEVICE__ float __nv_acosf(float __a); +__DEVICE__ double __nv_acosh(double __a); +__DEVICE__ float __nv_acoshf(float __a); +__DEVICE__ double __nv_asin(double __a); +__DEVICE__ float __nv_asinf(float __a); +__DEVICE__ double __nv_asinh(double __a); +__DEVICE__ float __nv_asinhf(float __a); +__DEVICE__ double __nv_atan2(double __a, double __b); +__DEVICE__ float __nv_atan2f(float __a, float __b); +__DEVICE__ double __nv_atan(double __a); +__DEVICE__ float __nv_atanf(float __a); +__DEVICE__ double __nv_atanh(double __a); +__DEVICE__ float __nv_atanhf(float __a); +__DEVICE__ int __nv_brev(int __a); +__DEVICE__ long long __nv_brevll(long long __a); +__DEVICE__ int __nv_byte_perm(int __a, int __b, int __c); +__DEVICE__ double __nv_cbrt(double __a); +__DEVICE__ float __nv_cbrtf(float __a); +__DEVICE__ double __nv_ceil(double __a); +__DEVICE__ float __nv_ceilf(float __a); +__DEVICE__ int __nv_clz(int __a); +__DEVICE__ int __nv_clzll(long long __a); +__DEVICE__ double __nv_copysign(double __a, double __b); +__DEVICE__ float __nv_copysignf(float __a, float __b); +__DEVICE__ double __nv_cos(double __a); +__DEVICE__ float __nv_cosf(float __a); +__DEVICE__ double __nv_cosh(double __a); +__DEVICE__ float __nv_coshf(float __a); +__DEVICE__ double __nv_cospi(double __a); +__DEVICE__ float __nv_cospif(float __a); +__DEVICE__ double __nv_cyl_bessel_i0(double __a); +__DEVICE__ float __nv_cyl_bessel_i0f(float __a); +__DEVICE__ double __nv_cyl_bessel_i1(double __a); +__DEVICE__ float __nv_cyl_bessel_i1f(float __a); +__DEVICE__ double __nv_dadd_rd(double __a, double __b); +__DEVICE__ double __nv_dadd_rn(double __a, double __b); +__DEVICE__ double __nv_dadd_ru(double __a, double __b); +__DEVICE__ double __nv_dadd_rz(double __a, double __b); +__DEVICE__ double __nv_ddiv_rd(double __a, double __b); +__DEVICE__ double __nv_ddiv_rn(double __a, double __b); +__DEVICE__ double __nv_ddiv_ru(double __a, double __b); +__DEVICE__ double __nv_ddiv_rz(double __a, double __b); +__DEVICE__ double __nv_dmul_rd(double __a, double __b); +__DEVICE__ double __nv_dmul_rn(double __a, double __b); +__DEVICE__ double __nv_dmul_ru(double __a, double __b); +__DEVICE__ double __nv_dmul_rz(double __a, double __b); +__DEVICE__ float __nv_double2float_rd(double __a); +__DEVICE__ float __nv_double2float_rn(double __a); +__DEVICE__ float __nv_double2float_ru(double __a); +__DEVICE__ float __nv_double2float_rz(double __a); +__DEVICE__ int __nv_double2hiint(double __a); +__DEVICE__ int __nv_double2int_rd(double __a); +__DEVICE__ int __nv_double2int_rn(double __a); +__DEVICE__ int __nv_double2int_ru(double __a); +__DEVICE__ int __nv_double2int_rz(double __a); +__DEVICE__ long long __nv_double2ll_rd(double __a); +__DEVICE__ long long __nv_double2ll_rn(double __a); +__DEVICE__ long long __nv_double2ll_ru(double __a); +__DEVICE__ long long __nv_double2ll_rz(double __a); +__DEVICE__ int __nv_double2loint(double __a); +__DEVICE__ unsigned int __nv_double2uint_rd(double __a); +__DEVICE__ unsigned int __nv_double2uint_rn(double __a); +__DEVICE__ unsigned int __nv_double2uint_ru(double __a); +__DEVICE__ unsigned int __nv_double2uint_rz(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rd(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rn(double __a); +__DEVICE__ unsigned long long __nv_double2ull_ru(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rz(double __a); +__DEVICE__ unsigned long long __nv_double_as_longlong(double __a); +__DEVICE__ double __nv_drcp_rd(double __a); +__DEVICE__ double __nv_drcp_rn(double __a); +__DEVICE__ double __nv_drcp_ru(double __a); +__DEVICE__ double __nv_drcp_rz(double __a); +__DEVICE__ double __nv_dsqrt_rd(double __a); +__DEVICE__ double __nv_dsqrt_rn(double __a); +__DEVICE__ double __nv_dsqrt_ru(double __a); +__DEVICE__ double __nv_dsqrt_rz(double __a); +__DEVICE__ double __nv_dsub_rd(double __a, double __b); +__DEVICE__ double __nv_dsub_rn(double __a, double __b); +__DEVICE__ double __nv_dsub_ru(double __a, double __b); +__DEVICE__ double __nv_dsub_rz(double __a, double __b); +__DEVICE__ double __nv_erfc(double __a); +__DEVICE__ float __nv_erfcf(float __a); +__DEVICE__ double __nv_erfcinv(double __a); +__DEVICE__ float __nv_erfcinvf(float __a); +__DEVICE__ double __nv_erfcx(double __a); +__DEVICE__ float __nv_erfcxf(float __a); +__DEVICE__ double __nv_erf(double __a); +__DEVICE__ float __nv_erff(float __a); +__DEVICE__ double __nv_erfinv(double __a); +__DEVICE__ float __nv_erfinvf(float __a); +__DEVICE__ double __nv_exp10(double __a); +__DEVICE__ float __nv_exp10f(float __a); +__DEVICE__ double __nv_exp2(double __a); +__DEVICE__ float __nv_exp2f(float __a); +__DEVICE__ double __nv_exp(double __a); +__DEVICE__ float __nv_expf(float __a); +__DEVICE__ double __nv_expm1(double __a); +__DEVICE__ float __nv_expm1f(float __a); +__DEVICE__ double __nv_fabs(double __a); +__DEVICE__ float __nv_fabsf(float __a); +__DEVICE__ float __nv_fadd_rd(float __a, float __b); +__DEVICE__ float __nv_fadd_rn(float __a, float __b); +__DEVICE__ float __nv_fadd_ru(float __a, float __b); +__DEVICE__ float __nv_fadd_rz(float __a, float __b); +__DEVICE__ float __nv_fast_cosf(float __a); +__DEVICE__ float __nv_fast_exp10f(float __a); +__DEVICE__ float __nv_fast_expf(float __a); +__DEVICE__ float __nv_fast_fdividef(float __a, float __b); +__DEVICE__ float __nv_fast_log10f(float __a); +__DEVICE__ float __nv_fast_log2f(float __a); +__DEVICE__ float __nv_fast_logf(float __a); +__DEVICE__ float __nv_fast_powf(float __a, float __b); +__DEVICE__ void __nv_fast_sincosf(float __a, float *__s, float *__c); +__DEVICE__ float __nv_fast_sinf(float __a); +__DEVICE__ float __nv_fast_tanf(float __a); +__DEVICE__ double __nv_fdim(double __a, double __b); +__DEVICE__ float __nv_fdimf(float __a, float __b); +__DEVICE__ float __nv_fdiv_rd(float __a, float __b); +__DEVICE__ float __nv_fdiv_rn(float __a, float __b); +__DEVICE__ float __nv_fdiv_ru(float __a, float __b); +__DEVICE__ float __nv_fdiv_rz(float __a, float __b); +__DEVICE__ int __nv_ffs(int __a); +__DEVICE__ int __nv_ffsll(long long __a); +__DEVICE__ int __nv_finitef(float __a); +__DEVICE__ unsigned short __nv_float2half_rn(float __a); +__DEVICE__ int __nv_float2int_rd(float __a); +__DEVICE__ int __nv_float2int_rn(float __a); +__DEVICE__ int __nv_float2int_ru(float __a); +__DEVICE__ int __nv_float2int_rz(float __a); +__DEVICE__ long long __nv_float2ll_rd(float __a); +__DEVICE__ long long __nv_float2ll_rn(float __a); +__DEVICE__ long long __nv_float2ll_ru(float __a); +__DEVICE__ long long __nv_float2ll_rz(float __a); +__DEVICE__ unsigned int __nv_float2uint_rd(float __a); +__DEVICE__ unsigned int __nv_float2uint_rn(float __a); +__DEVICE__ unsigned int __nv_float2uint_ru(float __a); +__DEVICE__ unsigned int __nv_float2uint_rz(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rd(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rn(float __a); +__DEVICE__ unsigned long long __nv_float2ull_ru(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rz(float __a); +__DEVICE__ int __nv_float_as_int(float __a); +__DEVICE__ unsigned int __nv_float_as_uint(float __a); +__DEVICE__ double __nv_floor(double __a); +__DEVICE__ float __nv_floorf(float __a); +__DEVICE__ double __nv_fma(double __a, double __b, double __c); +__DEVICE__ float __nv_fmaf(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rd(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rn(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ru(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rz(float __a, float __b, float __c); +__DEVICE__ double __nv_fma_rd(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_rn(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_ru(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_rz(double __a, double __b, double __c); +__DEVICE__ double __nv_fmax(double __a, double __b); +__DEVICE__ float __nv_fmaxf(float __a, float __b); +__DEVICE__ double __nv_fmin(double __a, double __b); +__DEVICE__ float __nv_fminf(float __a, float __b); +__DEVICE__ double __nv_fmod(double __a, double __b); +__DEVICE__ float __nv_fmodf(float __a, float __b); +__DEVICE__ float __nv_fmul_rd(float __a, float __b); +__DEVICE__ float __nv_fmul_rn(float __a, float __b); +__DEVICE__ float __nv_fmul_ru(float __a, float __b); +__DEVICE__ float __nv_fmul_rz(float __a, float __b); +__DEVICE__ float __nv_frcp_rd(float __a); +__DEVICE__ float __nv_frcp_rn(float __a); +__DEVICE__ float __nv_frcp_ru(float __a); +__DEVICE__ float __nv_frcp_rz(float __a); +__DEVICE__ double __nv_frexp(double __a, int *__b); +__DEVICE__ float __nv_frexpf(float __a, int *__b); +__DEVICE__ float __nv_frsqrt_rn(float __a); +__DEVICE__ float __nv_fsqrt_rd(float __a); +__DEVICE__ float __nv_fsqrt_rn(float __a); +__DEVICE__ float __nv_fsqrt_ru(float __a); +__DEVICE__ float __nv_fsqrt_rz(float __a); +__DEVICE__ float __nv_fsub_rd(float __a, float __b); +__DEVICE__ float __nv_fsub_rn(float __a, float __b); +__DEVICE__ float __nv_fsub_ru(float __a, float __b); +__DEVICE__ float __nv_fsub_rz(float __a, float __b); +__DEVICE__ int __nv_hadd(int __a, int __b); +__DEVICE__ float __nv_half2float(unsigned short __h); +__DEVICE__ double __nv_hiloint2double(int __a, int __b); +__DEVICE__ double __nv_hypot(double __a, double __b); +__DEVICE__ float __nv_hypotf(float __a, float __b); +__DEVICE__ int __nv_ilogb(double __a); +__DEVICE__ int __nv_ilogbf(float __a); +__DEVICE__ double __nv_int2double_rn(int __a); +__DEVICE__ float __nv_int2float_rd(int __a); +__DEVICE__ float __nv_int2float_rn(int __a); +__DEVICE__ float __nv_int2float_ru(int __a); +__DEVICE__ float __nv_int2float_rz(int __a); +__DEVICE__ float __nv_int_as_float(int __a); +__DEVICE__ int __nv_isfinited(double __a); +__DEVICE__ int __nv_isinfd(double __a); +__DEVICE__ int __nv_isinff(float __a); +__DEVICE__ int __nv_isnand(double __a); +__DEVICE__ int __nv_isnanf(float __a); +__DEVICE__ double __nv_j0(double __a); +__DEVICE__ float __nv_j0f(float __a); +__DEVICE__ double __nv_j1(double __a); +__DEVICE__ float __nv_j1f(float __a); +__DEVICE__ float __nv_jnf(int __a, float __b); +__DEVICE__ double __nv_jn(int __a, double __b); +__DEVICE__ double __nv_ldexp(double __a, int __b); +__DEVICE__ float __nv_ldexpf(float __a, int __b); +__DEVICE__ double __nv_lgamma(double __a); +__DEVICE__ float __nv_lgammaf(float __a); +__DEVICE__ double __nv_ll2double_rd(long long __a); +__DEVICE__ double __nv_ll2double_rn(long long __a); +__DEVICE__ double __nv_ll2double_ru(long long __a); +__DEVICE__ double __nv_ll2double_rz(long long __a); +__DEVICE__ float __nv_ll2float_rd(long long __a); +__DEVICE__ float __nv_ll2float_rn(long long __a); +__DEVICE__ float __nv_ll2float_ru(long long __a); +__DEVICE__ float __nv_ll2float_rz(long long __a); +__DEVICE__ long long __nv_llabs(long long __a); +__DEVICE__ long long __nv_llmax(long long __a, long long __b); +__DEVICE__ long long __nv_llmin(long long __a, long long __b); +__DEVICE__ long long __nv_llrint(double __a); +__DEVICE__ long long __nv_llrintf(float __a); +__DEVICE__ long long __nv_llround(double __a); +__DEVICE__ long long __nv_llroundf(float __a); +__DEVICE__ double __nv_log10(double __a); +__DEVICE__ float __nv_log10f(float __a); +__DEVICE__ double __nv_log1p(double __a); +__DEVICE__ float __nv_log1pf(float __a); +__DEVICE__ double __nv_log2(double __a); +__DEVICE__ float __nv_log2f(float __a); +__DEVICE__ double __nv_logb(double __a); +__DEVICE__ float __nv_logbf(float __a); +__DEVICE__ double __nv_log(double __a); +__DEVICE__ float __nv_logf(float __a); +__DEVICE__ double __nv_longlong_as_double(long long __a); +__DEVICE__ int __nv_max(int __a, int __b); +__DEVICE__ int __nv_min(int __a, int __b); +__DEVICE__ double __nv_modf(double __a, double *__b); +__DEVICE__ float __nv_modff(float __a, float *__b); +__DEVICE__ int __nv_mul24(int __a, int __b); +__DEVICE__ long long __nv_mul64hi(long long __a, long long __b); +__DEVICE__ int __nv_mulhi(int __a, int __b); +__DEVICE__ double __nv_nan(const signed char *__a); +__DEVICE__ float __nv_nanf(const signed char *__a); +__DEVICE__ double __nv_nearbyint(double __a); +__DEVICE__ float __nv_nearbyintf(float __a); +__DEVICE__ double __nv_nextafter(double __a, double __b); +__DEVICE__ float __nv_nextafterf(float __a, float __b); +__DEVICE__ double __nv_norm3d(double __a, double __b, double __c); +__DEVICE__ float __nv_norm3df(float __a, float __b, float __c); +__DEVICE__ double __nv_norm4d(double __a, double __b, double __c, double __d); +__DEVICE__ float __nv_norm4df(float __a, float __b, float __c, float __d); +__DEVICE__ double __nv_normcdf(double __a); +__DEVICE__ float __nv_normcdff(float __a); +__DEVICE__ double __nv_normcdfinv(double __a); +__DEVICE__ float __nv_normcdfinvf(float __a); +__DEVICE__ float __nv_normf(int __a, const float *__b); +__DEVICE__ double __nv_norm(int __a, const double *__b); +__DEVICE__ int __nv_popc(int __a); +__DEVICE__ int __nv_popcll(long long __a); +__DEVICE__ double __nv_pow(double __a, double __b); +__DEVICE__ float __nv_powf(float __a, float __b); +__DEVICE__ double __nv_powi(double __a, int __b); +__DEVICE__ float __nv_powif(float __a, int __b); +__DEVICE__ double __nv_rcbrt(double __a); +__DEVICE__ float __nv_rcbrtf(float __a); +__DEVICE__ double __nv_rcp64h(double __a); +__DEVICE__ double __nv_remainder(double __a, double __b); +__DEVICE__ float __nv_remainderf(float __a, float __b); +__DEVICE__ double __nv_remquo(double __a, double __b, int *__c); +__DEVICE__ float __nv_remquof(float __a, float __b, int *__c); +__DEVICE__ int __nv_rhadd(int __a, int __b); +__DEVICE__ double __nv_rhypot(double __a, double __b); +__DEVICE__ float __nv_rhypotf(float __a, float __b); +__DEVICE__ double __nv_rint(double __a); +__DEVICE__ float __nv_rintf(float __a); +__DEVICE__ double __nv_rnorm3d(double __a, double __b, double __c); +__DEVICE__ float __nv_rnorm3df(float __a, float __b, float __c); +__DEVICE__ double __nv_rnorm4d(double __a, double __b, double __c, double __d); +__DEVICE__ float __nv_rnorm4df(float __a, float __b, float __c, float __d); +__DEVICE__ float __nv_rnormf(int __a, const float *__b); +__DEVICE__ double __nv_rnorm(int __a, const double *__b); +__DEVICE__ double __nv_round(double __a); +__DEVICE__ float __nv_roundf(float __a); +__DEVICE__ double __nv_rsqrt(double __a); +__DEVICE__ float __nv_rsqrtf(float __a); +__DEVICE__ int __nv_sad(int __a, int __b, int __c); +__DEVICE__ float __nv_saturatef(float __a); +__DEVICE__ double __nv_scalbn(double __a, int __b); +__DEVICE__ float __nv_scalbnf(float __a, int __b); +__DEVICE__ int __nv_signbitd(double __a); +__DEVICE__ int __nv_signbitf(float __a); +__DEVICE__ void __nv_sincos(double __a, double *__b, double *__c); +__DEVICE__ void __nv_sincosf(float __a, float *__b, float *__c); +__DEVICE__ void __nv_sincospi(double __a, double *__b, double *__c); +__DEVICE__ void __nv_sincospif(float __a, float *__b, float *__c); +__DEVICE__ double __nv_sin(double __a); +__DEVICE__ float __nv_sinf(float __a); +__DEVICE__ double __nv_sinh(double __a); +__DEVICE__ float __nv_sinhf(float __a); +__DEVICE__ double __nv_sinpi(double __a); +__DEVICE__ float __nv_sinpif(float __a); +__DEVICE__ double __nv_sqrt(double __a); +__DEVICE__ float __nv_sqrtf(float __a); +__DEVICE__ double __nv_tan(double __a); +__DEVICE__ float __nv_tanf(float __a); +__DEVICE__ double __nv_tanh(double __a); +__DEVICE__ float __nv_tanhf(float __a); +__DEVICE__ double __nv_tgamma(double __a); +__DEVICE__ float __nv_tgammaf(float __a); +__DEVICE__ double __nv_trunc(double __a); +__DEVICE__ float __nv_truncf(float __a); +__DEVICE__ int __nv_uhadd(unsigned int __a, unsigned int __b); +__DEVICE__ double __nv_uint2double_rn(unsigned int __i); +__DEVICE__ float __nv_uint2float_rd(unsigned int __a); +__DEVICE__ float __nv_uint2float_rn(unsigned int __a); +__DEVICE__ float __nv_uint2float_ru(unsigned int __a); +__DEVICE__ float __nv_uint2float_rz(unsigned int __a); +__DEVICE__ float __nv_uint_as_float(unsigned int __a); +__DEVICE__ double __nv_ull2double_rd(unsigned long long __a); +__DEVICE__ double __nv_ull2double_rn(unsigned long long __a); +__DEVICE__ double __nv_ull2double_ru(unsigned long long __a); +__DEVICE__ double __nv_ull2double_rz(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rd(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rn(unsigned long long __a); +__DEVICE__ float __nv_ull2float_ru(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rz(unsigned long long __a); +__DEVICE__ unsigned long long __nv_ullmax(unsigned long long __a, + unsigned long long __b); +__DEVICE__ unsigned long long __nv_ullmin(unsigned long long __a, + unsigned long long __b); +__DEVICE__ unsigned int __nv_umax(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_umin(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned long long __nv_umul64hi(unsigned long long __a, + unsigned long long __b); +__DEVICE__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_usad(unsigned int __a, unsigned int __b, + unsigned int __c); +#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020 +__DEVICE__ int __nv_vabs2(int __a); +__DEVICE__ int __nv_vabs4(int __a); +__DEVICE__ int __nv_vabsdiffs2(int __a, int __b); +__DEVICE__ int __nv_vabsdiffs4(int __a, int __b); +__DEVICE__ int __nv_vabsdiffu2(int __a, int __b); +__DEVICE__ int __nv_vabsdiffu4(int __a, int __b); +__DEVICE__ int __nv_vabsss2(int __a); +__DEVICE__ int __nv_vabsss4(int __a); +__DEVICE__ int __nv_vadd2(int __a, int __b); +__DEVICE__ int __nv_vadd4(int __a, int __b); +__DEVICE__ int __nv_vaddss2(int __a, int __b); +__DEVICE__ int __nv_vaddss4(int __a, int __b); +__DEVICE__ int __nv_vaddus2(int __a, int __b); +__DEVICE__ int __nv_vaddus4(int __a, int __b); +__DEVICE__ int __nv_vavgs2(int __a, int __b); +__DEVICE__ int __nv_vavgs4(int __a, int __b); +__DEVICE__ int __nv_vavgu2(int __a, int __b); +__DEVICE__ int __nv_vavgu4(int __a, int __b); +__DEVICE__ int __nv_vcmpeq2(int __a, int __b); +__DEVICE__ int __nv_vcmpeq4(int __a, int __b); +__DEVICE__ int __nv_vcmpges2(int __a, int __b); +__DEVICE__ int __nv_vcmpges4(int __a, int __b); +__DEVICE__ int __nv_vcmpgeu2(int __a, int __b); +__DEVICE__ int __nv_vcmpgeu4(int __a, int __b); +__DEVICE__ int __nv_vcmpgts2(int __a, int __b); +__DEVICE__ int __nv_vcmpgts4(int __a, int __b); +__DEVICE__ int __nv_vcmpgtu2(int __a, int __b); +__DEVICE__ int __nv_vcmpgtu4(int __a, int __b); +__DEVICE__ int __nv_vcmples2(int __a, int __b); +__DEVICE__ int __nv_vcmples4(int __a, int __b); +__DEVICE__ int __nv_vcmpleu2(int __a, int __b); +__DEVICE__ int __nv_vcmpleu4(int __a, int __b); +__DEVICE__ int __nv_vcmplts2(int __a, int __b); +__DEVICE__ int __nv_vcmplts4(int __a, int __b); +__DEVICE__ int __nv_vcmpltu2(int __a, int __b); +__DEVICE__ int __nv_vcmpltu4(int __a, int __b); +__DEVICE__ int __nv_vcmpne2(int __a, int __b); +__DEVICE__ int __nv_vcmpne4(int __a, int __b); +__DEVICE__ int __nv_vhaddu2(int __a, int __b); +__DEVICE__ int __nv_vhaddu4(int __a, int __b); +__DEVICE__ int __nv_vmaxs2(int __a, int __b); +__DEVICE__ int __nv_vmaxs4(int __a, int __b); +__DEVICE__ int __nv_vmaxu2(int __a, int __b); +__DEVICE__ int __nv_vmaxu4(int __a, int __b); +__DEVICE__ int __nv_vmins2(int __a, int __b); +__DEVICE__ int __nv_vmins4(int __a, int __b); +__DEVICE__ int __nv_vminu2(int __a, int __b); +__DEVICE__ int __nv_vminu4(int __a, int __b); +__DEVICE__ int __nv_vneg2(int __a); +__DEVICE__ int __nv_vneg4(int __a); +__DEVICE__ int __nv_vnegss2(int __a); +__DEVICE__ int __nv_vnegss4(int __a); +__DEVICE__ int __nv_vsads2(int __a, int __b); +__DEVICE__ int __nv_vsads4(int __a, int __b); +__DEVICE__ int __nv_vsadu2(int __a, int __b); +__DEVICE__ int __nv_vsadu4(int __a, int __b); +__DEVICE__ int __nv_vseteq2(int __a, int __b); +__DEVICE__ int __nv_vseteq4(int __a, int __b); +__DEVICE__ int __nv_vsetges2(int __a, int __b); +__DEVICE__ int __nv_vsetges4(int __a, int __b); +__DEVICE__ int __nv_vsetgeu2(int __a, int __b); +__DEVICE__ int __nv_vsetgeu4(int __a, int __b); +__DEVICE__ int __nv_vsetgts2(int __a, int __b); +__DEVICE__ int __nv_vsetgts4(int __a, int __b); +__DEVICE__ int __nv_vsetgtu2(int __a, int __b); +__DEVICE__ int __nv_vsetgtu4(int __a, int __b); +__DEVICE__ int __nv_vsetles2(int __a, int __b); +__DEVICE__ int __nv_vsetles4(int __a, int __b); +__DEVICE__ int __nv_vsetleu2(int __a, int __b); +__DEVICE__ int __nv_vsetleu4(int __a, int __b); +__DEVICE__ int __nv_vsetlts2(int __a, int __b); +__DEVICE__ int __nv_vsetlts4(int __a, int __b); +__DEVICE__ int __nv_vsetltu2(int __a, int __b); +__DEVICE__ int __nv_vsetltu4(int __a, int __b); +__DEVICE__ int __nv_vsetne2(int __a, int __b); +__DEVICE__ int __nv_vsetne4(int __a, int __b); +__DEVICE__ int __nv_vsub2(int __a, int __b); +__DEVICE__ int __nv_vsub4(int __a, int __b); +__DEVICE__ int __nv_vsubss2(int __a, int __b); +__DEVICE__ int __nv_vsubss4(int __a, int __b); +__DEVICE__ int __nv_vsubus2(int __a, int __b); +__DEVICE__ int __nv_vsubus4(int __a, int __b); +#endif // CUDA_VERSION +__DEVICE__ double __nv_y0(double __a); +__DEVICE__ float __nv_y0f(float __a); +__DEVICE__ double __nv_y1(double __a); +__DEVICE__ float __nv_y1f(float __a); +__DEVICE__ float __nv_ynf(int __a, float __b); +__DEVICE__ double __nv_yn(int __a, double __b); + +#if defined(__OPENMP_NVPTX__) +#pragma omp end assumes ext_spmd_amenable no_openmp +#endif + +#if defined(__cplusplus) +} // extern "C" +#endif +#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__ diff --git a/openmp/libomptarget/DeviceLib/include/Headers/__clang_cuda_math.h b/openmp/libomptarget/DeviceLib/include/Headers/__clang_cuda_math.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceLib/include/Headers/__clang_cuda_math.h @@ -0,0 +1,340 @@ +/*===---- __clang_cuda_math.h - Device-side CUDA math support --------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __CLANG_CUDA_MATH_H__ +#define __CLANG_CUDA_MATH_H__ +#ifndef __CUDA__ +#error "This file is for CUDA compilation only." +#endif + +#ifndef __OPENMP_NVPTX__ +#if CUDA_VERSION < 9000 +#error This file is intended to be used with CUDA-9+ only. +#endif +#endif + +// __DEVICE__ is a helper macro with common set of attributes for the wrappers +// we implement in this file. We need static in order to avoid emitting unused +// functions and __forceinline__ helps inlining these wrappers at -O1. +#pragma push_macro("__DEVICE__") +#ifdef __OPENMP_NVPTX__ +#if defined(__cplusplus) +#define __DEVICE__ __attribute__((always_inline, nothrow)) +#else +#define __DEVICE__ __attribute__((always_inline, nothrow)) +#endif +#else +#define __DEVICE__ __device__ __forceinline__ +#endif + +// Specialized version of __DEVICE__ for functions with void return type. Needed +// because the OpenMP overlay requires constexpr functions here but prior to +// c++14 void return functions could not be constexpr. +#pragma push_macro("__DEVICE_VOID__") +#if defined(__OPENMP_NVPTX__) && defined(__cplusplus) && __cplusplus < 201402L +#define __DEVICE_VOID__ __attribute__((always_inline, nothrow)) +#else +#define __DEVICE_VOID__ __DEVICE__ +#endif + +// libdevice provides fast low precision and slow full-recision implementations +// for some functions. Which one gets selected depends on +// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if +// -ffast-math or -fcuda-approx-transcendentals are in effect. +#pragma push_macro("__FAST_OR_SLOW") +#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__) +#define __FAST_OR_SLOW(fast, slow) fast +#else +#define __FAST_OR_SLOW(fast, slow) slow +#endif + +__DEVICE__ int abs(int __a) { return __nv_abs(__a); } +__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); } +__DEVICE__ double acos(double __a) { return __nv_acos(__a); } +__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); } +__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); } +__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); } +__DEVICE__ double asin(double __a) { return __nv_asin(__a); } +__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); } +__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); } +__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); } +__DEVICE__ double atan(double __a) { return __nv_atan(__a); } +__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); } +__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); } +__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); } +__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); } +__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); } +__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); } +__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); } +__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); } +__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); } +__DEVICE__ double copysign(double __a, double __b) { + return __nv_copysign(__a, __b); +} +__DEVICE__ float copysignf(float __a, float __b) { + return __nv_copysignf(__a, __b); +} +__DEVICE__ double cos(double __a) { return __nv_cos(__a); } +__DEVICE__ float cosf(float __a) { + return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a); +} +__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); } +__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); } +__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); } +__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); } +__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); } +__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); } +__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); } +__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); } +__DEVICE__ double erf(double __a) { return __nv_erf(__a); } +__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); } +__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); } +__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); } +__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); } +__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); } +__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); } +__DEVICE__ float erff(float __a) { return __nv_erff(__a); } +__DEVICE__ double erfinv(double __a) { return __nv_erfinv(__a); } +__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); } +__DEVICE__ double exp(double __a) { return __nv_exp(__a); } +__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); } +__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); } +__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); } +__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); } +__DEVICE__ float expf(float __a) { return __nv_expf(__a); } +__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); } +__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); } +__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); } +__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); } +__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); } +__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; } +__DEVICE__ float fdividef(float __a, float __b) { +#if __FAST_MATH__ && !__CUDA_PREC_DIV + return __nv_fast_fdividef(__a, __b); +#else + return __a / __b; +#endif +} +__DEVICE__ double floor(double __f) { return __nv_floor(__f); } +__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); } +__DEVICE__ double fma(double __a, double __b, double __c) { + return __nv_fma(__a, __b, __c); +} +__DEVICE__ float fmaf(float __a, float __b, float __c) { + return __nv_fmaf(__a, __b, __c); +} +__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); } +__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); } +__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); } +__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); } +__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); } +__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); } +__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); } +__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); } +__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); } +__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); } +__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); } +__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); } +__DEVICE__ double j0(double __a) { return __nv_j0(__a); } +__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); } +__DEVICE__ double j1(double __a) { return __nv_j1(__a); } +__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); } +__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); } +__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); } +#if defined(__LP64__) || defined(_WIN64) +__DEVICE__ long labs(long __a) { return __nv_llabs(__a); }; +#else +__DEVICE__ long labs(long __a) { return __nv_abs(__a); }; +#endif +__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); } +__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); } +__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); } +__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); } +__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); } +__DEVICE__ long long llmax(long long __a, long long __b) { + return __nv_llmax(__a, __b); +} +__DEVICE__ long long llmin(long long __a, long long __b) { + return __nv_llmin(__a, __b); +} +__DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); } +__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); } +__DEVICE__ long long llround(double __a) { return __nv_llround(__a); } +__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); } +__DEVICE__ double round(double __a) { return __nv_round(__a); } +__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); } +__DEVICE__ double log(double __a) { return __nv_log(__a); } +__DEVICE__ double log10(double __a) { return __nv_log10(__a); } +__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); } +__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); } +__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); } +__DEVICE__ double log2(double __a) { return __nv_log2(__a); } +__DEVICE__ float log2f(float __a) { + return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a); +} +__DEVICE__ double logb(double __a) { return __nv_logb(__a); } +__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); } +__DEVICE__ float logf(float __a) { + return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a); +} +#if defined(__LP64__) || defined(_WIN64) +__DEVICE__ long lrint(double __a) { return llrint(__a); } +__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); } +__DEVICE__ long lround(double __a) { return llround(__a); } +__DEVICE__ long lroundf(float __a) { return llroundf(__a); } +#else +__DEVICE__ long lrint(double __a) { return (long)rint(__a); } +__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); } +__DEVICE__ long lround(double __a) { return round(__a); } +__DEVICE__ long lroundf(float __a) { return roundf(__a); } +#endif +__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); } +__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); } +__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); } +__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); } +__DEVICE__ double nearbyint(double __a) { return __builtin_nearbyint(__a); } +__DEVICE__ float nearbyintf(float __a) { return __builtin_nearbyintf(__a); } +__DEVICE__ double nextafter(double __a, double __b) { + return __nv_nextafter(__a, __b); +} +__DEVICE__ float nextafterf(float __a, float __b) { + return __nv_nextafterf(__a, __b); +} +__DEVICE__ double norm(int __dim, const double *__t) { + return __nv_norm(__dim, __t); +} +__DEVICE__ double norm3d(double __a, double __b, double __c) { + return __nv_norm3d(__a, __b, __c); +} +__DEVICE__ float norm3df(float __a, float __b, float __c) { + return __nv_norm3df(__a, __b, __c); +} +__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) { + return __nv_norm4d(__a, __b, __c, __d); +} +__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) { + return __nv_norm4df(__a, __b, __c, __d); +} +__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); } +__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); } +__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); } +__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); } +__DEVICE__ float normf(int __dim, const float *__t) { + return __nv_normf(__dim, __t); +} +__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); } +__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); } +__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); } +__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); } +__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); } +__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); } +__DEVICE__ double remainder(double __a, double __b) { + return __nv_remainder(__a, __b); +} +__DEVICE__ float remainderf(float __a, float __b) { + return __nv_remainderf(__a, __b); +} +__DEVICE__ double remquo(double __a, double __b, int *__c) { + return __nv_remquo(__a, __b, __c); +} +__DEVICE__ float remquof(float __a, float __b, int *__c) { + return __nv_remquof(__a, __b, __c); +} +__DEVICE__ double rhypot(double __a, double __b) { + return __nv_rhypot(__a, __b); +} +__DEVICE__ float rhypotf(float __a, float __b) { + return __nv_rhypotf(__a, __b); +} +// __nv_rint* in libdevice is buggy and produces incorrect results. +__DEVICE__ double rint(double __a) { return __builtin_rint(__a); } +__DEVICE__ float rintf(float __a) { return __builtin_rintf(__a); } +__DEVICE__ double rnorm(int __a, const double *__b) { + return __nv_rnorm(__a, __b); +} +__DEVICE__ double rnorm3d(double __a, double __b, double __c) { + return __nv_rnorm3d(__a, __b, __c); +} +__DEVICE__ float rnorm3df(float __a, float __b, float __c) { + return __nv_rnorm3df(__a, __b, __c); +} +__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) { + return __nv_rnorm4d(__a, __b, __c, __d); +} +__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) { + return __nv_rnorm4df(__a, __b, __c, __d); +} +__DEVICE__ float rnormf(int __dim, const float *__t) { + return __nv_rnormf(__dim, __t); +} +__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); } +__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); } +__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); } +__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); } +__DEVICE__ double scalbln(double __a, long __b) { + return scalbn(__a, (int)__b); +} +__DEVICE__ float scalblnf(float __a, long __b) { + return scalbnf(__a, (int)__b); +} +__DEVICE__ double sin(double __a) { return __nv_sin(__a); } +__DEVICE_VOID__ void sincos(double __a, double *__s, double *__c) { + return __nv_sincos(__a, __s, __c); +} +__DEVICE_VOID__ void sincosf(float __a, float *__s, float *__c) { + return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c); +} +__DEVICE_VOID__ void sincospi(double __a, double *__s, double *__c) { + return __nv_sincospi(__a, __s, __c); +} +__DEVICE_VOID__ void sincospif(float __a, float *__s, float *__c) { + return __nv_sincospif(__a, __s, __c); +} +__DEVICE__ float sinf(float __a) { + return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a); +} +__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); } +__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); } +__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); } +__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); } +__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); } +__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); } +__DEVICE__ double tan(double __a) { return __nv_tan(__a); } +__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); } +__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); } +__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); } +__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); } +__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); } +__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); } +__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); } +__DEVICE__ unsigned long long ullmax(unsigned long long __a, + unsigned long long __b) { + return __nv_ullmax(__a, __b); +} +__DEVICE__ unsigned long long ullmin(unsigned long long __a, + unsigned long long __b) { + return __nv_ullmin(__a, __b); +} +__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) { + return __nv_umax(__a, __b); +} +__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) { + return __nv_umin(__a, __b); +} +__DEVICE__ double y0(double __a) { return __nv_y0(__a); } +__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); } +__DEVICE__ double y1(double __a) { return __nv_y1(__a); } +__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); } +__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); } +__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); } + +#pragma pop_macro("__DEVICE__") +#pragma pop_macro("__DEVICE_VOID__") +#pragma pop_macro("__FAST_OR_SLOW") + +#endif // __CLANG_CUDA_MATH_H__ diff --git a/openmp/libomptarget/DeviceLib/include/Headers/__clang_hip_libdevice_declares.h b/openmp/libomptarget/DeviceLib/include/Headers/__clang_hip_libdevice_declares.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceLib/include/Headers/__clang_hip_libdevice_declares.h @@ -0,0 +1,350 @@ +/*===---- __clang_hip_libdevice_declares.h - HIP device library decls -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__ +#define __CLANG_HIP_LIBDEVICE_DECLARES_H__ + +#ifdef __OPENMP_AMDGCN__ +#define __device__ +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// BEGIN FLOAT +__device__ __attribute__((const)) float __ocml_acos_f32(float); +__device__ __attribute__((pure)) float __ocml_acosh_f32(float); +__device__ __attribute__((const)) float __ocml_asin_f32(float); +__device__ __attribute__((pure)) float __ocml_asinh_f32(float); +__device__ __attribute__((const)) float __ocml_atan2_f32(float, float); +__device__ __attribute__((const)) float __ocml_atan_f32(float); +__device__ __attribute__((pure)) float __ocml_atanh_f32(float); +__device__ __attribute__((pure)) float __ocml_cbrt_f32(float); +__device__ __attribute__((const)) float __ocml_ceil_f32(float); +__device__ __attribute__((const)) __device__ float __ocml_copysign_f32(float, + float); +__device__ float __ocml_cos_f32(float); +__device__ float __ocml_native_cos_f32(float); +__device__ __attribute__((pure)) __device__ float __ocml_cosh_f32(float); +__device__ float __ocml_cospi_f32(float); +__device__ float __ocml_i0_f32(float); +__device__ float __ocml_i1_f32(float); +__device__ __attribute__((pure)) float __ocml_erfc_f32(float); +__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float); +__device__ __attribute__((pure)) float __ocml_erfcx_f32(float); +__device__ __attribute__((pure)) float __ocml_erf_f32(float); +__device__ __attribute__((pure)) float __ocml_erfinv_f32(float); +__device__ __attribute__((pure)) float __ocml_exp10_f32(float); +__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float); +__device__ __attribute__((pure)) float __ocml_exp2_f32(float); +__device__ __attribute__((pure)) float __ocml_exp_f32(float); +__device__ __attribute__((pure)) float __ocml_native_exp_f32(float); +__device__ __attribute__((pure)) float __ocml_expm1_f32(float); +__device__ __attribute__((const)) float __ocml_fabs_f32(float); +__device__ __attribute__((const)) float __ocml_fdim_f32(float, float); +__device__ __attribute__((const)) float __ocml_floor_f32(float); +__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float); +__device__ __attribute__((const)) float __ocml_fmax_f32(float, float); +__device__ __attribute__((const)) float __ocml_fmin_f32(float, float); +__device__ __attribute__((const)) __device__ float __ocml_fmod_f32(float, + float); +__device__ float __ocml_frexp_f32(float, + __attribute__((address_space(5))) int *); +__device__ __attribute__((const)) float __ocml_hypot_f32(float, float); +__device__ __attribute__((const)) int __ocml_ilogb_f32(float); +__device__ __attribute__((const)) int __ocml_isfinite_f32(float); +__device__ __attribute__((const)) int __ocml_isinf_f32(float); +__device__ __attribute__((const)) int __ocml_isnan_f32(float); +__device__ float __ocml_j0_f32(float); +__device__ float __ocml_j1_f32(float); +__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int); +__device__ float __ocml_lgamma_f32(float); +__device__ __attribute__((pure)) float __ocml_log10_f32(float); +__device__ __attribute__((pure)) float __ocml_native_log10_f32(float); +__device__ __attribute__((pure)) float __ocml_log1p_f32(float); +__device__ __attribute__((pure)) float __ocml_log2_f32(float); +__device__ __attribute__((pure)) float __ocml_native_log2_f32(float); +__device__ __attribute__((const)) float __ocml_logb_f32(float); +__device__ __attribute__((pure)) float __ocml_log_f32(float); +__device__ __attribute__((pure)) float __ocml_native_log_f32(float); +__device__ float __ocml_modf_f32(float, + __attribute__((address_space(5))) float *); +__device__ __attribute__((const)) float __ocml_nearbyint_f32(float); +__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float); +__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float); +__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float, + float); +__device__ __attribute__((pure)) float __ocml_ncdf_f32(float); +__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float); +__device__ __attribute__((pure)) float __ocml_pow_f32(float, float); +__device__ __attribute__((pure)) float __ocml_pown_f32(float, int); +__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float); +__device__ __attribute__((const)) float __ocml_remainder_f32(float, float); +__device__ float __ocml_remquo_f32(float, float, + __attribute__((address_space(5))) int *); +__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float); +__device__ __attribute__((const)) float __ocml_rint_f32(float); +__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float); +__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float, + float); +__device__ __attribute__((const)) float __ocml_round_f32(float); +__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float); +__device__ __attribute__((const)) float __ocml_scalb_f32(float, float); +__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int); +__device__ __attribute__((const)) int __ocml_signbit_f32(float); +__device__ float __ocml_sincos_f32(float, + __attribute__((address_space(5))) float *); +__device__ float __ocml_sincospi_f32(float, + __attribute__((address_space(5))) float *); +__device__ float __ocml_sin_f32(float); +__device__ float __ocml_native_sin_f32(float); +__device__ __attribute__((pure)) float __ocml_sinh_f32(float); +__device__ float __ocml_sinpi_f32(float); +__device__ __attribute__((const)) float __ocml_sqrt_f32(float); +__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float); +__device__ float __ocml_tan_f32(float); +__device__ __attribute__((pure)) float __ocml_tanh_f32(float); +__device__ float __ocml_tgamma_f32(float); +__device__ __attribute__((const)) float __ocml_trunc_f32(float); +__device__ float __ocml_y0_f32(float); +__device__ float __ocml_y1_f32(float); + +// BEGIN INTRINSICS +__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float); +__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float); +__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float); +__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float); +__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float); +__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float); +__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float); +__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float); +__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float); +__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float); +__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float); +__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float); +__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float); +__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float); +__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float); +__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float); +__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float); +__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float); +__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float); +__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float); +__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float); +__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float); +__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float); +__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float); + +__device__ inline __attribute__((const)) float +__llvm_amdgcn_cos_f32(float __x) { + return __builtin_amdgcn_cosf(__x); +} +__device__ inline __attribute__((const)) float +__llvm_amdgcn_rcp_f32(float __x) { + return __builtin_amdgcn_rcpf(__x); +} +__device__ inline __attribute__((const)) float +__llvm_amdgcn_rsq_f32(float __x) { + return __builtin_amdgcn_rsqf(__x); +} +__device__ inline __attribute__((const)) float +__llvm_amdgcn_sin_f32(float __x) { + return __builtin_amdgcn_sinf(__x); +} +// END INTRINSICS +// END FLOAT + +// BEGIN DOUBLE +__device__ __attribute__((const)) double __ocml_acos_f64(double); +__device__ __attribute__((pure)) double __ocml_acosh_f64(double); +__device__ __attribute__((const)) double __ocml_asin_f64(double); +__device__ __attribute__((pure)) double __ocml_asinh_f64(double); +__device__ __attribute__((const)) double __ocml_atan2_f64(double, double); +__device__ __attribute__((const)) double __ocml_atan_f64(double); +__device__ __attribute__((pure)) double __ocml_atanh_f64(double); +__device__ __attribute__((pure)) double __ocml_cbrt_f64(double); +__device__ __attribute__((const)) double __ocml_ceil_f64(double); +__device__ __attribute__((const)) double __ocml_copysign_f64(double, double); +__device__ double __ocml_cos_f64(double); +__device__ __attribute__((pure)) double __ocml_cosh_f64(double); +__device__ double __ocml_cospi_f64(double); +__device__ double __ocml_i0_f64(double); +__device__ double __ocml_i1_f64(double); +__device__ __attribute__((pure)) double __ocml_erfc_f64(double); +__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double); +__device__ __attribute__((pure)) double __ocml_erfcx_f64(double); +__device__ __attribute__((pure)) double __ocml_erf_f64(double); +__device__ __attribute__((pure)) double __ocml_erfinv_f64(double); +__device__ __attribute__((pure)) double __ocml_exp10_f64(double); +__device__ __attribute__((pure)) double __ocml_exp2_f64(double); +__device__ __attribute__((pure)) double __ocml_exp_f64(double); +__device__ __attribute__((pure)) double __ocml_expm1_f64(double); +__device__ __attribute__((const)) double __ocml_fabs_f64(double); +__device__ __attribute__((const)) double __ocml_fdim_f64(double, double); +__device__ __attribute__((const)) double __ocml_floor_f64(double); +__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double); +__device__ __attribute__((const)) double __ocml_fmax_f64(double, double); +__device__ __attribute__((const)) double __ocml_fmin_f64(double, double); +__device__ __attribute__((const)) double __ocml_fmod_f64(double, double); +__device__ double __ocml_frexp_f64(double, + __attribute__((address_space(5))) int *); +__device__ __attribute__((const)) double __ocml_hypot_f64(double, double); +__device__ __attribute__((const)) int __ocml_ilogb_f64(double); +__device__ __attribute__((const)) int __ocml_isfinite_f64(double); +__device__ __attribute__((const)) int __ocml_isinf_f64(double); +__device__ __attribute__((const)) int __ocml_isnan_f64(double); +__device__ double __ocml_j0_f64(double); +__device__ double __ocml_j1_f64(double); +__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int); +__device__ double __ocml_lgamma_f64(double); +__device__ __attribute__((pure)) double __ocml_log10_f64(double); +__device__ __attribute__((pure)) double __ocml_log1p_f64(double); +__device__ __attribute__((pure)) double __ocml_log2_f64(double); +__device__ __attribute__((const)) double __ocml_logb_f64(double); +__device__ __attribute__((pure)) double __ocml_log_f64(double); +__device__ double __ocml_modf_f64(double, + __attribute__((address_space(5))) double *); +__device__ __attribute__((const)) double __ocml_nearbyint_f64(double); +__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double); +__device__ __attribute__((const)) double __ocml_len3_f64(double, double, + double); +__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double, + double); +__device__ __attribute__((pure)) double __ocml_ncdf_f64(double); +__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double); +__device__ __attribute__((pure)) double __ocml_pow_f64(double, double); +__device__ __attribute__((pure)) double __ocml_pown_f64(double, int); +__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double); +__device__ __attribute__((const)) double __ocml_remainder_f64(double, double); +__device__ double __ocml_remquo_f64(double, double, + __attribute__((address_space(5))) int *); +__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double); +__device__ __attribute__((const)) double __ocml_rint_f64(double); +__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double, + double); +__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double, + double, double); +__device__ __attribute__((const)) double __ocml_round_f64(double); +__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double); +__device__ __attribute__((const)) double __ocml_scalb_f64(double, double); +__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int); +__device__ __attribute__((const)) int __ocml_signbit_f64(double); +__device__ double __ocml_sincos_f64(double, + __attribute__((address_space(5))) double *); +__device__ double +__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *); +__device__ double __ocml_sin_f64(double); +__device__ __attribute__((pure)) double __ocml_sinh_f64(double); +__device__ double __ocml_sinpi_f64(double); +__device__ __attribute__((const)) double __ocml_sqrt_f64(double); +__device__ double __ocml_tan_f64(double); +__device__ __attribute__((pure)) double __ocml_tanh_f64(double); +__device__ double __ocml_tgamma_f64(double); +__device__ __attribute__((const)) double __ocml_trunc_f64(double); +__device__ double __ocml_y0_f64(double); +__device__ double __ocml_y1_f64(double); + +// BEGIN INTRINSICS +__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double); +__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double); +__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double); +__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double); +__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double); +__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double); +__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double); +__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double); +__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double); +__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double); +__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double); +__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double); +__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double); +__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double); +__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double); +__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double); +__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double); +__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double); +__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double); +__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double); +__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double, + double); +__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double, + double); +__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double, + double); +__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double, + double); + +__device__ inline __attribute__((const)) double +__llvm_amdgcn_rcp_f64(double __x) { + return __builtin_amdgcn_rcp(__x); +} +__device__ inline __attribute__((const)) double +__llvm_amdgcn_rsq_f64(double __x) { + return __builtin_amdgcn_rsq(__x); +} + +__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16); +__device__ _Float16 __ocml_cos_f16(_Float16); +__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16); +__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16); +__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16); +__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16); +__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16, + _Float16); +__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16); +__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16); +__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16); +__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16); +__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16); +__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16); +__device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16); +__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16); +__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16); +__device__ _Float16 __ocml_sin_f16(_Float16); +__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16); +__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16); +__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int); + +typedef _Float16 __2f16 __attribute__((ext_vector_type(2))); +typedef short __2i16 __attribute__((ext_vector_type(2))); + +__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, + float c, bool s); +__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16); +__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16); +__device__ __2f16 __ocml_cos_2f16(__2f16); +__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16); +__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16); +__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16); +__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16); +__device__ __attribute__((const)) +__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16); +__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16); +__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16); +__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16); +__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16); +__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); +__device__ inline __2f16 +__llvm_amdgcn_rcp_2f16(__2f16 __x) // Not currently exposed by ROCDL. +{ + return (__2f16)(__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)); +} +__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16); +__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16); +__device__ __2f16 __ocml_sin_2f16(__2f16); +__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); +__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); +__device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__ diff --git a/openmp/libomptarget/DeviceLib/include/Headers/__clang_hip_math.h b/openmp/libomptarget/DeviceLib/include/Headers/__clang_hip_math.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceLib/include/Headers/__clang_hip_math.h @@ -0,0 +1,1302 @@ +/*===---- __clang_hip_math.h - Device-side HIP math support ----------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __CLANG_HIP_MATH_H__ +#define __CLANG_HIP_MATH_H__ + +#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__) +#error "This file is for HIP and OpenMP AMDGCN device compilation only." +#endif + +#include +#include + +#pragma push_macro("__DEVICE__") + +#ifdef __OPENMP_AMDGCN__ +#define __DEVICE__ static inline __attribute__((always_inline, nothrow)) +#else +#define __DEVICE__ static __device__ inline __attribute__((always_inline)) +#endif + +// A few functions return bool type starting only in C++11. +#pragma push_macro("__RETURN_TYPE") +#ifdef __OPENMP_AMDGCN__ +#define __RETURN_TYPE int +#else +#if defined(__cplusplus) +#define __RETURN_TYPE bool +#else +#define __RETURN_TYPE int +#endif +#endif // __OPENMP_AMDGCN__ + +#if defined(__cplusplus) && __cplusplus < 201103L +// emulate static_assert on type sizes +template struct __compare_result {}; +template <> struct __compare_result { + static const __device__ bool valid; +}; + +__DEVICE__ +void __suppress_unused_warning(bool b){}; +template +__DEVICE__ void __static_assert_equal_size() { + __suppress_unused_warning(__compare_result::valid); +} + +#define __static_assert_type_size_equal(A, B) __static_assert_equal_size() + +#else +#define __static_assert_type_size_equal(A, B) static_assert((A) == (B), "") + +#endif + +extern "C" { + +__DEVICE__ +uint64_t __make_mantissa_base8(const char *__tagp) { + uint64_t __r = 0; + while (__tagp) { + char __tmp = *__tagp; + + if (__tmp >= '0' && __tmp <= '7') + __r = (__r * 8u) + __tmp - '0'; + else + return 0; + + ++__tagp; + } + + return __r; +} + +__DEVICE__ +uint64_t __make_mantissa_base10(const char *__tagp) { + uint64_t __r = 0; + while (__tagp) { + char __tmp = *__tagp; + + if (__tmp >= '0' && __tmp <= '9') + __r = (__r * 10u) + __tmp - '0'; + else + return 0; + + ++__tagp; + } + + return __r; +} + +__DEVICE__ +uint64_t __make_mantissa_base16(const char *__tagp) { + uint64_t __r = 0; + while (__tagp) { + char __tmp = *__tagp; + + if (__tmp >= '0' && __tmp <= '9') + __r = (__r * 16u) + __tmp - '0'; + else if (__tmp >= 'a' && __tmp <= 'f') + __r = (__r * 16u) + __tmp - 'a' + 10; + else if (__tmp >= 'A' && __tmp <= 'F') + __r = (__r * 16u) + __tmp - 'A' + 10; + else + return 0; + + ++__tagp; + } + + return __r; +} + +__DEVICE__ +uint64_t __make_mantissa(const char *__tagp) { + if (!__tagp) + return 0u; + + if (*__tagp == '0') { + ++__tagp; + + if (*__tagp == 'x' || *__tagp == 'X') + return __make_mantissa_base16(__tagp); + else + return __make_mantissa_base8(__tagp); + } + + return __make_mantissa_base10(__tagp); +} + +// BEGIN FLOAT +#if defined(__cplusplus) +__DEVICE__ +int abs(int __x) { + int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1); + return (__x ^ __sgn) - __sgn; +} +__DEVICE__ +long labs(long __x) { + long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1); + return (__x ^ __sgn) - __sgn; +} +__DEVICE__ +long long llabs(long long __x) { + long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1); + return (__x ^ __sgn) - __sgn; +} +#endif + +__DEVICE__ +float acosf(float __x) { return __ocml_acos_f32(__x); } + +__DEVICE__ +float acoshf(float __x) { return __ocml_acosh_f32(__x); } + +__DEVICE__ +float asinf(float __x) { return __ocml_asin_f32(__x); } + +__DEVICE__ +float asinhf(float __x) { return __ocml_asinh_f32(__x); } + +__DEVICE__ +float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } + +__DEVICE__ +float atanf(float __x) { return __ocml_atan_f32(__x); } + +__DEVICE__ +float atanhf(float __x) { return __ocml_atanh_f32(__x); } + +__DEVICE__ +float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } + +__DEVICE__ +float ceilf(float __x) { return __ocml_ceil_f32(__x); } + +__DEVICE__ +float copysignf(float __x, float __y) { return __ocml_copysign_f32(__x, __y); } + +__DEVICE__ +float cosf(float __x) { return __ocml_cos_f32(__x); } + +__DEVICE__ +float coshf(float __x) { return __ocml_cosh_f32(__x); } + +__DEVICE__ +float cospif(float __x) { return __ocml_cospi_f32(__x); } + +__DEVICE__ +float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } + +__DEVICE__ +float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } + +__DEVICE__ +float erfcf(float __x) { return __ocml_erfc_f32(__x); } + +__DEVICE__ +float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } + +__DEVICE__ +float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } + +__DEVICE__ +float erff(float __x) { return __ocml_erf_f32(__x); } + +__DEVICE__ +float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } + +__DEVICE__ +float exp10f(float __x) { return __ocml_exp10_f32(__x); } + +__DEVICE__ +float exp2f(float __x) { return __ocml_exp2_f32(__x); } + +__DEVICE__ +float expf(float __x) { return __ocml_exp_f32(__x); } + +__DEVICE__ +float expm1f(float __x) { return __ocml_expm1_f32(__x); } + +__DEVICE__ +float fabsf(float __x) { return __ocml_fabs_f32(__x); } + +__DEVICE__ +float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } + +__DEVICE__ +float fdividef(float __x, float __y) { return __x / __y; } + +__DEVICE__ +float floorf(float __x) { return __ocml_floor_f32(__x); } + +__DEVICE__ +float fmaf(float __x, float __y, float __z) { + return __ocml_fma_f32(__x, __y, __z); +} + +__DEVICE__ +float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); } + +__DEVICE__ +float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); } + +__DEVICE__ +float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } + +__DEVICE__ +float frexpf(float __x, int *__nptr) { + int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif + float __r = + __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp); + *__nptr = __tmp; + + return __r; +} + +__DEVICE__ +float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } + +__DEVICE__ +int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } + +__DEVICE__ +__RETURN_TYPE __finitef(float __x) { return __ocml_isfinite_f32(__x); } + +__DEVICE__ +__RETURN_TYPE __isinff(float __x) { return __ocml_isinf_f32(__x); } + +__DEVICE__ +__RETURN_TYPE __isnanf(float __x) { return __ocml_isnan_f32(__x); } + +__DEVICE__ +float j0f(float __x) { return __ocml_j0_f32(__x); } + +__DEVICE__ +float j1f(float __x) { return __ocml_j1_f32(__x); } + +__DEVICE__ +float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication + // and the Miller & Brown algorithm + // for linear recurrences to get O(log n) steps, but it's unclear if + // it'd be beneficial in this case. + if (__n == 0) + return j0f(__x); + if (__n == 1) + return j1f(__x); + + float __x0 = j0f(__x); + float __x1 = j1f(__x); + for (int __i = 1; __i < __n; ++__i) { + float __x2 = (2 * __i) / __x * __x1 - __x0; + __x0 = __x1; + __x1 = __x2; + } + + return __x1; +} + +__DEVICE__ +float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); } + +__DEVICE__ +float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } + +__DEVICE__ +long long int llrintf(float __x) { return __ocml_rint_f32(__x); } + +__DEVICE__ +long long int llroundf(float __x) { return __ocml_round_f32(__x); } + +__DEVICE__ +float log10f(float __x) { return __ocml_log10_f32(__x); } + +__DEVICE__ +float log1pf(float __x) { return __ocml_log1p_f32(__x); } + +__DEVICE__ +float log2f(float __x) { return __ocml_log2_f32(__x); } + +__DEVICE__ +float logbf(float __x) { return __ocml_logb_f32(__x); } + +__DEVICE__ +float logf(float __x) { return __ocml_log_f32(__x); } + +__DEVICE__ +long int lrintf(float __x) { return __ocml_rint_f32(__x); } + +__DEVICE__ +long int lroundf(float __x) { return __ocml_round_f32(__x); } + +__DEVICE__ +float modff(float __x, float *__iptr) { + float __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif + float __r = + __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); + *__iptr = __tmp; + return __r; +} + +__DEVICE__ +float nanf(const char *__tagp) { + union { + float val; + struct ieee_float { + unsigned int mantissa : 22; + unsigned int quiet : 1; + unsigned int exponent : 8; + unsigned int sign : 1; + } bits; + } __tmp; + __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); + + __tmp.bits.sign = 0u; + __tmp.bits.exponent = ~0u; + __tmp.bits.quiet = 1u; + __tmp.bits.mantissa = __make_mantissa(__tagp); + + return __tmp.val; +} + +__DEVICE__ +float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); } + +__DEVICE__ +float nextafterf(float __x, float __y) { + return __ocml_nextafter_f32(__x, __y); +} + +__DEVICE__ +float norm3df(float __x, float __y, float __z) { + return __ocml_len3_f32(__x, __y, __z); +} + +__DEVICE__ +float norm4df(float __x, float __y, float __z, float __w) { + return __ocml_len4_f32(__x, __y, __z, __w); +} + +__DEVICE__ +float normcdff(float __x) { return __ocml_ncdf_f32(__x); } + +__DEVICE__ +float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } + +__DEVICE__ +float normf(int __dim, + const float *__a) { // TODO: placeholder until OCML adds support. + float __r = 0; + while (__dim--) { + __r += __a[0] * __a[0]; + ++__a; + } + + return __ocml_sqrt_f32(__r); +} + +__DEVICE__ +float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } + +__DEVICE__ +float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); } + +__DEVICE__ +float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } + +__DEVICE__ +float remainderf(float __x, float __y) { + return __ocml_remainder_f32(__x, __y); +} + +__DEVICE__ +float remquof(float __x, float __y, int *__quo) { + int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif + float __r = __ocml_remquo_f32( + __x, __y, (__attribute__((address_space(5))) int *)&__tmp); + *__quo = __tmp; + + return __r; +} + +__DEVICE__ +float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); } + +__DEVICE__ +float rintf(float __x) { return __ocml_rint_f32(__x); } + +__DEVICE__ +float rnorm3df(float __x, float __y, float __z) { + return __ocml_rlen3_f32(__x, __y, __z); +} + +__DEVICE__ +float rnorm4df(float __x, float __y, float __z, float __w) { + return __ocml_rlen4_f32(__x, __y, __z, __w); +} + +__DEVICE__ +float rnormf(int __dim, + const float *__a) { // TODO: placeholder until OCML adds support. + float __r = 0; + while (__dim--) { + __r += __a[0] * __a[0]; + ++__a; + } + + return __ocml_rsqrt_f32(__r); +} + +__DEVICE__ +float roundf(float __x) { return __ocml_round_f32(__x); } + +__DEVICE__ +float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } + +__DEVICE__ +float scalblnf(float __x, long int __n) { + return (__n < INT_MAX) ? __ocml_scalbn_f32(__x, __n) + : __ocml_scalb_f32(__x, __n); +} + +__DEVICE__ +float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); } + +__DEVICE__ +__RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); } + +__DEVICE__ +void sincosf(float __x, float *__sinptr, float *__cosptr) { + float __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif + *__sinptr = + __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); + *__cosptr = __tmp; +} + +__DEVICE__ +void sincospif(float __x, float *__sinptr, float *__cosptr) { + float __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif + *__sinptr = __ocml_sincospi_f32( + __x, (__attribute__((address_space(5))) float *)&__tmp); + *__cosptr = __tmp; +} + +__DEVICE__ +float sinf(float __x) { return __ocml_sin_f32(__x); } + +__DEVICE__ +float sinhf(float __x) { return __ocml_sinh_f32(__x); } + +__DEVICE__ +float sinpif(float __x) { return __ocml_sinpi_f32(__x); } + +__DEVICE__ +float sqrtf(float __x) { return __ocml_sqrt_f32(__x); } + +__DEVICE__ +float tanf(float __x) { return __ocml_tan_f32(__x); } + +__DEVICE__ +float tanhf(float __x) { return __ocml_tanh_f32(__x); } + +__DEVICE__ +float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } + +__DEVICE__ +float truncf(float __x) { return __ocml_trunc_f32(__x); } + +__DEVICE__ +float y0f(float __x) { return __ocml_y0_f32(__x); } + +__DEVICE__ +float y1f(float __x) { return __ocml_y1_f32(__x); } + +__DEVICE__ +float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication + // and the Miller & Brown algorithm + // for linear recurrences to get O(log n) steps, but it's unclear if + // it'd be beneficial in this case. Placeholder until OCML adds + // support. + if (__n == 0) + return y0f(__x); + if (__n == 1) + return y1f(__x); + + float __x0 = y0f(__x); + float __x1 = y1f(__x); + for (int __i = 1; __i < __n; ++__i) { + float __x2 = (2 * __i) / __x * __x1 - __x0; + __x0 = __x1; + __x1 = __x2; + } + + return __x1; +} +} + +// BEGIN INTRINSICS + +__DEVICE__ +float __cosf(float __x) { return __ocml_native_cos_f32(__x); } + +__DEVICE__ +float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); } + +__DEVICE__ +float __expf(float __x) { return __ocml_native_exp_f32(__x); } + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); } +__DEVICE__ +float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); } +__DEVICE__ +float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); } +__DEVICE__ +float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fadd_rn(float __x, float __y) { return __x + __y; } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); } +__DEVICE__ +float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); } +__DEVICE__ +float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); } +__DEVICE__ +float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fdiv_rn(float __x, float __y) { return __x / __y; } +#endif + +__DEVICE__ +float __fdividef(float __x, float __y) { return __x / __y; } + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +float __fmaf_rd(float __x, float __y, float __z) { + return __ocml_fma_rtn_f32(__x, __y, __z); +} +__DEVICE__ +float __fmaf_rn(float __x, float __y, float __z) { + return __ocml_fma_rte_f32(__x, __y, __z); +} +__DEVICE__ +float __fmaf_ru(float __x, float __y, float __z) { + return __ocml_fma_rtp_f32(__x, __y, __z); +} +__DEVICE__ +float __fmaf_rz(float __x, float __y, float __z) { + return __ocml_fma_rtz_f32(__x, __y, __z); +} +#else +__DEVICE__ +float __fmaf_rn(float __x, float __y, float __z) { + return __ocml_fma_f32(__x, __y, __z); +} +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); } +__DEVICE__ +float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); } +__DEVICE__ +float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); } +__DEVICE__ +float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fmul_rn(float __x, float __y) { return __x * __y; } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); } +__DEVICE__ +float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); } +__DEVICE__ +float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); } +__DEVICE__ +float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); } +#else +__DEVICE__ +float __frcp_rn(float __x) { return 1.0f / __x; } +#endif + +__DEVICE__ +float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); } + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); } +__DEVICE__ +float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); } +__DEVICE__ +float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); } +__DEVICE__ +float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); } +#else +__DEVICE__ +float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); } +__DEVICE__ +float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); } +__DEVICE__ +float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); } +__DEVICE__ +float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fsub_rn(float __x, float __y) { return __x - __y; } +#endif + +__DEVICE__ +float __log10f(float __x) { return __ocml_native_log10_f32(__x); } + +__DEVICE__ +float __log2f(float __x) { return __ocml_native_log2_f32(__x); } + +__DEVICE__ +float __logf(float __x) { return __ocml_native_log_f32(__x); } + +__DEVICE__ +float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } + +__DEVICE__ +float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); } + +__DEVICE__ +void __sincosf(float __x, float *__sinptr, float *__cosptr) { + *__sinptr = __ocml_native_sin_f32(__x); + *__cosptr = __ocml_native_cos_f32(__x); +} + +__DEVICE__ +float __sinf(float __x) { return __ocml_native_sin_f32(__x); } + +__DEVICE__ +float __tanf(float __x) { return __ocml_tan_f32(__x); } +// END INTRINSICS +// END FLOAT + +// BEGIN DOUBLE +__DEVICE__ +double acos(double __x) { return __ocml_acos_f64(__x); } + +__DEVICE__ +double acosh(double __x) { return __ocml_acosh_f64(__x); } + +__DEVICE__ +double asin(double __x) { return __ocml_asin_f64(__x); } + +__DEVICE__ +double asinh(double __x) { return __ocml_asinh_f64(__x); } + +__DEVICE__ +double atan(double __x) { return __ocml_atan_f64(__x); } + +__DEVICE__ +double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); } + +__DEVICE__ +double atanh(double __x) { return __ocml_atanh_f64(__x); } + +__DEVICE__ +double cbrt(double __x) { return __ocml_cbrt_f64(__x); } + +__DEVICE__ +double ceil(double __x) { return __ocml_ceil_f64(__x); } + +__DEVICE__ +double copysign(double __x, double __y) { + return __ocml_copysign_f64(__x, __y); +} + +__DEVICE__ +double cos(double __x) { return __ocml_cos_f64(__x); } + +__DEVICE__ +double cosh(double __x) { return __ocml_cosh_f64(__x); } + +__DEVICE__ +double cospi(double __x) { return __ocml_cospi_f64(__x); } + +__DEVICE__ +double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } + +__DEVICE__ +double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } + +__DEVICE__ +double erf(double __x) { return __ocml_erf_f64(__x); } + +__DEVICE__ +double erfc(double __x) { return __ocml_erfc_f64(__x); } + +__DEVICE__ +double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } + +__DEVICE__ +double erfcx(double __x) { return __ocml_erfcx_f64(__x); } + +__DEVICE__ +double erfinv(double __x) { return __ocml_erfinv_f64(__x); } + +__DEVICE__ +double exp(double __x) { return __ocml_exp_f64(__x); } + +__DEVICE__ +double exp10(double __x) { return __ocml_exp10_f64(__x); } + +__DEVICE__ +double exp2(double __x) { return __ocml_exp2_f64(__x); } + +__DEVICE__ +double expm1(double __x) { return __ocml_expm1_f64(__x); } + +__DEVICE__ +double fabs(double __x) { return __ocml_fabs_f64(__x); } + +__DEVICE__ +double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } + +__DEVICE__ +double floor(double __x) { return __ocml_floor_f64(__x); } + +__DEVICE__ +double fma(double __x, double __y, double __z) { + return __ocml_fma_f64(__x, __y, __z); +} + +__DEVICE__ +double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); } + +__DEVICE__ +double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); } + +__DEVICE__ +double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } + +__DEVICE__ +double frexp(double __x, int *__nptr) { + int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif + double __r = + __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp); + *__nptr = __tmp; + return __r; +} + +__DEVICE__ +double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); } + +__DEVICE__ +int ilogb(double __x) { return __ocml_ilogb_f64(__x); } + +__DEVICE__ +__RETURN_TYPE __finite(double __x) { return __ocml_isfinite_f64(__x); } + +__DEVICE__ +__RETURN_TYPE __isinf(double __x) { return __ocml_isinf_f64(__x); } + +__DEVICE__ +__RETURN_TYPE __isnan(double __x) { return __ocml_isnan_f64(__x); } + +__DEVICE__ +double j0(double __x) { return __ocml_j0_f64(__x); } + +__DEVICE__ +double j1(double __x) { return __ocml_j1_f64(__x); } + +__DEVICE__ +double jn(int __n, double __x) { // TODO: we could use Ahmes multiplication + // and the Miller & Brown algorithm + // for linear recurrences to get O(log n) steps, but it's unclear if + // it'd be beneficial in this case. Placeholder until OCML adds + // support. + if (__n == 0) + return j0(__x); + if (__n == 1) + return j1(__x); + + double __x0 = j0(__x); + double __x1 = j1(__x); + for (int __i = 1; __i < __n; ++__i) { + double __x2 = (2 * __i) / __x * __x1 - __x0; + __x0 = __x1; + __x1 = __x2; + } + return __x1; +} + +__DEVICE__ +double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); } + +__DEVICE__ +double lgamma(double __x) { return __ocml_lgamma_f64(__x); } + +__DEVICE__ +long long int llrint(double __x) { return __ocml_rint_f64(__x); } + +__DEVICE__ +long long int llround(double __x) { return __ocml_round_f64(__x); } + +__DEVICE__ +double log(double __x) { return __ocml_log_f64(__x); } + +__DEVICE__ +double log10(double __x) { return __ocml_log10_f64(__x); } + +__DEVICE__ +double log1p(double __x) { return __ocml_log1p_f64(__x); } + +__DEVICE__ +double log2(double __x) { return __ocml_log2_f64(__x); } + +__DEVICE__ +double logb(double __x) { return __ocml_logb_f64(__x); } + +__DEVICE__ +long int lrint(double __x) { return __ocml_rint_f64(__x); } + +__DEVICE__ +long int lround(double __x) { return __ocml_round_f64(__x); } + +__DEVICE__ +double modf(double __x, double *__iptr) { + double __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif + double __r = + __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); + *__iptr = __tmp; + + return __r; +} + +__DEVICE__ +double nan(const char *__tagp) { +#if !_WIN32 + union { + double val; + struct ieee_double { + uint64_t mantissa : 51; + uint32_t quiet : 1; + uint32_t exponent : 11; + uint32_t sign : 1; + } bits; + } __tmp; + __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); + + __tmp.bits.sign = 0u; + __tmp.bits.exponent = ~0u; + __tmp.bits.quiet = 1u; + __tmp.bits.mantissa = __make_mantissa(__tagp); + + return __tmp.val; +#else + __static_assert_type_size_equal(sizeof(uint64_t), sizeof(double)); + uint64_t __val = __make_mantissa(__tagp); + __val |= 0xFFF << 51; + return *reinterpret_cast(&__val); +#endif +} + +__DEVICE__ +double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); } + +__DEVICE__ +double nextafter(double __x, double __y) { + return __ocml_nextafter_f64(__x, __y); +} + +__DEVICE__ +double norm(int __dim, + const double *__a) { // TODO: placeholder until OCML adds support. + double __r = 0; + while (__dim--) { + __r += __a[0] * __a[0]; + ++__a; + } + + return __ocml_sqrt_f64(__r); +} + +__DEVICE__ +double norm3d(double __x, double __y, double __z) { + return __ocml_len3_f64(__x, __y, __z); +} + +__DEVICE__ +double norm4d(double __x, double __y, double __z, double __w) { + return __ocml_len4_f64(__x, __y, __z, __w); +} + +__DEVICE__ +double normcdf(double __x) { return __ocml_ncdf_f64(__x); } + +__DEVICE__ +double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } + +__DEVICE__ +double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } + +__DEVICE__ +double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); } + +__DEVICE__ +double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } + +__DEVICE__ +double remainder(double __x, double __y) { + return __ocml_remainder_f64(__x, __y); +} + +__DEVICE__ +double remquo(double __x, double __y, int *__quo) { + int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif + double __r = __ocml_remquo_f64( + __x, __y, (__attribute__((address_space(5))) int *)&__tmp); + *__quo = __tmp; + + return __r; +} + +__DEVICE__ +double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); } + +__DEVICE__ +double rint(double __x) { return __ocml_rint_f64(__x); } + +__DEVICE__ +double rnorm(int __dim, + const double *__a) { // TODO: placeholder until OCML adds support. + double __r = 0; + while (__dim--) { + __r += __a[0] * __a[0]; + ++__a; + } + + return __ocml_rsqrt_f64(__r); +} + +__DEVICE__ +double rnorm3d(double __x, double __y, double __z) { + return __ocml_rlen3_f64(__x, __y, __z); +} + +__DEVICE__ +double rnorm4d(double __x, double __y, double __z, double __w) { + return __ocml_rlen4_f64(__x, __y, __z, __w); +} + +__DEVICE__ +double round(double __x) { return __ocml_round_f64(__x); } + +__DEVICE__ +double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } + +__DEVICE__ +double scalbln(double __x, long int __n) { + return (__n < INT_MAX) ? __ocml_scalbn_f64(__x, __n) + : __ocml_scalb_f64(__x, __n); +} +__DEVICE__ +double scalbn(double __x, int __n) { return __ocml_scalbn_f64(__x, __n); } + +__DEVICE__ +__RETURN_TYPE __signbit(double __x) { return __ocml_signbit_f64(__x); } + +__DEVICE__ +double sin(double __x) { return __ocml_sin_f64(__x); } + +__DEVICE__ +void sincos(double __x, double *__sinptr, double *__cosptr) { + double __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif + *__sinptr = __ocml_sincos_f64( + __x, (__attribute__((address_space(5))) double *)&__tmp); + *__cosptr = __tmp; +} + +__DEVICE__ +void sincospi(double __x, double *__sinptr, double *__cosptr) { + double __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif + *__sinptr = __ocml_sincospi_f64( + __x, (__attribute__((address_space(5))) double *)&__tmp); + *__cosptr = __tmp; +} + +__DEVICE__ +double sinh(double __x) { return __ocml_sinh_f64(__x); } + +__DEVICE__ +double sinpi(double __x) { return __ocml_sinpi_f64(__x); } + +__DEVICE__ +double sqrt(double __x) { return __ocml_sqrt_f64(__x); } + +__DEVICE__ +double tan(double __x) { return __ocml_tan_f64(__x); } + +__DEVICE__ +double tanh(double __x) { return __ocml_tanh_f64(__x); } + +__DEVICE__ +double tgamma(double __x) { return __ocml_tgamma_f64(__x); } + +__DEVICE__ +double trunc(double __x) { return __ocml_trunc_f64(__x); } + +__DEVICE__ +double y0(double __x) { return __ocml_y0_f64(__x); } + +__DEVICE__ +double y1(double __x) { return __ocml_y1_f64(__x); } + +__DEVICE__ +double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication + // and the Miller & Brown algorithm + // for linear recurrences to get O(log n) steps, but it's unclear if + // it'd be beneficial in this case. Placeholder until OCML adds + // support. + if (__n == 0) + return y0(__x); + if (__n == 1) + return y1(__x); + + double __x0 = y0(__x); + double __x1 = y1(__x); + for (int __i = 1; __i < __n; ++__i) { + double __x2 = (2 * __i) / __x * __x1 - __x0; + __x0 = __x1; + __x1 = __x2; + } + + return __x1; +} + +// BEGIN INTRINSICS +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +double __dadd_rd(double __x, double __y) { + return __ocml_add_rtn_f64(__x, __y); +} +__DEVICE__ +double __dadd_rn(double __x, double __y) { + return __ocml_add_rte_f64(__x, __y); +} +__DEVICE__ +double __dadd_ru(double __x, double __y) { + return __ocml_add_rtp_f64(__x, __y); +} +__DEVICE__ +double __dadd_rz(double __x, double __y) { + return __ocml_add_rtz_f64(__x, __y); +} +#else +__DEVICE__ +double __dadd_rn(double __x, double __y) { return __x + __y; } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +double __ddiv_rd(double __x, double __y) { + return __ocml_div_rtn_f64(__x, __y); +} +__DEVICE__ +double __ddiv_rn(double __x, double __y) { + return __ocml_div_rte_f64(__x, __y); +} +__DEVICE__ +double __ddiv_ru(double __x, double __y) { + return __ocml_div_rtp_f64(__x, __y); +} +__DEVICE__ +double __ddiv_rz(double __x, double __y) { + return __ocml_div_rtz_f64(__x, __y); +} +#else +__DEVICE__ +double __ddiv_rn(double __x, double __y) { return __x / __y; } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +double __dmul_rd(double __x, double __y) { + return __ocml_mul_rtn_f64(__x, __y); +} +__DEVICE__ +double __dmul_rn(double __x, double __y) { + return __ocml_mul_rte_f64(__x, __y); +} +__DEVICE__ +double __dmul_ru(double __x, double __y) { + return __ocml_mul_rtp_f64(__x, __y); +} +__DEVICE__ +double __dmul_rz(double __x, double __y) { + return __ocml_mul_rtz_f64(__x, __y); +} +#else +__DEVICE__ +double __dmul_rn(double __x, double __y) { return __x * __y; } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); } +__DEVICE__ +double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); } +__DEVICE__ +double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); } +__DEVICE__ +double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); } +#else +__DEVICE__ +double __drcp_rn(double __x) { return 1.0 / __x; } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); } +__DEVICE__ +double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); } +__DEVICE__ +double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); } +__DEVICE__ +double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); } +#else +__DEVICE__ +double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +double __dsub_rd(double __x, double __y) { + return __ocml_sub_rtn_f64(__x, __y); +} +__DEVICE__ +double __dsub_rn(double __x, double __y) { + return __ocml_sub_rte_f64(__x, __y); +} +__DEVICE__ +double __dsub_ru(double __x, double __y) { + return __ocml_sub_rtp_f64(__x, __y); +} +__DEVICE__ +double __dsub_rz(double __x, double __y) { + return __ocml_sub_rtz_f64(__x, __y); +} +#else +__DEVICE__ +double __dsub_rn(double __x, double __y) { return __x - __y; } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +double __fma_rd(double __x, double __y, double __z) { + return __ocml_fma_rtn_f64(__x, __y, __z); +} +__DEVICE__ +double __fma_rn(double __x, double __y, double __z) { + return __ocml_fma_rte_f64(__x, __y, __z); +} +__DEVICE__ +double __fma_ru(double __x, double __y, double __z) { + return __ocml_fma_rtp_f64(__x, __y, __z); +} +__DEVICE__ +double __fma_rz(double __x, double __y, double __z) { + return __ocml_fma_rtz_f64(__x, __y, __z); +} +#else +__DEVICE__ +double __fma_rn(double __x, double __y, double __z) { + return __ocml_fma_f64(__x, __y, __z); +} +#endif +// END INTRINSICS +// END DOUBLE + +// C only macros +#if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L +#define isfinite(__x) _Generic((__x), float : __finitef, double : __finite)(__x) +#define isinf(__x) _Generic((__x), float : __isinff, double : __isinf)(__x) +#define isnan(__x) _Generic((__x), float : __isnanf, double : __isnan)(__x) +#define signbit(__x) \ + _Generic((__x), float : __signbitf, double : __signbit)(__x) +#endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L + +#if defined(__cplusplus) +template __DEVICE__ T min(T __arg1, T __arg2) { + return (__arg1 < __arg2) ? __arg1 : __arg2; +} + +template __DEVICE__ T max(T __arg1, T __arg2) { + return (__arg1 > __arg2) ? __arg1 : __arg2; +} +#endif + +__DEVICE__ int min(int __arg1, int __arg2) { + return (__arg1 < __arg2) ? __arg1 : __arg2; +} +__DEVICE__ int max(int __arg1, int __arg2) { + return (__arg1 > __arg2) ? __arg1 : __arg2; +} + +__DEVICE__ +float max(float __x, float __y) { return fmaxf(__x, __y); } + +__DEVICE__ +double max(double __x, double __y) { return fmax(__x, __y); } + +__DEVICE__ +float min(float __x, float __y) { return fminf(__x, __y); } + +__DEVICE__ +double min(double __x, double __y) { return fmin(__x, __y); } + +#pragma pop_macro("__DEVICE__") +#pragma pop_macro("__RETURN_TYPE") + +#endif // __CLANG_HIP_MATH_H__ diff --git a/openmp/libomptarget/DeviceLib/include/OpenMPMath.h b/openmp/libomptarget/DeviceLib/include/OpenMPMath.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceLib/include/OpenMPMath.h @@ -0,0 +1,208 @@ +//===---------- OpenMPMath.h - Prototypes of __device__ math fns -----------=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== + +#ifndef __OPENMP_MATH_H__ +#define __OPENMP_MATH_H__ + +#if defined(__cplusplus) +extern "C" { +#endif + +// Forward declarations of the standard math functions we have a wrapper for. +int abs(int __a); +double fabs(double __a); +double acos(double __a); +float acosf(float __a); +double acosh(double __a); +float acoshf(float __a); +double asin(double __a); +float asinf(float __a); +double asinh(double __a); +float asinhf(float __a); +double atan(double __a); +double atan2(double __a, double __b); +float atan2f(float __a, float __b); +float atanf(float __a); +double atanh(double __a); +float atanhf(float __a); +double cbrt(double __a); +float cbrtf(float __a); +double ceil(double __a); +float ceilf(float __a); +double copysign(double __a, double __b); +float copysignf(float __a, float __b); +double cos(double __a); +float cosf(float __a); +double cosh(double __a); +float coshf(float __a); +double cospi(double __a); +float cospif(float __a); +double cyl_bessel_i0(double __a); +float cyl_bessel_i0f(float __a); +double cyl_bessel_i1(double __a); +float cyl_bessel_i1f(float __a); +double erf(double __a); +double erfc(double __a); +float erfcf(float __a); +double erfcinv(double __a); +float erfcinvf(float __a); +double erfcx(double __a); +float erfcxf(float __a); +float erff(float __a); +double erfinv(double __a); +float erfinvf(float __a); +double exp(double __a); +double exp10(double __a); +float exp10f(float __a); +double exp2(double __a); +float exp2f(float __a); +float expf(float __a); +double expm1(double __a); +float expm1f(float __a); +float fabsf(float __a); +double fdim(double __a, double __b); +float fdimf(float __a, float __b); +double fdivide(double __a, double __b); +float fdividef(float __a, float __b); +double floor(double __f); +float floorf(float __f); +double fma(double __a, double __b, double __c); +float fmaf(float __a, float __b, float __c); +double fmax(double __a, double __b); +float fmaxf(float __a, float __b); +double fmin(double __a, double __b); +float fminf(float __a, float __b); +double fmod(double __a, double __b); +float fmodf(float __a, float __b); +double frexp(double __a, int *__b); +float frexpf(float __a, int *__b); +double hypot(double __a, double __b); +float hypotf(float __a, float __b); +int ilogb(double __a); +int ilogbf(float __a); +double j0(double __a); +float j0f(float __a); +double j1(double __a); +float j1f(float __a); +double jn(int __n, double __a); +float jnf(int __n, float __a); +long labs(long __a); +double ldexp(double __a, int __b); +float ldexpf(float __a, int __b); +double lgamma(double __a); +float lgammaf(float __a); +long long llabs(long long __a); +long long llmax(long long __a, long long __b); +long long llmin(long long __a, long long __b); +long long llrint(double __a); +long long llrintf(float __a); +long long llround(double __a); +long long llroundf(float __a); +double round(double __a); +float roundf(float __a); +double log(double __a); +double log10(double __a); +float log10f(float __a); +double log1p(double __a); +float log1pf(float __a); +double log2(double __a); +float log2f(float __a); +double logb(double __a); +float logbf(float __a); +float logf(float __a); +#if defined(__LP64__) +long lrint(double __a); +long lrintf(float __a); +long lround(double __a); +long lroundf(float __a); +#else +long lrint(double __a); +long lrintf(float __a); +long lround(double __a); +long lroundf(float __a); +#endif +int max(int __a, int __b); +int min(int __a, int __b); +double modf(double __a, double *__b); +float modff(float __a, float *__b); +double nearbyint(double __a); +float nearbyintf(float __a); +double nextafter(double __a, double __b); +float nextafterf(float __a, float __b); +double norm(int __dim, const double *__t); +double norm3d(double __a, double __b, double __c); +float norm3df(float __a, float __b, float __c); +double norm4d(double __a, double __b, double __c, double __d); +float norm4df(float __a, float __b, float __c, float __d); +double normcdf(double __a); +float normcdff(float __a); +double normcdfinv(double __a); +float normcdfinvf(float __a); +float normf(int __dim, const float *__t); +double pow(double __a, double __b); +float powf(float __a, float __b); +double powi(double __a, int __b); +float powif(float __a, int __b); +double rcbrt(double __a); +float rcbrtf(float __a); +double remainder(double __a, double __b); +float remainderf(float __a, float __b); +double remquo(double __a, double __b, int *__c); +float remquof(float __a, float __b, int *__c); +double rhypot(double __a, double __b); +float rhypotf(float __a, float __b); +double rint(double __a); +float rintf(float __a); +double rnorm(int __a, const double *__b); +double rnorm3d(double __a, double __b, double __c); +float rnorm3df(float __a, float __b, float __c); +double rnorm4d(double __a, double __b, double __c, double __d); +float rnorm4df(float __a, float __b, float __c, float __d); +float rnormf(int __dim, const float *__t); +double rsqrt(double __a); +float rsqrtf(float __a); +double scalbn(double __a, int __b); +float scalbnf(float __a, int __b); +double scalbln(double __a, long __b); +float scalblnf(float __a, long __b); +double sin(double __a); +void sincos(double __a, double *__s, double *__c); +void sincosf(float __a, float *__s, float *__c); +void sincospi(double __a, double *__s, double *__c); +void sincospif(float __a, float *__s, float *__c); +float sinf(float __a); +double sinh(double __a); +float sinhf(float __a); +double sinpi(double __a); +float sinpif(float __a); +double sqrt(double __a); +float sqrtf(float __a); +double tan(double __a); +float tanf(float __a); +double tanh(double __a); +float tanhf(float __a); +double tgamma(double __a); +float tgammaf(float __a); +double trunc(double __a); +float truncf(float __a); +unsigned long long ullmax(unsigned long long __a, unsigned long long __b); +unsigned long long ullmin(unsigned long long __a, unsigned long long __b); +unsigned int umax(unsigned int __a, unsigned int __b); +unsigned int umin(unsigned int __a, unsigned int __b); +double y0(double __a); +float y0f(float __a); +double y1(double __a); +float y1f(float __a); +double yn(int __a, double __b); +float ynf(int __a, float __b); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/openmp/libomptarget/DeviceLib/src/DeviceLibm.cpp b/openmp/libomptarget/DeviceLib/src/DeviceLibm.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceLib/src/DeviceLibm.cpp @@ -0,0 +1,32 @@ +//===-------- OpenMPMath.cpp - Implementation of OpenMP math fns -----------=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== +#if defined(__CUDA__) +#define __OPENMP_NVPTX__ + +// Include declarations for libdevice functions. +#include +// Include the device functions. +#include + +extern "C" { +// Call libdevice functions from the standard math names. +#include +} + +#undef __OPENMP_NVPTX__ +#elif defined(__AMDGPU__) +#define __OPENMP_AMDGCN__ + +// Include declarations for libdevice functions. +#include + +// Call libdevice functions from the standard math names. +#include + +#undef __OPENMP_AMDGCN__ +#endif diff --git a/openmp/libomptarget/DeviceLib/src/OpenMPMath.cpp b/openmp/libomptarget/DeviceLib/src/OpenMPMath.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceLib/src/OpenMPMath.cpp @@ -0,0 +1,277 @@ +//===-------- OpenMPMath.cpp - Implementation of OpenMP math fns -----------=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== + +#include + +#define __DEVICE__ __attribute__((always_inline, nothrow)) + +extern "C" { + +__DEVICE__ int __omp_abs(int __a) { return abs(__a); } +__DEVICE__ double __omp_fabs(double __a) { return fabs(__a); } +__DEVICE__ double __omp_acos(double __a) { return acos(__a); } +__DEVICE__ float __omp_acosf(float __a) { return acosf(__a); } +__DEVICE__ double __omp_acosh(double __a) { return acosh(__a); } +__DEVICE__ float __omp_acoshf(float __a) { return acoshf(__a); } +__DEVICE__ double __omp_asin(double __a) { return asin(__a); } +__DEVICE__ float __omp_asinf(float __a) { return asinf(__a); } +__DEVICE__ double __omp_asinh(double __a) { return asinh(__a); } +__DEVICE__ float __omp_asinhf(float __a) { return asinhf(__a); } +__DEVICE__ double __omp_atan(double __a) { return atan(__a); } +__DEVICE__ double __omp_atan2(double __a, double __b) { + return atan2(__a, __b); +} +__DEVICE__ float __omp_atan2f(float __a, float __b) { return atan2f(__a, __b); } +__DEVICE__ float __omp_atanf(float __a) { return atanf(__a); } +__DEVICE__ double __omp_atanh(double __a) { return atanh(__a); } +__DEVICE__ float __omp_atanhf(float __a) { return atanhf(__a); } +__DEVICE__ double __omp_cbrt(double __a) { return cbrt(__a); } +__DEVICE__ float __omp_cbrtf(float __a) { return cbrtf(__a); } +__DEVICE__ double __omp_ceil(double __a) { return ceil(__a); } +__DEVICE__ float __omp_ceilf(float __a) { return ceilf(__a); } +__DEVICE__ double __omp_copysign(double __a, double __b) { + return copysign(__a, __b); +} +__DEVICE__ float __omp_copysignf(float __a, float __b) { + return copysignf(__a, __b); +} +__DEVICE__ double __omp_cos(double __a) { return cos(__a); } +__DEVICE__ float __omp_cosf(float __a) { return cosf(__a); } +__DEVICE__ double __omp_cosh(double __a) { return cosh(__a); } +__DEVICE__ float __omp_coshf(float __a) { return coshf(__a); } +__DEVICE__ double __omp_cospi(double __a) { return cospi(__a); } +__DEVICE__ float __omp_cospif(float __a) { return cospif(__a); } +__DEVICE__ double __omp_cyl_bessel_i0(double __a) { return cyl_bessel_i0(__a); } +__DEVICE__ float __omp_cyl_bessel_i0f(float __a) { return cyl_bessel_i0f(__a); } +__DEVICE__ double __omp_cyl_bessel_i1(double __a) { return cyl_bessel_i1(__a); } +__DEVICE__ float __omp_cyl_bessel_i1f(float __a) { return cyl_bessel_i1f(__a); } +__DEVICE__ double __omp_erf(double __a) { return erf(__a); } +__DEVICE__ double __omp_erfc(double __a) { return erfc(__a); } +__DEVICE__ float __omp_erfcf(float __a) { return erfcf(__a); } +__DEVICE__ double __omp_erfcinv(double __a) { return erfcinv(__a); } +__DEVICE__ float __omp_erfcinvf(float __a) { return erfcinvf(__a); } +__DEVICE__ double __omp_erfcx(double __a) { return erfcx(__a); } +__DEVICE__ float __omp_erfcxf(float __a) { return erfcxf(__a); } +__DEVICE__ float __omp_erff(float __a) { return erff(__a); } +__DEVICE__ double __omp_erfinv(double __a) { return erfinv(__a); } +__DEVICE__ float __omp_erfinvf(float __a) { return erfinvf(__a); } +__DEVICE__ double __omp_exp(double __a) { return exp(__a); } +__DEVICE__ double __omp_exp10(double __a) { return exp10(__a); } +__DEVICE__ float __omp_exp10f(float __a) { return exp10f(__a); } +__DEVICE__ double __omp_exp2(double __a) { return exp2(__a); } +__DEVICE__ float __omp_exp2f(float __a) { return exp2f(__a); } +__DEVICE__ float __omp_expf(float __a) { return expf(__a); } +__DEVICE__ double __omp_expm1(double __a) { return expm1(__a); } +__DEVICE__ float __omp_expm1f(float __a) { return expm1f(__a); } +__DEVICE__ float __omp_fabsf(float __a) { return fabsf(__a); } +__DEVICE__ double __omp_fdim(double __a, double __b) { return fdim(__a, __b); } +__DEVICE__ float __omp_fdimf(float __a, float __b) { return fdimf(__a, __b); } +__DEVICE__ double __omp_fdivide(double __a, double __b) { return __a / __b; } +__DEVICE__ float __omp_fdividef(float __a, float __b) { return __a / __b; } +__DEVICE__ double __omp_floor(double __f) { return floor(__f); } +__DEVICE__ float __omp_floorf(float __f) { return floorf(__f); } +__DEVICE__ double __omp_fma(double __a, double __b, double __c) { + return fma(__a, __b, __c); +} +__DEVICE__ float __omp_fmaf(float __a, float __b, float __c) { + return fmaf(__a, __b, __c); +} +__DEVICE__ double __omp_fmax(double __a, double __b) { return fmax(__a, __b); } +__DEVICE__ float __omp_fmaxf(float __a, float __b) { return fmaxf(__a, __b); } +__DEVICE__ double __omp_fmin(double __a, double __b) { return fmin(__a, __b); } +__DEVICE__ float __omp_fminf(float __a, float __b) { return fminf(__a, __b); } +__DEVICE__ double __omp_fmod(double __a, double __b) { return fmod(__a, __b); } +__DEVICE__ float __omp_fmodf(float __a, float __b) { return fmodf(__a, __b); } +__DEVICE__ double __omp_frexp(double __a, int *__b) { return frexp(__a, __b); } +__DEVICE__ float __omp_frexpf(float __a, int *__b) { return frexpf(__a, __b); } +__DEVICE__ double __omp_hypot(double __a, double __b) { + return hypot(__a, __b); +} +__DEVICE__ float __omp_hypotf(float __a, float __b) { return hypotf(__a, __b); } +__DEVICE__ int __omp_ilogb(double __a) { return ilogb(__a); } +__DEVICE__ int __omp_ilogbf(float __a) { return ilogbf(__a); } +__DEVICE__ double __omp_j0(double __a) { return j0(__a); } +__DEVICE__ float __omp_j0f(float __a) { return j0f(__a); } +__DEVICE__ double __omp_j1(double __a) { return j1(__a); } +__DEVICE__ float __omp_j1f(float __a) { return j1f(__a); } +__DEVICE__ double __omp_jn(int __n, double __a) { return jn(__n, __a); } +__DEVICE__ float __omp_jnf(int __n, float __a) { return jnf(__n, __a); } +#if defined(__LP64__) || defined(_WIN64) +__DEVICE__ long __omp_labs(long __a) { return llabs(__a); }; +#else +__DEVICE__ long __omp_labs(long __a) { return abs(__a); }; +#endif +__DEVICE__ double __omp_ldexp(double __a, int __b) { return ldexp(__a, __b); } +__DEVICE__ float __omp_ldexpf(float __a, int __b) { return ldexpf(__a, __b); } +__DEVICE__ double __omp_lgamma(double __a) { return lgamma(__a); } +__DEVICE__ float __omp_lgammaf(float __a) { return lgammaf(__a); } +__DEVICE__ long long __omp_llabs(long long __a) { return llabs(__a); } +__DEVICE__ long long __omp_llmax(long long __a, long long __b) { + return llmax(__a, __b); +} +__DEVICE__ long long __omp_llmin(long long __a, long long __b) { + return llmin(__a, __b); +} +__DEVICE__ long long __omp_llrint(double __a) { return llrint(__a); } +__DEVICE__ long long __omp_llrintf(float __a) { return llrintf(__a); } +__DEVICE__ long long __omp_llround(double __a) { return llround(__a); } +__DEVICE__ long long __omp_llroundf(float __a) { return llroundf(__a); } +__DEVICE__ double __omp_round(double __a) { return round(__a); } +__DEVICE__ float __omp_roundf(float __a) { return roundf(__a); } +__DEVICE__ double __omp_log(double __a) { return log(__a); } +__DEVICE__ double __omp_log10(double __a) { return log10(__a); } +__DEVICE__ float __omp_log10f(float __a) { return log10f(__a); } +__DEVICE__ double __omp_log1p(double __a) { return log1p(__a); } +__DEVICE__ float __omp_log1pf(float __a) { return log1pf(__a); } +__DEVICE__ double __omp_log2(double __a) { return log2(__a); } +__DEVICE__ float __omp_log2f(float __a) { return log2f(__a); } +__DEVICE__ double __omp_logb(double __a) { return logb(__a); } +__DEVICE__ float __omp_logbf(float __a) { return logbf(__a); } +__DEVICE__ float __omp_logf(float __a) { return logf(__a); } +__DEVICE__ long __omp_lrint(double __a) { return lrint(__a); } +__DEVICE__ long __omp_lrintf(float __a) { return lrintf(__a); } +__DEVICE__ long __omp_lround(double __a) { return lround(__a); } +__DEVICE__ long __omp_lroundf(float __a) { return lroundf(__a); } +__DEVICE__ int __omp_max(int __a, int __b) { return max(__a, __b); } +__DEVICE__ int __omp_min(int __a, int __b) { return min(__a, __b); } +__DEVICE__ double __omp_modf(double __a, double *__b) { return modf(__a, __b); } +__DEVICE__ float __omp_modff(float __a, float *__b) { return modff(__a, __b); } +__DEVICE__ double __omp_nearbyint(double __a) { return nearbyint(__a); } +__DEVICE__ float __omp_nearbyintf(float __a) { return nearbyintf(__a); } +__DEVICE__ double __omp_nextafter(double __a, double __b) { + return nextafter(__a, __b); +} +__DEVICE__ float __omp_nextafterf(float __a, float __b) { + return nextafterf(__a, __b); +} +__DEVICE__ double __omp_norm(int __dim, const double *__t) { + return norm(__dim, __t); +} +__DEVICE__ double __omp_norm3d(double __a, double __b, double __c) { + return norm3d(__a, __b, __c); +} +__DEVICE__ float __omp_norm3df(float __a, float __b, float __c) { + return norm3df(__a, __b, __c); +} +__DEVICE__ double __omp_norm4d(double __a, double __b, double __c, double __d) { + return norm4d(__a, __b, __c, __d); +} +__DEVICE__ float __omp_norm4df(float __a, float __b, float __c, float __d) { + return norm4df(__a, __b, __c, __d); +} +__DEVICE__ double __omp_normcdf(double __a) { return normcdf(__a); } +__DEVICE__ float __omp_normcdff(float __a) { return normcdff(__a); } +__DEVICE__ double __omp_normcdfinv(double __a) { return normcdfinv(__a); } +__DEVICE__ float __omp_normcdfinvf(float __a) { return normcdfinvf(__a); } +__DEVICE__ float __omp_normf(int __dim, const float *__t) { + return normf(__dim, __t); +} +__DEVICE__ double __omp_pow(double __a, double __b) { return pow(__a, __b); } +__DEVICE__ float __omp_powf(float __a, float __b) { return powf(__a, __b); } +__DEVICE__ double __omp_powi(double __a, int __b) { return powi(__a, __b); } +__DEVICE__ float __omp_powif(float __a, int __b) { return powif(__a, __b); } +__DEVICE__ double __omp_rcbrt(double __a) { return rcbrt(__a); } +__DEVICE__ float __omp_rcbrtf(float __a) { return rcbrtf(__a); } +__DEVICE__ double __omp_remainder(double __a, double __b) { + return remainder(__a, __b); +} +__DEVICE__ float __omp_remainderf(float __a, float __b) { + return remainderf(__a, __b); +} +__DEVICE__ double __omp_remquo(double __a, double __b, int *__c) { + return remquo(__a, __b, __c); +} +__DEVICE__ float __omp_remquof(float __a, float __b, int *__c) { + return remquof(__a, __b, __c); +} +__DEVICE__ double __omp_rhypot(double __a, double __b) { + return rhypot(__a, __b); +} +__DEVICE__ float __omp_rhypotf(float __a, float __b) { + return rhypotf(__a, __b); +} +__DEVICE__ double __omp_rint(double __a) { return rint(__a); } +__DEVICE__ float __omp_rintf(float __a) { return rintf(__a); } +__DEVICE__ double __omp_rnorm(int __a, const double *__b) { + return rnorm(__a, __b); +} +__DEVICE__ double __omp_rnorm3d(double __a, double __b, double __c) { + return rnorm3d(__a, __b, __c); +} +__DEVICE__ float __omp_rnorm3df(float __a, float __b, float __c) { + return rnorm3df(__a, __b, __c); +} +__DEVICE__ double __omp_rnorm4d(double __a, double __b, double __c, + double __d) { + return rnorm4d(__a, __b, __c, __d); +} +__DEVICE__ float __omp_rnorm4df(float __a, float __b, float __c, float __d) { + return rnorm4df(__a, __b, __c, __d); +} +__DEVICE__ float __omp_rnormf(int __dim, const float *__t) { + return rnormf(__dim, __t); +} +__DEVICE__ double __omp_rsqrt(double __a) { return rsqrt(__a); } +__DEVICE__ float __omp_rsqrtf(float __a) { return rsqrtf(__a); } +__DEVICE__ double __omp_scalbn(double __a, int __b) { return scalbn(__a, __b); } +__DEVICE__ float __omp_scalbnf(float __a, int __b) { return scalbnf(__a, __b); } +__DEVICE__ double __omp_scalbln(double __a, long __b) { + return scalbn(__a, (int)__b); +} +__DEVICE__ float __omp_scalblnf(float __a, long __b) { + return scalbnf(__a, (int)__b); +} +__DEVICE__ double __omp_sin(double __a) { return sin(__a); } +__DEVICE__ void __omp_sincos(double __a, double *__s, double *__c) { + return sincos(__a, __s, __c); +} +__DEVICE__ void __omp_sincosf(float __a, float *__s, float *__c) { + return sincosf(__a, __s, __c); +} +__DEVICE__ void __omp_sincospi(double __a, double *__s, double *__c) { + return sincospi(__a, __s, __c); +} +__DEVICE__ void __omp_sincospif(float __a, float *__s, float *__c) { + return sincospif(__a, __s, __c); +} +__DEVICE__ float __omp_sinf(float __a) { return sinf(__a); } +__DEVICE__ double __omp_sinh(double __a) { return sinh(__a); } +__DEVICE__ float __omp_sinhf(float __a) { return sinhf(__a); } +__DEVICE__ double __omp_sinpi(double __a) { return sinpi(__a); } +__DEVICE__ float __omp_sinpif(float __a) { return sinpif(__a); } +__DEVICE__ double __omp_sqrt(double __a) { return sqrt(__a); } +__DEVICE__ float __omp_sqrtf(float __a) { return sqrtf(__a); } +__DEVICE__ double __omp_tan(double __a) { return tan(__a); } +__DEVICE__ float __omp_tanf(float __a) { return tanf(__a); } +__DEVICE__ double __omp_tanh(double __a) { return tanh(__a); } +__DEVICE__ float __omp_tanhf(float __a) { return tanhf(__a); } +__DEVICE__ double __omp_tgamma(double __a) { return tgamma(__a); } +__DEVICE__ float __omp_tgammaf(float __a) { return tgammaf(__a); } +__DEVICE__ double __omp_trunc(double __a) { return trunc(__a); } +__DEVICE__ float __omp_truncf(float __a) { return truncf(__a); } +__DEVICE__ unsigned long long __omp_ullmax(unsigned long long __a, + unsigned long long __b) { + return ullmax(__a, __b); +} +__DEVICE__ unsigned long long __omp_ullmin(unsigned long long __a, + unsigned long long __b) { + return ullmin(__a, __b); +} +__DEVICE__ unsigned int __omp_umax(unsigned int __a, unsigned int __b) { + return umax(__a, __b); +} +__DEVICE__ unsigned int __omp_umin(unsigned int __a, unsigned int __b) { + return umin(__a, __b); +} +__DEVICE__ double __omp_y0(double __a) { return y0(__a); } +__DEVICE__ float __omp_y0f(float __a) { return y0f(__a); } +__DEVICE__ double __omp_y1(double __a) { return y1(__a); } +__DEVICE__ float __omp_y1f(float __a) { return y1f(__a); } +__DEVICE__ double __omp_yn(int __a, double __b) { return yn(__a, __b); } +__DEVICE__ float __omp_ynf(int __a, float __b) { return ynf(__a, __b); } +}