diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1199,6 +1199,18 @@ CmdArgs.push_back("-include"); CmdArgs.push_back("__clang_openmp_device_functions.h"); + + { + auto *CTC = static_cast<const toolchains::CudaToolChain *>( + C.getSingleOffloadToolChain<Action::OFK_Cuda>()); + assert(CTC && "Expected valid CUDA Toolchain."); + CudaVersion Ver = CTC->CudaInstallation.version(); + CmdArgs.push_back("-include"); + const char *Header = (Ver >= CudaVersion::CUDA_90) + ? "__clang_openmp_devicertl_cuda_ge90.h" + : "__clang_openmp_devicertl_cuda_lt90.h"; + CmdArgs.push_back(Header); + } } // Add -i* options, and automatically translate to diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -160,6 +160,8 @@ openmp_wrappers/complex.h openmp_wrappers/complex openmp_wrappers/__clang_openmp_device_functions.h + openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h + openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h openmp_wrappers/complex_cmath.h openmp_wrappers/new ) diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h @@ -0,0 +1,53 @@ +//===--- __clang_openmp_devicertl_cuda_ge90.h -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef __CLANG_OPENMP_DEVICERTL_CUDA_GE90_H__ +#define __CLANG_OPENMP_DEVICERTL_CUDA_GE90_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#pragma push_macro("DEVICE") + +#ifdef _OPENMP +#define DEVICE __attribute__((used)) +#else +#define DEVICE __attribute__((used)) __attribute__((device)) +#endif + +// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). +inline DEVICE unsigned __kmpc_impl_activemask() { + unsigned mask; + asm volatile("activemask.b32 %0;" : "=r"(mask)); + return mask; +} + +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. +inline DEVICE int __kmpc_impl_shfl_sync(unsigned Mask, int Var, int SrcLane) { + int WARPSIZE = 32; + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, WARPSIZE - 1); +} + +inline DEVICE int __kmpc_impl_shfl_down_sync(unsigned Mask, int Var, + unsigned Delta, int Width) { + int WARPSIZE = 32; + int tmp = ((WARPSIZE - Width) << 8) | 0x1f; + return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, tmp); +} + +inline DEVICE void __kmpc_impl_syncwarp(unsigned Mask) { + __nvvm_bar_warp_sync(Mask); +} + +#pragma pop_macro("DEVICE") + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h @@ -0,0 +1,52 @@ +//===--- __clang_openmp_devicertl_cuda_lt90.h -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef __CLANG_OPENMP_DEVICERTL_CUDA_LT90_H__ +#define __CLANG_OPENMP_DEVICERTL_CUDA_LT90_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#pragma push_macro("DEVICE") + +#ifdef _OPENMP +#define DEVICE __attribute__((used)) +#else +#define DEVICE __attribute__((used)) __attribute__((device)) +#endif + +// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). +inline DEVICE unsigned __kmpc_impl_activemask() { + return __nvvm_vote_ballot(1); +} + +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. +inline DEVICE int __kmpc_impl_shfl_sync(unsigned Mask, int Var, int SrcLane) { + int WARPSIZE = 32; + return __nvvm_shfl_idx_i32(Var, SrcLane, WARPSIZE - 1); +} + +inline DEVICE int __kmpc_impl_shfl_down_sync(unsigned Mask, int Var, + unsigned Delta, int Width) { + int WARPSIZE = 32; + int tmp = ((WARPSIZE - Width) << 8) | 0x1f; + return __nvvm_shfl_down_i32(Var, Delta, tmp); +} + +inline DEVICE void __kmpc_impl_syncwarp(unsigned Mask) { + (void)Mask; + // In Cuda < 9.0 no need to sync threads in warps. 
+} + +#pragma pop_macro("DEVICE") + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -85,19 +85,17 @@ INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask(); -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, +EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, int32_t SrcLane); -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, +EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, uint32_t Delta, int32_t Width); -INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } +EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t); -INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { - // AMDGCN doesn't need to sync threads in a warp -} +INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } // AMDGCN specific kernel initialization DEVICE void __kmpc_impl_target_init(); diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -46,11 +46,11 @@ } // Warp vote function -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() { return __builtin_amdgcn_read_exec(); } -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, +EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, 
int32_t srcLane) { int width = WARPSIZE; int self = GetLaneId(); @@ -58,7 +58,7 @@ return __builtin_amdgcn_ds_bpermute(index << 2, var); } -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, +EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, uint32_t laneDelta, int32_t width) { int self = GetLaneId(); int index = self + laneDelta; @@ -66,6 +66,10 @@ return __builtin_amdgcn_ds_bpermute(index << 2, var); } +EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { + // AMDGCN doesn't need to sync threads in a warp +} + static DEVICE SHARED uint32_t L1_Barrier; DEVICE void __kmpc_impl_target_init() { diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -100,17 +100,18 @@ #error CUDA_VERSION macro is undefined, something wrong with cuda. #endif -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask(); -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, +EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, int32_t SrcLane); -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, +EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, uint32_t Delta, int32_t Width); +EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask); + DEVICE void __kmpc_impl_syncthreads(); -DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask); // NVPTX specific kernel initialization DEVICE void __kmpc_impl_target_init(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -18,17 
+18,6 @@ // Forward declaration of CUDA primitives which will be evetually transformed // into LLVM intrinsics. -extern "C" { -unsigned int __activemask(); -unsigned int __ballot(unsigned); -// The default argument here is based on NVIDIA's website -// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/ -int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE); -int __shfl(int val, int src_line, int width = WARPSIZE); -int __shfl_down(int var, unsigned detla, int width); -int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width); -void __syncwarp(int mask); -} DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); @@ -69,45 +58,8 @@ return (double)nsecs * __kmpc_impl_get_wtick(); } -// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { -#if CUDA_VERSION >= 9000 - return __activemask(); -#else - return __ballot(1); -#endif -} - -// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, - int32_t SrcLane) { -#if CUDA_VERSION >= 9000 - return __shfl_sync(Mask, Var, SrcLane); -#else - return __shfl(Var, SrcLane); -#endif // CUDA_VERSION -} - -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, - int32_t Var, uint32_t Delta, - int32_t Width) { -#if CUDA_VERSION >= 9000 - return __shfl_down_sync(Mask, Var, Delta, Width); -#else - return __shfl_down(Var, Delta, Width); -#endif // CUDA_VERSION -} - DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); } -DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { -#if CUDA_VERSION >= 9000 - __syncwarp(Mask); -#else - // In Cuda < 9.0 no need to sync threads in warps. 
-#endif // CUDA_VERSION -} - // NVPTX specific kernel initialization DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */ }