diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -130,6 +130,7 @@ } EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +EXTERN unsigned GetWarpSize() { return WARPSIZE; } EXTERN unsigned GetLaneId() { return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); } diff --git a/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h b/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h --- a/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h +++ b/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h @@ -33,10 +33,12 @@ /// Forward declarations /// ///{ +extern "C" { unsigned GetLaneId(); unsigned GetWarpSize(); void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi); uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi); +} ///} /// Fallback implementations of the shuffle sync idiom. diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -13,64 +13,65 @@ #include "common/debug.h" #include "target_impl.h" +#include "target_interface.h" -DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { +EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); } -DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { +EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { uint64_t val; asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); return val; } -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { __kmpc_impl_lanemask_t res; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); return res; } -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { __kmpc_impl_lanemask_t res; asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res)); return res; } -DEVICE uint32_t __kmpc_impl_smid() { +EXTERN uint32_t __kmpc_impl_smid() { uint32_t id; asm("mov.u32 %0, %%smid;" : "=r"(id)); return id; } -DEVICE double __kmpc_impl_get_wtick() { +EXTERN double __kmpc_impl_get_wtick() { // Timer precision is 1ns return ((double)1E-9); } -DEVICE double __kmpc_impl_get_wtime() { +EXTERN double __kmpc_impl_get_wtime() { unsigned long long nsecs; asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); return (double)nsecs * __kmpc_impl_get_wtick(); } -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() { unsigned int Mask; asm volatile("activemask.b32 %0;" : "=r"(Mask)); return Mask; } -DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); } +EXTERN void __kmpc_impl_syncthreads() { __syncthreads(); } -DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { +EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); } // NVPTX specific kernel initialization -DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */ +EXTERN void __kmpc_impl_target_init() { /* nvptx needs no extra setup */ } // Barrier until num_threads arrive. -DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) { +EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) { // The named barrier for active parallel threads of a team in an L1 parallel // region to synchronize with each other. int barrier = 1; @@ -80,19 +81,20 @@ : "memory"); } -DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); } -DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); } -DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); } +EXTERN void __kmpc_impl_threadfence() { __nvvm_membar_gl(); } +EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); } +EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); } // Calls to the NVPTX layer (assuming 1D layout) -DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } -DEVICE int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); } -DEVICE int GetNumberOfBlocksInKernel() { +EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } +EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); } +EXTERN int GetNumberOfBlocksInKernel() { return __nvvm_read_ptx_sreg_nctaid_x(); } -DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); } -DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } -DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } +EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); } +EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +EXTERN unsigned GetWarpSize() { return WARPSIZE; } +EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } // Atomics DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { @@ -135,15 +137,15 @@ #define UNSET 0u #define SET 1u -DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock) { +EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) { __kmpc_impl_unset_lock(lock); } -DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) { +EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) { __kmpc_impl_unset_lock(lock); } -DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) { +EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) { // TODO: not sure spinning is a good idea here.. while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) { int32_t start = __nvvm_read_ptx_sreg_clock(); @@ -158,15 +160,15 @@ } // wait for 0 to be the read value } -DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock) { +EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) { (void)__kmpc_atomic_exchange(lock, UNSET); } -DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { +EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) { return __kmpc_atomic_add(lock, 0u); } -DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); } -DEVICE void __kmpc_impl_free(void *x) { free(x); } +EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); } +EXTERN void __kmpc_impl_free(void *x) { free(x); } #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h --- a/openmp/libomptarget/deviceRTLs/target_interface.h +++ b/openmp/libomptarget/deviceRTLs/target_interface.h @@ -21,19 +21,20 @@ EXTERN int GetNumberOfBlocksInKernel(); EXTERN int GetNumberOfThreadsInBlock(); EXTERN unsigned GetWarpId(); +EXTERN unsigned GetWarpSize(); EXTERN unsigned GetLaneId(); // Atomics -extern DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); -extern DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); -extern DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); -extern DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); -extern DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); +DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); -extern DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *, - unsigned long long); -extern DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *, - unsigned long long); +DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *, + unsigned long long); +DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *, + unsigned long long); // Locks EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);