Define the amdgcn deviceRTL atomics in hip_atomics.h instead of only declaring
them there.

* CMakeLists.txt: define llvm_sources (src/amdgcn_atomic.ll) and append it to
  bc_files inside the per-mcpu loop.
* amdgcn_atomic.ll: new IR file defining __amdgcn_atomic_inc_u32 (a call to the
  llvm.amdgcn.atomic.inc intrinsic) and __amdgcn_atomic_max_u64 (a seq_cst
  atomicrmw umax).
* hip_atomics.h: replace the atomic* declarations with definitions in an
  anonymous namespace. atomicAdd, atomicExch and atomicCAS are templates over
  the __atomic builtins; atomicInc and atomicMax forward to the IR functions.
* loop.cu: drop the unsigned long long casts at the __kmpc_atomic_max call.

NOTE(review): this patch text was recovered from a copy whose line breaks and
indentation had been collapsed. The template headers in hip_atomics.h had lost
their parameter lists and are restored as "template <typename T>". Indentation
of context lines and the position of blank context lines are inferred from the
hunk counts; confirm against the target tree before applying. Several
signatures in the collapsed copy had a stray space after "(", which may mark
another dropped token; confirm against upstream.

diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@@ -70,6 +70,12 @@
     ${devicertl_base_directory}/common/src/sync.cu
     ${devicertl_base_directory}/common/src/task.cu)
 
+# Functions implemented in IR are used where there is not yet a corresponding
+# intrinsic available in clang. The intent is for these functions to be removed
+# as clang is extended.
+set(llvm_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_atomic.ll)
+
 set(h_files
     ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h
     ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_atomics.h
@@ -137,6 +143,7 @@
 foreach(mcpu ${mcpus})
   set(bc_files)
   add_cuda_bc_library(${cuda_sources})
+  list(APPEND bc_files ${llvm_sources})
 
   set(bc_libname lib${libname}-${mcpu}.bc)
   add_custom_command(
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_atomic.ll b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_atomic.ll
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_atomic.ll
@@ -0,0 +1,28 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target triple = "amdgcn-amd-amdhsa"
+
+; These functions are implemented in IR as there is not yet a corresponding intrinsic
+; available in clang. The intent is to remove it once said intrinsic is implemented.
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #1
+
+; Function Attrs: alwaysinline nounwind
+define i32 @__amdgcn_atomic_inc_u32(i32* %x, i32 %v) #0 {
+entry:
+  %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %x, i32 %v,
+    i32 5, ; Ordering. AtomicOrdering.h: sequentially consistent
+    i32 2, ; Scope. SyncScope.h: OpenCLAllSVMDevices is 2
+    i1 0   ; Volatile. False for consistency with other atomic operations
+  )
+  ret i32 %ret
+}
+
+define i64 @__amdgcn_atomic_max_u64(i64* %a, i64 %v) #0 {
+entry:
+  %0 = atomicrmw umax i64* %a, i64 %v seq_cst
+  ret i64 %0
+}
+
+
+attributes #0 = { alwaysinline nounwind }
+attributes #1 = { nounwind argmemonly }
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h
@@ -11,29 +11,35 @@
 
 #include "target_impl.h"
 
-DEVICE unsigned atomicAdd(unsigned *address, unsigned val);
-DEVICE int atomicAdd(int *address, int val);
-DEVICE unsigned long long atomicAdd(unsigned long long *address,
-                                    unsigned long long val);
-
-DEVICE unsigned atomicInc(unsigned *address);
-DEVICE unsigned atomicInc(unsigned *address, unsigned max);
-DEVICE int atomicInc(int *address);
-
-DEVICE int atomicMax(int *address, int val);
-DEVICE unsigned atomicMax(unsigned *address, unsigned val);
-DEVICE unsigned long long atomicMax(unsigned long long *address,
-                                    unsigned long long val);
-
-DEVICE int atomicExch(int *address, int val);
-DEVICE unsigned atomicExch(unsigned *address, unsigned val);
-DEVICE unsigned long long atomicExch(unsigned long long *address,
-                                     unsigned long long val);
-
-DEVICE unsigned atomicCAS(unsigned *address, unsigned compare, unsigned val);
-DEVICE int atomicCAS(int *address, int compare, int val);
-DEVICE unsigned long long atomicCAS(unsigned long long *address,
-                                    unsigned long long compare,
-                                    unsigned long long val);
-
+namespace {
+
+template <typename T> DEVICE T atomicAdd(T *x, T v) {
+  return __atomic_fetch_add(x, v, __ATOMIC_SEQ_CST);
+}
+
+// Only implemented for i32 as that's the only call site
+EXTERN uint32_t __amdgcn_atomic_inc_u32(uint32_t *, uint32_t);
+INLINE uint32_t atomicInc(uint32_t *address, uint32_t val) {
+  return __amdgcn_atomic_inc_u32(address, val);
+}
+
+EXTERN uint64_t __amdgcn_atomic_max_u64(uint64_t *, uint64_t);
+INLINE uint64_t atomicMax(uint64_t *address, uint64_t val) {
+  return __amdgcn_atomic_max_u64(address, val);
+}
+
+template <typename T> DEVICE T atomicExch(T *address, T val) {
+  T r;
+  __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST);
+  return r;
+}
+
+template <typename T>
+DEVICE T atomicCAS(T *address, T compare, T val) {
+  (void)__atomic_compare_exchange(address, &compare, &val, false,
+                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return compare;
+}
+
+} // namespace
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@@ -793,8 +793,7 @@
     // Atomic max of iterations.
     uint64_t *varArray = (uint64_t *)array;
     uint64_t elem = varArray[i];
-    (void)__kmpc_atomic_max((unsigned long long int *)Buffer,
-                            (unsigned long long int)elem);
+    (void)__kmpc_atomic_max(Buffer, elem);
 
     // Barrier.
     syncWorkersInGenericMode(NumThreads);