Move the per-target deviceRTL interface declarations into a single shared
header, target_interface.h, and turn the AMDGCN inline helpers into
out-of-line definitions in target_impl.hip.

__kmpc_impl_ffs/__kmpc_impl_popc are provided for both 32-bit and 64-bit
operands so that targets passing 64-bit values keep their previous behaviour.

diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -67,88 +67,6 @@
   __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
 };
 
-INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
-  lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
-  hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
-}
-
-INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
-  return (((uint64_t)hi) << 32) | (uint64_t)lo;
-}
-
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
-DEVICE uint32_t __kmpc_impl_smid();
-DEVICE double __kmpc_impl_get_wtick();
-DEVICE double __kmpc_impl_get_wtime();
-
-INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
-INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
-
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
-
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
-                                     int32_t SrcLane);
-
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var,
-                                          uint32_t Delta, int32_t Width);
-
-INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }
-
-INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
-  // AMDGCN doesn't need to sync threads in a warp
-}
-
-// AMDGCN specific kernel initialization
-DEVICE void __kmpc_impl_target_init();
-
-// Equivalent to ptx bar.sync 1. Barrier until num_threads arrive.
-DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
-
-INLINE void __kmpc_impl_threadfence() {
-  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
-}
-
-INLINE void __kmpc_impl_threadfence_block() {
-  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
-}
-
-INLINE void __kmpc_impl_threadfence_system() {
-  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
-}
-
-// Calls to the AMDGCN layer (assuming 1D layout)
-INLINE int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); }
-INLINE int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }
-DEVICE int GetNumberOfBlocksInKernel();
-DEVICE int GetNumberOfThreadsInBlock();
-DEVICE unsigned GetWarpId();
-DEVICE unsigned GetLaneId();
-
-// Atomics
-DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
-
-static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
-DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
-                                                 unsigned long long);
-DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
-                                            unsigned long long);
-
-// Locks
-DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock);
-DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock);
-
-// Memory
-DEVICE void *__kmpc_impl_malloc(size_t x);
-DEVICE void __kmpc_impl_free(void *x);
-
 // DEVICE versions of part of libc
 INLINE void __assert_fail(const char *, const char *, unsigned int,
                           const char *) {
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -189,4 +189,35 @@
 DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; }
 DEVICE void __kmpc_impl_free(void *) {}
 
+DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
+  lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
+  hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
+}
+
+DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
+  return (((uint64_t)hi) << 32) | (uint64_t)lo;
+}
+
+DEVICE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }
+
+DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
+  // AMDGCN doesn't need to sync threads in a warp
+}
+
+DEVICE void __kmpc_impl_threadfence() {
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
+}
+
+DEVICE void __kmpc_impl_threadfence_block() {
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+}
+
+DEVICE void __kmpc_impl_threadfence_system() {
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
+}
+
+// Calls to the AMDGCN layer (assuming 1D layout)
+DEVICE int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); }
+DEVICE int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }
+
 #pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h
--- a/openmp/libomptarget/deviceRTLs/common/debug.h
+++ b/openmp/libomptarget/deviceRTLs/common/debug.h
@@ -29,6 +29,7 @@
 #define _OMPTARGET_NVPTX_DEBUG_H_
 
 #include "common/device_environment.h"
+#include "target_interface.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // set desired level of debugging
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -330,6 +330,21 @@
 // inlined implementation
 ////////////////////////////////////////////////////////////////////////////////
 
+// Find-first-set and population count, overloaded on operand width so that
+// 64-bit operands (as the AMDGCN implementation took) are not truncated to
+// their low 32 bits by an implicit conversion.
+INLINE DEVICE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
+INLINE DEVICE uint32_t __kmpc_impl_ffs(uint64_t x) {
+  return __builtin_ffsll(x);
+}
+
+INLINE DEVICE uint32_t __kmpc_impl_popc(uint32_t x) {
+  return __builtin_popcount(x);
+}
+INLINE DEVICE uint32_t __kmpc_impl_popc(uint64_t x) {
+  return __builtin_popcountll(x);
+}
+
 #include "common/omptargeti.h"
 
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -85,73 +85,8 @@
   __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
 };
 
-DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
-DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
-DEVICE uint32_t __kmpc_impl_smid();
-DEVICE double __kmpc_impl_get_wtick();
-DEVICE double __kmpc_impl_get_wtime();
-
-INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
-INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
-
 #ifndef CUDA_VERSION
 #error CUDA_VERSION macro is undefined, something wrong with cuda.
 #endif
 
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
-
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane);
-
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
-                                          int32_t Var, uint32_t Delta,
-                                          int32_t Width);
-
-DEVICE void __kmpc_impl_syncthreads();
-DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
-
-// NVPTX specific kernel initialization
-DEVICE void __kmpc_impl_target_init();
-
-// Barrier until num_threads arrive.
-DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
-
-DEVICE void __kmpc_impl_threadfence();
-DEVICE void __kmpc_impl_threadfence_block();
-DEVICE void __kmpc_impl_threadfence_system();
-
-// Calls to the NVPTX layer (assuming 1D layout)
-DEVICE int GetThreadIdInBlock();
-DEVICE int GetBlockIdInKernel();
-DEVICE int GetNumberOfBlocksInKernel();
-DEVICE int GetNumberOfThreadsInBlock();
-DEVICE unsigned GetWarpId();
-DEVICE unsigned GetLaneId();
-
-// Atomics
-DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
-
-static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
-DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
-                                                 unsigned long long);
-DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
-                                            unsigned long long);
-
-// Locks
-DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock);
-DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock);
-
-// Memory
-DEVICE void *__kmpc_impl_malloc(size_t);
-DEVICE void __kmpc_impl_free(void *);
-
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@@ -0,0 +1,83 @@
+//===------------- target_interface.h - Target interfaces --------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains interfaces that must be implemented by each target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_TARGET_INTERFACE_H_
+#define _OMPTARGET_TARGET_INTERFACE_H_
+
+#include "target_impl.h"
+
+// Calls to the target layer (assuming 1D layout)
+EXTERN int GetThreadIdInBlock();
+EXTERN int GetBlockIdInKernel();
+EXTERN int GetNumberOfBlocksInKernel();
+EXTERN int GetNumberOfThreadsInBlock();
+EXTERN unsigned GetWarpId();
+EXTERN unsigned GetLaneId();
+
+// Atomics
+// NOTE: declared with `extern DEVICE` instead of EXTERN because
+// __kmpc_atomic_exchange and __kmpc_atomic_add are overloaded on the operand
+// type; presumably EXTERN gives C linkage, which cannot be overloaded.
+extern DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
+static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+extern DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
+                                                        unsigned long long);
+extern DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
+                                                   unsigned long long);
+
+// Locks
+EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
+EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
+EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
+EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
+EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);
+
+// Memory fences
+EXTERN void __kmpc_impl_threadfence();
+EXTERN void __kmpc_impl_threadfence_block();
+EXTERN void __kmpc_impl_threadfence_system();
+
+EXTERN double __kmpc_impl_get_wtick();
+EXTERN double __kmpc_impl_get_wtime();
+
+EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
+EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
+EXTERN uint32_t __kmpc_impl_smid();
+
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
+
+EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+                                     int32_t SrcLane);
+EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
+                                          int32_t Var, uint32_t Delta,
+                                          int32_t Width);
+
+EXTERN void __kmpc_impl_syncthreads();
+EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
+
+// Kernel initialization
+EXTERN void __kmpc_impl_target_init();
+
+// Memory
+EXTERN void *__kmpc_impl_malloc(size_t);
+EXTERN void __kmpc_impl_free(void *);
+
+// Barrier until num_threads arrive.
+EXTERN void __kmpc_impl_named_sync(uint32_t num_threads);
+
+#endif // _OMPTARGET_TARGET_INTERFACE_H_