diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -82,9 +82,6 @@
 DEVICE double __kmpc_impl_get_wtick();
 DEVICE double __kmpc_impl_get_wtime();
 
-INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
-INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
 
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h
--- a/openmp/libomptarget/deviceRTLs/common/debug.h
+++ b/openmp/libomptarget/deviceRTLs/common/debug.h
@@ -29,6 +29,7 @@
 #define _OMPTARGET_NVPTX_DEBUG_H_
 
 #include "common/device_environment.h"
+#include "target_interface.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // set desired level of debugging
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -330,6 +330,20 @@
 // inlined implementation
 ////////////////////////////////////////////////////////////////////////////////
 
+// Bit-scan helpers shared by all targets. The argument is 64 bits wide because
+// the amdgcn versions these replace took uint64_t; a 32-bit value is
+// zero-extended, which does not change either result.
+//
+// Returns one plus the index of the lowest set bit of x, or 0 when x is 0.
+INLINE DEVICE uint32_t __kmpc_impl_ffs(uint64_t x) {
+  return __builtin_ffsll(x);
+}
+
+// Returns the number of set bits in x.
+INLINE DEVICE uint32_t __kmpc_impl_popc(uint64_t x) {
+  return __builtin_popcountll(x);
+}
+
 #include "common/omptargeti.h"
 
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -85,73 +85,8 @@
   __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
 };
 
-DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
-DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
-DEVICE uint32_t __kmpc_impl_smid();
-DEVICE double __kmpc_impl_get_wtick();
-DEVICE double __kmpc_impl_get_wtime();
-
-INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
-INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
-
 #ifndef CUDA_VERSION
 #error CUDA_VERSION macro is undefined, something wrong with cuda.
 #endif
 
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
-
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane);
-
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
-                                          int32_t Var, uint32_t Delta,
-                                          int32_t Width);
-
-DEVICE void __kmpc_impl_syncthreads();
-DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
-
-// NVPTX specific kernel initialization
-DEVICE void __kmpc_impl_target_init();
-
-// Barrier until num_threads arrive.
-DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
-
-DEVICE void __kmpc_impl_threadfence();
-DEVICE void __kmpc_impl_threadfence_block();
-DEVICE void __kmpc_impl_threadfence_system();
-
-// Calls to the NVPTX layer (assuming 1D layout)
-DEVICE int GetThreadIdInBlock();
-DEVICE int GetBlockIdInKernel();
-DEVICE int GetNumberOfBlocksInKernel();
-DEVICE int GetNumberOfThreadsInBlock();
-DEVICE unsigned GetWarpId();
-DEVICE unsigned GetLaneId();
-
-// Atomics
-DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
-
-static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
-DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
-                                                 unsigned long long);
-DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
-                                            unsigned long long);
-
-// Locks
-DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock);
-DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock);
-DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock);
-
-// Memory
-DEVICE void *__kmpc_impl_malloc(size_t);
-DEVICE void __kmpc_impl_free(void *);
-
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@@ -0,0 +1,79 @@
+//===------------- target_interface.h - Target interfaces --------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains interfaces that must be implemented by each target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_TARGET_INTERFACE_H_
+#define _OMPTARGET_TARGET_INTERFACE_H_
+
+#include "target_impl.h"
+
+// Calls to the target layer (assuming 1D layout)
+EXTERN int GetThreadIdInBlock();
+EXTERN int GetBlockIdInKernel();
+EXTERN int GetNumberOfBlocksInKernel();
+EXTERN int GetNumberOfThreadsInBlock();
+EXTERN unsigned GetWarpId();
+EXTERN unsigned GetLaneId();
+
+// Atomics
+extern DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
+extern DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
+static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+extern DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
+                                                        unsigned long long);
+extern DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
+                                                   unsigned long long);
+
+// Locks
+EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
+EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
+EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
+EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
+EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);
+
+EXTERN void __kmpc_impl_threadfence();
+EXTERN void __kmpc_impl_threadfence_block();
+EXTERN void __kmpc_impl_threadfence_system();
+
+EXTERN double __kmpc_impl_get_wtick();
+EXTERN double __kmpc_impl_get_wtime();
+
+EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
+EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
+EXTERN uint32_t __kmpc_impl_smid();
+
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
+
+EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+                                     int32_t SrcLane);
+EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
+                                          int32_t Var, uint32_t Delta,
+                                          int32_t Width);
+
+EXTERN void __kmpc_impl_syncthreads();
+EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
+
+// Kernel initialization
+EXTERN void __kmpc_impl_target_init();
+
+// Memory
+EXTERN void *__kmpc_impl_malloc(size_t);
+EXTERN void __kmpc_impl_free(void *);
+
+// Barrier until num_threads arrive.
+EXTERN void __kmpc_impl_named_sync(uint32_t num_threads);
+
+#endif // _OMPTARGET_TARGET_INTERFACE_H_