diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h --- a/openmp/libomptarget/deviceRTLs/common/debug.h +++ b/openmp/libomptarget/deviceRTLs/common/debug.h @@ -29,6 +29,9 @@ #define _OMPTARGET_NVPTX_DEBUG_H_ #include "common/device_environment.h" +#include "target_interface.h" + +#include //////////////////////////////////////////////////////////////////////////////// // set desired level of debugging diff --git a/openmp/libomptarget/deviceRTLs/common/device_environment.h b/openmp/libomptarget/deviceRTLs/common/device_environment.h --- a/openmp/libomptarget/deviceRTLs/common/device_environment.h +++ b/openmp/libomptarget/deviceRTLs/common/device_environment.h @@ -13,7 +13,7 @@ #ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_ #define _OMPTARGET_DEVICE_ENVIRONMENT_H_ -#include "target_impl.h" +#include "interface.h" struct omptarget_device_environmentTy { int32_t debug_level; diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -14,11 +14,10 @@ #ifndef OMPTARGET_H #define OMPTARGET_H -#include "target_impl.h" -#include "common/debug.h" // debug -#include "interface.h" // interfaces with omp, compiler, and user +#include "common/debug.h" // debug #include "common/state-queue.h" #include "common/support.h" +#include "interface.h" // interfaces with omp, compiler, and user #define OMPTARGET_NVPTX_VERSION 1.1 diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu @@ -11,8 +11,9 @@ //===----------------------------------------------------------------------===// #pragma omp declare target -#include "interface.h" #include "common/debug.h" +#include "interface.h" +#include "target_interface.h" EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { diff --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu b/openmp/libomptarget/deviceRTLs/common/src/critical.cu --- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/critical.cu @@ -11,8 +11,8 @@ //===----------------------------------------------------------------------===// #pragma omp declare target -#include "interface.h" #include "common/debug.h" +#include "interface.h" EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -12,7 +12,6 @@ #pragma omp declare target #include "common/omptarget.h" -#include "target_impl.h" // Return true if this is the master thread. INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -13,7 +13,6 @@ #pragma omp declare target #include "common/omptarget.h" -#include "target_impl.h" EXTERN double omp_get_wtick(void) { double rc = __kmpc_impl_get_wtick(); diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -14,7 +14,6 @@ #pragma omp declare target #include "common/omptarget.h" -#include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -11,8 +11,8 @@ //===----------------------------------------------------------------------===// #pragma omp declare target -#include "common/omptarget.h" #include "common/device_environment.h" +#include "common/omptarget.h" //////////////////////////////////////////////////////////////////////////////// // global device environment diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -12,7 +12,6 @@ #pragma omp declare target #include "common/omptarget.h" -#include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// // global data tables diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -34,7 +34,6 @@ #pragma omp declare target #include "common/omptarget.h" -#include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// // support for parallel that goes parallel (1 static level only) diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -12,7 +12,6 @@ #pragma omp declare target #include "common/omptarget.h" -#include "target_impl.h" EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid) {} diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -11,9 +11,9 @@ //===----------------------------------------------------------------------===// #pragma omp declare target -#include "common/support.h" #include "common/debug.h" #include "common/omptarget.h" +#include "common/support.h" //////////////////////////////////////////////////////////////////////////////// // Execution Parameters diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu --- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu @@ -12,7 +12,6 @@ #pragma omp declare target #include "common/omptarget.h" -#include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// // KMP Ordered calls diff --git a/openmp/libomptarget/deviceRTLs/common/state-queue.h b/openmp/libomptarget/deviceRTLs/common/state-queue.h --- a/openmp/libomptarget/deviceRTLs/common/state-queue.h +++ b/openmp/libomptarget/deviceRTLs/common/state-queue.h @@ -19,9 +19,7 @@ #ifndef __STATE_QUEUE_H #define __STATE_QUEUE_H -#include - -#include "target_impl.h" +#include "interface.h" template class omptarget_nvptx_Queue { private: diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -14,7 +14,6 @@ #define OMPTARGET_SUPPORT_H #include "interface.h" -#include "target_impl.h" //////////////////////////////////////////////////////////////////////////////// // Execution Parameters diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h @@ -9,10 +9,73 @@ #ifndef _NVPTX_INTERFACE_H_ #define _NVPTX_INTERFACE_H_ -#include +#define DEVICE __device__ +#define EXTERN extern "C" DEVICE +#define INLINE __forceinline__ DEVICE +#define NOINLINE __noinline__ DEVICE +#define SHARED __shared__ +#define ALIGN(N) __align__(N) + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 32 + +// Maximum number of preallocated arguments to an outlined parallel/simd +// function. Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + +// Maximum number of omp state objects per SM allocated statically in global +// memory. +#if __CUDA_ARCH__ >= 600 +#define OMP_STATE_COUNT 32 +#else +#define OMP_STATE_COUNT 16 +#endif + +#if !defined(MAX_SM) +#if __CUDA_ARCH__ >= 900 +#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option +#elif __CUDA_ARCH__ >= 800 +// GA100 design has a maxinum of 128 SMs but A100 product only has 108 SMs +// GA102 design has a maxinum of 84 SMs +#define MAX_SM 108 +#elif __CUDA_ARCH__ >= 700 +#define MAX_SM 84 +#elif __CUDA_ARCH__ >= 600 +#define MAX_SM 56 +#else +#define MAX_SM 16 +#endif +#endif + +#define OMP_ACTIVE_PARALLEL_LEVEL 128 + +// Data sharing related quantities, need to match what is used in the compiler. +enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 992, + // The size reserved for data in a shared memory slot. + DS_Slot_Size = 256, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 32, + // The size of the preallocated shared memory buffer per team + DS_Shared_Memory_Size = 128, +}; -#define EXTERN extern "C" __device__ typedef uint32_t __kmpc_impl_lanemask_t; typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ +enum : __kmpc_impl_lanemask_t { + __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 +}; + #endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ /dev/null @@ -1,157 +0,0 @@ -//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definitions of target specific functions -// -//===----------------------------------------------------------------------===// -#ifndef _TARGET_IMPL_H_ -#define _TARGET_IMPL_H_ - -#include -#include -#include -#include -#include - -#include "nvptx_interface.h" - -#define DEVICE __device__ -#define INLINE __forceinline__ DEVICE -#define NOINLINE __noinline__ DEVICE -#define SHARED __shared__ -#define ALIGN(N) __align__(N) - -//////////////////////////////////////////////////////////////////////////////// -// Kernel options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// The following def must match the absolute limit hardwired in the host RTL -// max number of threads per team -#define MAX_THREADS_PER_TEAM 1024 - -#define WARPSIZE 32 - -// Maximum number of preallocated arguments to an outlined parallel/simd function. -// Anything more requires dynamic memory allocation. -#define MAX_SHARED_ARGS 20 - -// Maximum number of omp state objects per SM allocated statically in global -// memory. -#if __CUDA_ARCH__ >= 600 -#define OMP_STATE_COUNT 32 -#else -#define OMP_STATE_COUNT 16 -#endif - -#if !defined(MAX_SM) -#if __CUDA_ARCH__ >= 900 -#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option -#elif __CUDA_ARCH__ >= 800 -// GA100 design has a maxinum of 128 SMs but A100 product only has 108 SMs -// GA102 design has a maxinum of 84 SMs -#define MAX_SM 108 -#elif __CUDA_ARCH__ >= 700 -#define MAX_SM 84 -#elif __CUDA_ARCH__ >= 600 -#define MAX_SM 56 -#else -#define MAX_SM 16 -#endif -#endif - -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - -// Data sharing related quantities, need to match what is used in the compiler. -enum DATA_SHARING_SIZES { - // The maximum number of workers in a kernel. - DS_Max_Worker_Threads = 992, - // The size reserved for data in a shared memory slot. - DS_Slot_Size = 256, - // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, - // The maximum number of warps in use - DS_Max_Warp_Number = 32, - // The size of the preallocated shared memory buffer per team - DS_Shared_Memory_Size = 128, -}; - -enum : __kmpc_impl_lanemask_t { - __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0 -}; - -DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi); -DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi); -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); -DEVICE uint32_t __kmpc_impl_smid(); -DEVICE double __kmpc_impl_get_wtick(); -DEVICE double __kmpc_impl_get_wtime(); - -INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); } -INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); } - -#ifndef CUDA_VERSION -#error CUDA_VERSION macro is undefined, something wrong with cuda. -#endif - -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); - -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, - int32_t SrcLane); - -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, - int32_t Var, uint32_t Delta, - int32_t Width); - -DEVICE void __kmpc_impl_syncthreads(); -DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask); - -// NVPTX specific kernel initialization -DEVICE void __kmpc_impl_target_init(); - -// Barrier until num_threads arrive. -DEVICE void __kmpc_impl_named_sync(uint32_t num_threads); - -DEVICE void __kmpc_impl_threadfence(); -DEVICE void __kmpc_impl_threadfence_block(); -DEVICE void __kmpc_impl_threadfence_system(); - -// Calls to the NVPTX layer (assuming 1D layout) -DEVICE int GetThreadIdInBlock(); -DEVICE int GetBlockIdInKernel(); -DEVICE int GetNumberOfBlocksInKernel(); -DEVICE int GetNumberOfThreadsInBlock(); -DEVICE unsigned GetWarpId(); -DEVICE unsigned GetLaneId(); - -// Atomics -DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); -DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); -DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); -DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); -DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); - -static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); -DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *, - unsigned long long); -DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *, - unsigned long long); - -// Locks -DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); -DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock); -DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock); -DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock); -DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock); - -// Memory -DEVICE void *__kmpc_impl_malloc(size_t); -DEVICE void __kmpc_impl_free(void *); - -#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -11,8 +11,8 @@ //===----------------------------------------------------------------------===// #pragma omp declare target -#include "target_impl.h" #include "common/debug.h" +#include "nvptx_interface.h" #include @@ -33,6 +33,10 @@ void __threadfence_system(); } +DEVICE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); } + +DEVICE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); } + DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); } diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/target_interface.h @@ -0,0 +1,82 @@ +//===------------- target_interface.h - Target interfaces --------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains interfaces that must be implemented by each target. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_TARGET_INTERFACE_H_ +#define _OMPTARGET_TARGET_INTERFACE_H_ + +#include "interface.h" + +// Calls to the NVPTX layer (assuming 1D layout) +EXTERN int GetThreadIdInBlock(); +EXTERN int GetBlockIdInKernel(); +EXTERN int GetNumberOfBlocksInKernel(); +EXTERN int GetNumberOfThreadsInBlock(); +EXTERN unsigned GetWarpId(); +EXTERN unsigned GetLaneId(); + +// Atomics +extern DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); +extern DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); +extern DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); +extern DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); +extern DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); +static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); +extern DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *, + unsigned long long); +extern DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *, + unsigned long long); + +// Locks +EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock); +EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock); +EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock); +EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock); +EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock); + +EXTERN void __kmpc_impl_threadfence(); +EXTERN void __kmpc_impl_threadfence_block(); +EXTERN void __kmpc_impl_threadfence_system(); + +EXTERN double __kmpc_impl_get_wtick(); +EXTERN double __kmpc_impl_get_wtime(); + +EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi); +EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi); +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); +EXTERN uint32_t __kmpc_impl_smid(); + +EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask(); + +EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, + int32_t SrcLane); +EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, + int32_t Var, uint32_t Delta, + int32_t Width); + +EXTERN uint32_t __kmpc_impl_ffs(uint32_t x); +EXTERN uint32_t __kmpc_impl_popc(uint32_t x); + +EXTERN void __kmpc_impl_syncthreads(); +EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask); + +// Kernel initialization +EXTERN void __kmpc_impl_target_init(); + +// Memory +EXTERN void *__kmpc_impl_malloc(size_t); +EXTERN void __kmpc_impl_free(void *); + +// Barrier until num_threads arrive. +DEVICE void __kmpc_impl_named_sync(uint32_t num_threads); + +#endif // _OMPTARGET_TARGET_INTERFACE_H_