diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip @@ -18,15 +18,15 @@ #include "common/debug.h" -static DEVICE void warn() { +static void warn() { PRINT0(LD_ALL, "Locks are not supported in this thread mapping model"); } -DEVICE void __kmpc_impl_init_lock(omp_lock_t *) { warn(); } -DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); } -DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } -DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } -DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { +void __kmpc_impl_init_lock(omp_lock_t *) { warn(); } +void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); } +void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } +void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } +int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); return 0; } diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -25,9 +25,8 @@ #define PRId64 "ld" #define PRIu64 "lu" -#define DEVICE -#define INLINE inline DEVICE -#define NOINLINE __attribute__((noinline)) DEVICE +#define INLINE inline +#define NOINLINE __attribute__((noinline)) #define ALIGN(N) __attribute__((aligned(N))) //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -107,12 +107,12 @@ } namespace { -DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) { +uint32_t 
get_grid_dim(uint32_t n, uint16_t d) { uint32_t q = n / d; return q + (n > q * d); } -DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size, - uint16_t group_size) { +uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size, + uint16_t group_size) { uint32_t r = grid_size - group_id * group_size; return (r < group_size) ? r : group_size; } @@ -140,36 +140,35 @@ } // Atomics -DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { +uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); } -DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { +uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { return __builtin_amdgcn_atomic_inc32(Address, Val, __ATOMIC_SEQ_CST, ""); } -DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { +uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST); } -DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { +uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { uint32_t R; __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); return R; } -DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, - uint32_t Val) { +uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) { (void)__atomic_compare_exchange(Address, &Compare, &Val, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); return Compare; } -DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, - unsigned long long Val) { +unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, + unsigned long long Val) { unsigned long long R; __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); return R; } -DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address, - unsigned long long Val) { +unsigned long long __kmpc_atomic_add(unsigned long long *Address, + unsigned long long 
Val) { return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); } diff --git a/openmp/libomptarget/deviceRTLs/common/device_environment.h b/openmp/libomptarget/deviceRTLs/common/device_environment.h --- a/openmp/libomptarget/deviceRTLs/common/device_environment.h +++ b/openmp/libomptarget/deviceRTLs/common/device_environment.h @@ -19,6 +19,6 @@ int32_t debug_level; }; -extern DEVICE omptarget_device_environmentTy omptarget_device_environment; +extern omptarget_device_environmentTy omptarget_device_environment; #endif diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -73,8 +73,7 @@ uint32_t nArgs; }; -extern DEVICE - omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs); +extern omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs); // Worker slot type which is initialized with the default worker slot // size of 4*32 bytes. 
@@ -96,7 +95,7 @@ __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; }; -extern DEVICE DataSharingStateTy EXTERN_SHARED(DataSharingState); +extern DataSharingStateTy EXTERN_SHARED(DataSharingState); //////////////////////////////////////////////////////////////////////////////// // task ICV and (implicit & explicit) task state @@ -294,25 +293,23 @@ // global data tables //////////////////////////////////////////////////////////////////////////////// -extern DEVICE omptarget_nvptx_SimpleMemoryManager - omptarget_nvptx_simpleMemoryManager; -extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx); -extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx); +extern omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; +extern uint32_t EXTERN_SHARED(usedMemIdx); +extern uint32_t EXTERN_SHARED(usedSlotIdx); #if _OPENMP -extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +extern uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; #pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) #else -extern DEVICE - uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE]; +extern uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE]; #endif -extern DEVICE uint16_t EXTERN_SHARED(threadLimit); -extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam); -extern DEVICE uint16_t EXTERN_SHARED(nThreads); -extern DEVICE omptarget_nvptx_ThreadPrivateContext * +extern uint16_t EXTERN_SHARED(threadLimit); +extern uint16_t EXTERN_SHARED(threadsInTeam); +extern uint16_t EXTERN_SHARED(nThreads); +extern omptarget_nvptx_ThreadPrivateContext * EXTERN_SHARED(omptarget_nvptx_threadPrivateContext); -extern DEVICE uint32_t EXTERN_SHARED(execution_param); -extern DEVICE void *EXTERN_SHARED(ReductionScratchpadPtr); +extern uint32_t EXTERN_SHARED(execution_param); +extern void *EXTERN_SHARED(ReductionScratchpadPtr); //////////////////////////////////////////////////////////////////////////////// // work function (outlined 
parallel/simd functions) and arguments. @@ -320,8 +317,7 @@ //////////////////////////////////////////////////////////////////////////////// typedef void *omptarget_nvptx_WorkFn; -extern volatile DEVICE - omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn); +extern volatile omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn); //////////////////////////////////////////////////////////////////////////////// // get private data structures diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -19,30 +19,30 @@ // global device environment //////////////////////////////////////////////////////////////////////////////// -DEVICE omptarget_device_environmentTy omptarget_device_environment; +omptarget_device_environmentTy omptarget_device_environment; //////////////////////////////////////////////////////////////////////////////// // global data holding OpenMP state information //////////////////////////////////////////////////////////////////////////////// // OpenMP will try to call its ctor if we don't add the attribute explicitly -[[clang::loader_uninitialized]] DEVICE - omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT> - omptarget_nvptx_device_State[MAX_SM]; +[[clang::loader_uninitialized]] omptarget_nvptx_Queue< + omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT> + omptarget_nvptx_device_State[MAX_SM]; -DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; -DEVICE uint32_t SHARED(usedMemIdx); -DEVICE uint32_t SHARED(usedSlotIdx); +omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager; +uint32_t SHARED(usedMemIdx); +uint32_t SHARED(usedSlotIdx); // SHARED doesn't work with array so we add the attribute explicitly.
-[[clang::loader_uninitialized]] DEVICE uint8_t +[[clang::loader_uninitialized]] uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; #pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc) -DEVICE uint16_t SHARED(threadLimit); -DEVICE uint16_t SHARED(threadsInTeam); -DEVICE uint16_t SHARED(nThreads); +uint16_t SHARED(threadLimit); +uint16_t SHARED(threadsInTeam); +uint16_t SHARED(nThreads); // Pointer to this team's OpenMP state object -DEVICE omptarget_nvptx_ThreadPrivateContext * +omptarget_nvptx_ThreadPrivateContext * SHARED(omptarget_nvptx_threadPrivateContext); //////////////////////////////////////////////////////////////////////////////// @@ -50,26 +50,26 @@ // communicate with the workers. Since it is in shared memory, there is one // copy of these variables for each kernel, instance, and team. //////////////////////////////////////////////////////////////////////////////// -volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn); +volatile omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn); //////////////////////////////////////////////////////////////////////////////// // OpenMP kernel execution parameters //////////////////////////////////////////////////////////////////////////////// -DEVICE uint32_t SHARED(execution_param); +uint32_t SHARED(execution_param); //////////////////////////////////////////////////////////////////////////////// // Data sharing state //////////////////////////////////////////////////////////////////////////////// -DEVICE DataSharingStateTy SHARED(DataSharingState); +DataSharingStateTy SHARED(DataSharingState); //////////////////////////////////////////////////////////////////////////////// // Scratchpad for teams reduction. 
//////////////////////////////////////////////////////////////////////////////// -DEVICE void *SHARED(ReductionScratchpadPtr); +void *SHARED(ReductionScratchpadPtr); //////////////////////////////////////////////////////////////////////////////// // Data sharing related variables. //////////////////////////////////////////////////////////////////////////////// -DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs); +omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs); #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -18,9 +18,9 @@ // global data tables //////////////////////////////////////////////////////////////////////////////// -extern DEVICE - omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT> - omptarget_nvptx_device_State[MAX_SM]; +extern omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, + OMP_STATE_COUNT> + omptarget_nvptx_device_State[MAX_SM]; //////////////////////////////////////////////////////////////////////////////// // init entry points diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -174,8 +174,8 @@ INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? 
x : y; } -DEVICE static volatile uint32_t IterCnt = 0; -DEVICE static volatile uint32_t Cnt = 0; +static volatile uint32_t IterCnt = 0; +static volatile uint32_t Cnt = 0; EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( kmp_Ident *loc, int32_t global_tid, void *global_buffer, int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -19,20 +19,20 @@ // Execution Parameters //////////////////////////////////////////////////////////////////////////////// -DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { +void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { execution_param = EMode; execution_param |= RMode; } -DEVICE bool isGenericMode() { return (execution_param & ModeMask) == Generic; } +bool isGenericMode() { return (execution_param & ModeMask) == Generic; } -DEVICE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; } +bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; } -DEVICE bool isRuntimeUninitialized() { +bool isRuntimeUninitialized() { return (execution_param & RuntimeMask) == RuntimeUninitialized; } -DEVICE bool isRuntimeInitialized() { +bool isRuntimeInitialized() { return (execution_param & RuntimeMask) == RuntimeInitialized; } @@ -40,7 +40,7 @@ // Execution Modes based on location parameter fields //////////////////////////////////////////////////////////////////////////////// -DEVICE bool checkSPMDMode(kmp_Ident *loc) { +bool checkSPMDMode(kmp_Ident *loc) { if (!loc) return isSPMDMode(); @@ -58,9 +58,9 @@ return isSPMDMode(); } -DEVICE bool checkGenericMode(kmp_Ident *loc) { return !checkSPMDMode(loc); } +bool checkGenericMode(kmp_Ident *loc) { return !checkSPMDMode(loc); } -DEVICE bool checkRuntimeUninitialized(kmp_Ident 
*loc) { +bool checkRuntimeUninitialized(kmp_Ident *loc) { if (!loc) return isRuntimeUninitialized(); @@ -83,7 +83,7 @@ return isRuntimeUninitialized(); } -DEVICE bool checkRuntimeInitialized(kmp_Ident *loc) { +bool checkRuntimeInitialized(kmp_Ident *loc) { return !checkRuntimeUninitialized(loc); } @@ -105,13 +105,13 @@ // If NumThreads is 1024, master id is 992. // // Called in Generic Execution Mode only. -DEVICE int GetMasterThreadID() { +int GetMasterThreadID() { return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); } // The last warp is reserved for the master; other warps are workers. // Called in Generic Execution Mode only. -DEVICE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } +int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } //////////////////////////////////////////////////////////////////////////////// // get thread id in team @@ -120,7 +120,7 @@ // or a serial region by the master. If the master (whose CUDA thread // id is GetMasterThreadID()) calls this routine, we return 0 because // it is a shadow for the first worker. -DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { +int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { // Implemented using control flow (predication) instead of with a modulo // operation. 
int tid = GetThreadIdInBlock(); @@ -136,7 +136,7 @@ // //////////////////////////////////////////////////////////////////////////////// -DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { +int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { // omp_thread_num int rc; if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { @@ -152,7 +152,7 @@ return rc; } -DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { +int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { // omp_num_threads int rc; int Level = parallelLevel[GetWarpId()]; @@ -170,12 +170,12 @@ //////////////////////////////////////////////////////////////////////////////// // Team id linked to OpenMP -DEVICE int GetOmpTeamId() { +int GetOmpTeamId() { // omp_team_num return GetBlockIdInKernel(); // assume 1 block per team } -DEVICE int GetNumberOfOmpTeams() { +int GetNumberOfOmpTeams() { // omp_num_teams return GetNumberOfBlocksInKernel(); // assume 1 block per team } @@ -183,12 +183,12 @@ //////////////////////////////////////////////////////////////////////////////// // Masters -DEVICE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } +int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } //////////////////////////////////////////////////////////////////////////////// // Parallel level -DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { +void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { __kmpc_impl_syncwarp(Mask); __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); @@ -200,7 +200,7 @@ __kmpc_impl_syncwarp(Mask); } -DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { +void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { __kmpc_impl_syncwarp(Mask); __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); @@ 
-216,13 +216,13 @@ // get OpenMP number of procs // Get the number of processors in the device. -DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { +int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { if (!isSPMDExecutionMode) return GetNumberOfWorkersInTeam(); return GetNumberOfThreadsInBlock(); } -DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { +int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { return GetNumberOfProcsInDevice(isSPMDExecutionMode); } @@ -230,8 +230,8 @@ // Memory //////////////////////////////////////////////////////////////////////////////// -DEVICE unsigned long PadBytes(unsigned long size, - unsigned long alignment) // must be a power of 2 +unsigned long PadBytes(unsigned long size, + unsigned long alignment) // must be a power of 2 { // compute the necessary padding to satisfy alignment constraint ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0, @@ -239,7 +239,7 @@ return (~(unsigned long)size + 1) & (alignment - 1); } -DEVICE void *SafeMalloc(size_t size, const char *msg) // check if success +void *SafeMalloc(size_t size, const char *msg) // check if success { void *ptr = __kmpc_impl_malloc(size); PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n", @@ -247,7 +247,7 @@ return ptr; } -DEVICE void *SafeFree(void *ptr, const char *msg) { +void *SafeFree(void *ptr, const char *msg) { PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg); __kmpc_impl_free(ptr); return NULL; @@ -257,11 +257,11 @@ // Teams Reduction Scratchpad Helpers //////////////////////////////////////////////////////////////////////////////// -DEVICE unsigned int *GetTeamsReductionTimestamp() { +unsigned int *GetTeamsReductionTimestamp() { return static_cast<unsigned int *>(ReductionScratchpadPtr); } -DEVICE char *GetTeamsReductionScratchpad() { +char *GetTeamsReductionScratchpad() { return static_cast<char *>(ReductionScratchpadPtr) + 256; } diff --git a/openmp/libomptarget/deviceRTLs/common/support.h 
b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -31,59 +31,59 @@ RuntimeMask = 0x02u, }; -DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); -DEVICE bool isGenericMode(); -DEVICE bool isSPMDMode(); -DEVICE bool isRuntimeUninitialized(); -DEVICE bool isRuntimeInitialized(); +void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); +bool isGenericMode(); +bool isSPMDMode(); +bool isRuntimeUninitialized(); +bool isRuntimeInitialized(); //////////////////////////////////////////////////////////////////////////////// // Execution Modes based on location parameter fields //////////////////////////////////////////////////////////////////////////////// -DEVICE bool checkSPMDMode(kmp_Ident *loc); -DEVICE bool checkGenericMode(kmp_Ident *loc); -DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc); -DEVICE bool checkRuntimeInitialized(kmp_Ident *loc); +bool checkSPMDMode(kmp_Ident *loc); +bool checkGenericMode(kmp_Ident *loc); +bool checkRuntimeUninitialized(kmp_Ident *loc); +bool checkRuntimeInitialized(kmp_Ident *loc); //////////////////////////////////////////////////////////////////////////////// // get info from machine //////////////////////////////////////////////////////////////////////////////// // get global ids to locate tread/team info (constant regardless of OMP) -DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); -DEVICE int GetMasterThreadID(); -DEVICE int GetNumberOfWorkersInTeam(); +int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); +int GetMasterThreadID(); +int GetNumberOfWorkersInTeam(); // get OpenMP thread and team ids -DEVICE int GetOmpThreadId(int threadId, - bool isSPMDExecutionMode); // omp_thread_num -DEVICE int GetOmpTeamId(); // omp_team_num +int GetOmpThreadId(int threadId, + bool isSPMDExecutionMode); // omp_thread_num +int GetOmpTeamId(); // omp_team_num // get OpenMP number of 
threads and team -DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads -DEVICE int GetNumberOfOmpTeams(); // omp_num_teams +int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads +int GetNumberOfOmpTeams(); // omp_num_teams // get OpenMP number of procs -DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); -DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); +int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); +int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); // masters -DEVICE int IsTeamMaster(int ompThreadId); +int IsTeamMaster(int ompThreadId); // Parallel level -DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); -DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); +void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); +void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); //////////////////////////////////////////////////////////////////////////////// // Memory //////////////////////////////////////////////////////////////////////////////// // safe alloc and free -DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success -DEVICE void *SafeFree(void *ptr, const char *msg); +void *SafeMalloc(size_t size, const char *msg); // check if success +void *SafeFree(void *ptr, const char *msg); // pad to a alignment (power of 2 only) -DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment); +unsigned long PadBytes(unsigned long size, unsigned long alignment); #define ADD_BYTES(_addr, _bytes) \ ((void *)((char *)((void *)(_addr)) + (_bytes))) #define SUB_BYTES(_addr, _bytes) \ @@ -92,7 +92,7 @@ //////////////////////////////////////////////////////////////////////////////// // Teams Reduction Scratchpad Helpers //////////////////////////////////////////////////////////////////////////////// -DEVICE unsigned int *GetTeamsReductionTimestamp(); -DEVICE char 
*GetTeamsReductionScratchpad(); +unsigned int *GetTeamsReductionTimestamp(); +char *GetTeamsReductionScratchpad(); #endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -18,8 +18,7 @@ #include "nvptx_interface.h" -#define DEVICE -#define INLINE inline __attribute__((always_inline)) DEVICE +#define INLINE inline __attribute__((always_inline)) #define NOINLINE __attribute__((noinline)) #define ALIGN(N) __attribute__((aligned(N))) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -97,39 +97,38 @@ EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } // Atomics -DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { +uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); } -DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { +uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { return __nvvm_atom_inc_gen_ui(Address, Val); } -DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { +uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST); } -DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { +uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { uint32_t R; __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); return R; } -DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, - uint32_t Val) { +uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) { (void)__atomic_compare_exchange(Address, 
&Compare, &Val, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); return Compare; } -DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, - unsigned long long Val) { +unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, + unsigned long long Val) { unsigned long long R; __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); return R; } -DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address, - unsigned long long Val) { +unsigned long long __kmpc_atomic_add(unsigned long long *Address, + unsigned long long Val) { return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); } diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h --- a/openmp/libomptarget/deviceRTLs/target_interface.h +++ b/openmp/libomptarget/deviceRTLs/target_interface.h @@ -25,16 +25,15 @@ EXTERN unsigned GetLaneId(); // Atomics -DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); -DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); -DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); -DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); -DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); +uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); +uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); +uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); +uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); +uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); -DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *, - unsigned long long); -DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *, - unsigned long long); +unsigned long long __kmpc_atomic_exchange(unsigned long long *, + unsigned long long); +unsigned long long __kmpc_atomic_add(unsigned long long *, unsigned long long); // Locks EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);