diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -55,7 +55,6 @@ src/omptarget-nvptx.cu src/parallel.cu src/reduction.cu - src/support.cu src/sync.cu src/task.cu ) @@ -89,7 +88,7 @@ set(BUILD_SHARED_LIBS OFF) set(CUDA_SEPARABLE_COMPILATION ON) list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory}) - cuda_add_library(omptarget-nvptx STATIC unity.cu + cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) # Install device RTL under the lib destination folder. diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -13,6 +13,11 @@ #include "target_impl.h" #include +// Warp ID in the CUDA block +INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; } +// Lane ID in the CUDA warp. +INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; } + // Return true if this is the first active thread in the warp. INLINE static bool IsWarpMasterActiveThread() { unsigned long long Mask = __kmpc_impl_activemask(); @@ -62,7 +67,7 @@ DSPRINT0(DSFLAG_INIT, "Entering __kmpc_initialize_data_sharing_environment\n"); - unsigned WID = GetWarpId(); + unsigned WID = getWarpId(); DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); omptarget_nvptx_TeamDescr *teamDescr = @@ -106,7 +111,7 @@ DSPRINT(DSFLAG, "Default Data Size %016llx\n", (unsigned long long)SharingDefaultDataSize); - unsigned WID = GetWarpId(); + unsigned WID = getWarpId(); __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask(); __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; @@ -226,7 +231,7 @@ DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n"); - unsigned WID = GetWarpId(); + unsigned WID = getWarpId(); if (IsEntryPoint) { if (IsWarpMasterActiveThread()) { @@ -354,7 +359,7 @@ // This function initializes the stack pointer with the pointer to the // statically allocated shared memory slots. The size of a shared memory // slot is pre-determined to be 256 bytes. - if (GetThreadIdInBlock() == 0) + if (threadIdx.x == 0) data_sharing_init_stack_common(); __threadfence_block(); @@ -372,7 +377,7 @@ PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; // Frame pointer must be visible to all workers in the same warp. - const unsigned WID = GetWarpId(); + const unsigned WID = getWarpId(); void *FrameP = 0; __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); @@ -462,7 +467,7 @@ // Compute the start address of the frame of each thread in the warp. uintptr_t FrameStartAddress = (uintptr_t) data_sharing_push_stack_common(PushSize); - FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize); + FrameStartAddress += (uintptr_t) (getLaneId() * DataSize); return (void *)FrameStartAddress; } @@ -477,7 +482,7 @@ __threadfence_block(); if (GetThreadIdInBlock() % WARPSIZE == 0) { - unsigned WID = GetWarpId(); + unsigned WID = getWarpId(); // Current slot __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h @@ -128,12 +128,12 @@ #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING #include -#include "support.h" +#include "target_impl.h" template NOINLINE static void log(const char *fmt, Arguments... parameters) { - printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), - (int)GetWarpId(), (int)GetLaneId(), parameters...); + printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), + (int)(threadIdx.x & 0x1F), parameters...); } #endif @@ -144,8 +144,9 @@ NOINLINE static void check(bool cond, const char *fmt, Arguments... parameters) { if (!cond) - printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), - (int)GetWarpId(), (int)GetLaneId(), parameters...); + printf(fmt, (int)blockIdx.x, (int)threadIdx.x, + (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F), + parameters...); assert(cond); } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -364,7 +364,7 @@ for (;;) { now = clock(); clock_t cycles = now > start ? now - start : now + (0xffffffff - start); - if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { + if (cycles >= __OMP_SPIN * blockIdx.x) { break; } } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -385,5 +385,6 @@ //////////////////////////////////////////////////////////////////////////////// #include "omptarget-nvptxi.h" +#include "supporti.h" #endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h @@ -10,12 +10,7 @@ // //===----------------------------------------------------------------------===// -#ifndef OMPTARGET_SUPPORT_H -#define OMPTARGET_SUPPORT_H - -#include "interface.h" #include "target_impl.h" - //////////////////////////////////////////////////////////////////////////////// // Execution Parameters //////////////////////////////////////////////////////////////////////////////// @@ -31,70 +26,58 @@ RuntimeMask = 0x02u, }; -DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); -DEVICE bool isGenericMode(); -DEVICE bool isSPMDMode(); -DEVICE bool isRuntimeUninitialized(); -DEVICE bool isRuntimeInitialized(); - -//////////////////////////////////////////////////////////////////////////////// -// Execution Modes based on location parameter fields -//////////////////////////////////////////////////////////////////////////////// - -DEVICE bool checkSPMDMode(kmp_Ident *loc); - -DEVICE bool checkGenericMode(kmp_Ident *loc); - -DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc); - -DEVICE bool checkRuntimeInitialized(kmp_Ident *loc); +INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); +INLINE bool isGenericMode(); +INLINE bool isSPMDMode(); +INLINE bool isRuntimeUninitialized(); +INLINE bool isRuntimeInitialized(); //////////////////////////////////////////////////////////////////////////////// // get info from machine //////////////////////////////////////////////////////////////////////////////// // get low level ids of resources -DEVICE int GetThreadIdInBlock(); -DEVICE int GetBlockIdInKernel(); -DEVICE int GetNumberOfBlocksInKernel(); -DEVICE int GetNumberOfThreadsInBlock(); -DEVICE unsigned GetWarpId(); -DEVICE unsigned GetLaneId(); +INLINE int GetThreadIdInBlock(); +INLINE int GetBlockIdInKernel(); +INLINE int GetNumberOfBlocksInKernel(); +INLINE int GetNumberOfThreadsInBlock(); +INLINE unsigned GetWarpId(); +INLINE unsigned GetLaneId(); // get global ids to locate tread/team info (constant regardless of OMP) -DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); -DEVICE int GetMasterThreadID(); -DEVICE int GetNumberOfWorkersInTeam(); +INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); +INLINE int GetMasterThreadID(); +INLINE int GetNumberOfWorkersInTeam(); // get OpenMP thread and team ids -DEVICE int GetOmpThreadId(int threadId, +INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode); // omp_thread_num -DEVICE int GetOmpTeamId(); // omp_team_num +INLINE int GetOmpTeamId(); // omp_team_num // get OpenMP number of threads and team -DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads -DEVICE int GetNumberOfOmpTeams(); // omp_num_teams +INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads +INLINE int GetNumberOfOmpTeams(); // omp_num_teams // get OpenMP number of procs -DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); -DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); +INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); +INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); // masters -DEVICE int IsTeamMaster(int ompThreadId); +INLINE int IsTeamMaster(int ompThreadId); // Parallel level -DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); -DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); +INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); +INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); //////////////////////////////////////////////////////////////////////////////// // Memory //////////////////////////////////////////////////////////////////////////////// // safe alloc and free -DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success -DEVICE void *SafeFree(void *ptr, const char *msg); +INLINE void *SafeMalloc(size_t size, const char *msg); // check if success +INLINE void *SafeFree(void *ptr, const char *msg); // pad to a alignment (power of 2 only) -DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment); +INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment); #define ADD_BYTES(_addr, _bytes) \ ((void *)((char *)((void *)(_addr)) + (_bytes))) #define SUB_BYTES(_addr, _bytes) \ @@ -103,8 +86,6 @@ //////////////////////////////////////////////////////////////////////////////// // Teams Reduction Scratchpad Helpers //////////////////////////////////////////////////////////////////////////////// -DEVICE unsigned int *GetTeamsReductionTimestamp(); -DEVICE char *GetTeamsReductionScratchpad(); -DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr); - -#endif +INLINE unsigned int *GetTeamsReductionTimestamp(); +INLINE char *GetTeamsReductionScratchpad(); +INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h rename from openmp/libomptarget/deviceRTLs/nvptx/src/support.cu rename to openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -1,4 +1,4 @@ -//===--------- support.cu - NVPTX OpenMP support functions ------- CUDA -*-===// +//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,14 +10,12 @@ // //===----------------------------------------------------------------------===// -#include "support.h" -#include "debug.h" -#include "omptarget-nvptx.h" - //////////////////////////////////////////////////////////////////////////////// // Execution Parameters //////////////////////////////////////////////////////////////////////////////// +#include "target_impl.h" + INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { execution_param = EMode; execution_param |= RMode; @@ -106,9 +104,9 @@ INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } -INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; } -INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } +INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); } //////////////////////////////////////////////////////////////////////////////// // @@ -124,9 +122,7 @@ // If NumThreads is 1024, master id is 992. // // Called in Generic Execution Mode only. -INLINE int GetMasterThreadID() { - return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); -} +INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); } // The last warp is reserved for the master; other warps are workers. // Called in Generic Execution Mode only. diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -15,9 +15,8 @@ #include #include "nvptx_interface.h" -#define DEVICE __device__ -#define INLINE __forceinline__ DEVICE -#define NOINLINE __noinline__ DEVICE +#define INLINE __forceinline__ __device__ +#define NOINLINE __noinline__ __device__ //////////////////////////////////////////////////////////////////////////////// // Kernel options diff --git a/openmp/libomptarget/deviceRTLs/nvptx/unity.cu b/openmp/libomptarget/deviceRTLs/nvptx/unity.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/unity.cu +++ /dev/null @@ -1,26 +0,0 @@ -//===------ unity.cu - Unity build of NVPTX deviceRTL ------------ CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Support compilers, specifically NVCC, which have not implemented link time -// optimisation. This removes the runtime cost of moving inline functions into -// source files in exchange for preventing efficient incremental builds. -// -//===----------------------------------------------------------------------===// - -#include "src/cancel.cu" -#include "src/critical.cu" -#include "src/data_sharing.cu" -#include "src/libcall.cu" -#include "src/loop.cu" -#include "src/omp_data.cu" -#include "src/omptarget-nvptx.cu" -#include "src/parallel.cu" -#include "src/reduction.cu" -#include "src/support.cu" -#include "src/sync.cu" -#include "src/task.cu"