diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -13,11 +13,6 @@ #include "target_impl.h" #include -// Warp ID in the CUDA block -INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; } -// Lane ID in the CUDA warp. -INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; } - // Return true if this is the first active thread in the warp. INLINE static bool IsWarpMasterActiveThread() { unsigned long long Mask = __kmpc_impl_activemask(); @@ -67,7 +62,7 @@ DSPRINT0(DSFLAG_INIT, "Entering __kmpc_initialize_data_sharing_environment\n"); - unsigned WID = getWarpId(); + unsigned WID = GetWarpId(); DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); omptarget_nvptx_TeamDescr *teamDescr = @@ -111,7 +106,7 @@ DSPRINT(DSFLAG, "Default Data Size %016llx\n", (unsigned long long)SharingDefaultDataSize); - unsigned WID = getWarpId(); + unsigned WID = GetWarpId(); __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask(); __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; @@ -231,7 +226,7 @@ DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n"); - unsigned WID = getWarpId(); + unsigned WID = GetWarpId(); if (IsEntryPoint) { if (IsWarpMasterActiveThread()) { @@ -359,7 +354,7 @@ // This function initializes the stack pointer with the pointer to the // statically allocated shared memory slots. The size of a shared memory // slot is pre-determined to be 256 bytes. - if (threadIdx.x == 0) + if (GetThreadIdInBlock() == 0) data_sharing_init_stack_common(); __threadfence_block(); @@ -377,7 +372,7 @@ PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; // Frame pointer must be visible to all workers in the same warp. - const unsigned WID = getWarpId(); + const unsigned WID = GetWarpId(); void *FrameP = 0; __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); @@ -467,7 +462,7 @@ // Compute the start address of the frame of each thread in the warp. uintptr_t FrameStartAddress = (uintptr_t) data_sharing_push_stack_common(PushSize); - FrameStartAddress += (uintptr_t) (getLaneId() * DataSize); + FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize); return (void *)FrameStartAddress; } @@ -482,7 +477,7 @@ __threadfence_block(); if (GetThreadIdInBlock() % WARPSIZE == 0) { - unsigned WID = getWarpId(); + unsigned WID = GetWarpId(); // Current slot __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h @@ -128,12 +128,12 @@ #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING #include -#include "target_impl.h" +#include "support.h" template NOINLINE static void log(const char *fmt, Arguments... parameters) { - printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), - (int)(threadIdx.x & 0x1F), parameters...); + printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), + (int)GetWarpId(), (int)GetLaneId, parameters...); } #endif @@ -144,9 +144,8 @@ NOINLINE static void check(bool cond, const char *fmt, Arguments... parameters) { if (!cond) - printf(fmt, (int)blockIdx.x, (int)threadIdx.x, - (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F), - parameters...); + printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), + (int)GetWarpId(), (int)GetLaneId, parameters...); assert(cond); } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -364,7 +364,7 @@ for (;;) { now = clock(); clock_t cycles = now > start ? now - start : now + (0xffffffff - start); - if (cycles >= __OMP_SPIN * blockIdx.x) { + if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { break; } } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu @@ -106,9 +106,9 @@ INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } -INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; } +INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } -INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); } +INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } //////////////////////////////////////////////////////////////////////////////// // @@ -124,7 +124,9 @@ // If NumThreads is 1024, master id is 992. // // Called in Generic Execution Mode only. -INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); } +INLINE int GetMasterThreadID() { + return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); +} // The last warp is reserved for the master; other warps are workers. // Called in Generic Execution Mode only.