Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -19,6 +19,8 @@ __device__ static unsigned getThreadId() { return threadIdx.x; } // Warp ID in the CUDA block __device__ static unsigned getWarpId() { return threadIdx.x / WARPSIZE; } +// Lane ID in the CUDA warp. +__device__ static unsigned getLaneId() { return threadIdx.x % WARPSIZE; } // The CUDA thread ID of the master thread. __device__ static unsigned getMasterThreadId() { @@ -37,7 +39,10 @@ unsigned long long Mask = getActiveThreadsMask(); unsigned long long ShNum = WARPSIZE - (getThreadId() % WARPSIZE); unsigned long long Sh = Mask << ShNum; - return Sh == 0; + // Count the set bits in the 32 bit int version of Sh. If no bits + // are set then the thread calling this function is the active warp + // master thread. + return __popc(Sh) == 0; } // Return true if this is the master thread. __device__ static bool IsMasterThread() { @@ -358,7 +363,10 @@ // Called by: master, TODO: call by workers EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, int16_t UseSharedMemory) { - if (IsMasterThread()) { + // Only warp active master threads manage the stack. + void *FrameP = 0; + + if (IsWarpMasterActiveThread()) { unsigned WID = getWarpId(); // SlotP will point to either the shared memory slot or an existing @@ -366,18 +374,25 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *FrameP = 0; + + // Compute the total memory footprint of the requested data. + // The master thread requires a stack only for itself. A worker + // thread (which at this point is a warp master) will require + // space for the variables of each thread in the warp, + // i.e. one DataSize chunk per warp lane. + // TODO: change WARPSIZE to the number of active threads in the warp. + size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize; // Check if we have room for the data in the current slot. const uintptr_t StartAddress = (uintptr_t)StackP; const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)DataSize; + const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; // If we requested more data than there is room for in the rest // of the slot then we need to either re-use the next slot, if one exists, // or create a new slot. if (EndAddress < RequestedEndAddress) { - size_t NewSize = DataSize; + size_t NewSize = PushSize; // The new or reused slot for holding the data being pushed. __kmpc_data_sharing_slot *NewSlot = 0; @@ -412,7 +427,7 @@ // Allocate at least the default size. // TODO: generalize this for workers which need a larger data slot // i.e. using DS_Worker_Warp_Slot_Size. - if (DS_Slot_Size > DataSize) + if (DS_Slot_Size > NewSize) NewSize = DS_Slot_Size; NewSlot = (__kmpc_data_sharing_slot *)malloc( sizeof(__kmpc_data_sharing_slot) + NewSize); @@ -431,7 +446,7 @@ // The current slot becomes the new slot. SlotP = NewSlot; // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[DataSize]; + StackP = &NewSlot->Data[PushSize]; // The frame pointer always points to the beginning of the frame. FrameP = &NewSlot->Data[0]; } else { @@ -441,16 +456,14 @@ // Reset stack pointer to the requested address. StackP = (void *)RequestedEndAddress; } - - return FrameP; } - // TODO: add memory fence here when this function can be called by - // worker threads also. For now, this function is only called by the - // master thread of each team. + __threadfence_block(); - // TODO: implement sharing across workers. - return 0; + // Compute the start address of the frame of each thread in the warp. + uintptr_t FrameStartAddress = (uintptr_t)FrameP; + FrameStartAddress += (uintptr_t) (getLaneId() * DataSize); + return (void *)FrameStartAddress; } // Pop the stack and free any memory which can be reclaimed. @@ -459,7 +472,7 @@ // reclaim all outstanding global memory slots since it is // likely we have reached the end of the kernel. EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { - if (IsMasterThread()) { + if (IsWarpMasterActiveThread()) { unsigned WID = getWarpId(); __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; @@ -496,15 +509,9 @@ // Reset StackP StackP = FrameStart; } - - return; } - // TODO: add memory fence here when this function can be called by - // worker threads also. For now, this function is only called by the - // master thread of each team. - - // TODO: implement sharing across workers. + __threadfence_block(); } // Begin a data sharing context. Maintain a list of references to shared