Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -44,7 +44,7 @@ } // Return true if this is the master thread. __device__ static bool IsMasterThread() { - return getMasterThreadId() == getThreadId(); + return !isSPMDMode() && getMasterThreadId() == getThreadId(); } /// Return the provided size aligned to the size of a pointer. @@ -330,39 +330,40 @@ // Runtime functions for trunk data sharing scheme. //////////////////////////////////////////////////////////////////////////////// +INLINE void data_sharing_init_stack_common() { + omptarget_nvptx_TeamDescr *teamDescr = + &omptarget_nvptx_threadPrivateContext->TeamContext(); + + for (int WID = 0; WID < WARPSIZE; WID++) { + __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + } +} + // Initialize data sharing data structure. This function needs to be called // once at the beginning of a data sharing context (coincides with the kernel -// initialization). +// initialization). This function is called only by the MASTER thread of each +// team in non-SPMD mode. EXTERN void __kmpc_data_sharing_init_stack() { // This function initializes the stack pointer with the pointer to the // statically allocated shared memory slots. The size of a shared memory // slot is pre-determined to be 256 bytes. + data_sharing_init_stack_common(); + omptarget_nvptx_globalArgs.Init(); +} - // Initialize the data sharing structures. This section should only be - // executed by the warp active master threads. - if (IsWarpMasterActiveThread()) { - unsigned WID = getWarpId(); - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID, IsMasterThread()); - - // If a valid address has been returned then proceed with the initalization. - // Otherwise the initialization of the slot has already happened in a - // previous call to this function. - if (RootS) { - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.TailPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } - } +// Initialize data sharing data structure. This function needs to be called +// once at the beginning of a data sharing context (coincides with the kernel +// initialization). This function is called in SPMD mode only. +EXTERN void __kmpc_data_sharing_init_stack_spmd() { + // This function initializes the stack pointer with the pointer to the + // statically allocated shared memory slots. The size of a shared memory + // slot is pre-determined to be 256 bytes. + if (threadIdx.x == 0) + data_sharing_init_stack_common(); - // Currently we only support the sharing of variables between master and - // workers. The list of references to shared variables exists only for - // the master thread. - if (IsMasterThread()) { - // Initialize the list of references to arguments. - omptarget_nvptx_globalArgs.Init(); - } + __threadfence_block(); } // Called at the time of the kernel initialization. This is used to initilize @@ -372,8 +373,6 @@ // By default the globalized variables are stored in global memory. If the // UseSharedMemory is set to true, the runtime will attempt to use shared memory // as long as the size requested fits the pre-allocated size. -// -// Called by: master, TODO: call by workers EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, int16_t UseSharedMemory) { // Frame pointer must be visible to all workers in the same warp. @@ -385,7 +384,6 @@ // SlotP will point to either the shared memory slot or an existing // global memory slot. __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; // Compute the total memory footprint of the requested data. @@ -405,62 +403,31 @@ // of the slot then we need to either re-use the next slot, if one exists, // or create a new slot. if (EndAddress < RequestedEndAddress) { - size_t NewSize = PushSize; - - // The new or reused slot for holding the data being pushed. __kmpc_data_sharing_slot *NewSlot = 0; + size_t NewSize = PushSize; - // Check if there is a next slot. - if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { - // Attempt to reuse an existing slot provided the data fits in the slot. - // The leftover data space will not be used. - ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - - (uintptr_t)(&ExistingSlot->Data[0]); - - // Try to add the data in the next available slot. Search for a slot - // with enough space. - while (ExistingSlotSize < NewSize) { - SlotP->Next = ExistingSlot->Next; - SlotP->Next->Prev = ExistingSlot->Prev; - free(ExistingSlot); - ExistingSlot = SlotP->Next; - if (!ExistingSlot) - break; - ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - - (uintptr_t)(&ExistingSlot->Data[0]); - } - - // Check if a slot has been found. - if (ExistingSlotSize >= NewSize) { - NewSlot = ExistingSlot; - NewSlot->PrevSlotStackPtr = StackP; - } - } - - if (!NewSlot) { - // Allocate at least the default size for each type of slot. - size_t DefaultSlotSize = - IsMasterThread() ? DS_Slot_Size : DS_Worker_Warp_Slot_Size; - if (DefaultSlotSize > NewSize) - NewSize = DefaultSlotSize; - NewSlot = (__kmpc_data_sharing_slot *)malloc( - sizeof(__kmpc_data_sharing_slot) + NewSize); - NewSlot->Next = 0; - NewSlot->Prev = SlotP; - NewSlot->PrevSlotStackPtr = StackP; - NewSlot->DataEnd = &NewSlot->Data[NewSize]; - - // Newly allocated slots are also tail slots. - TailSlotP = NewSlot; + // Allocate at least the default size for each type of slot. + // Master is a special case and even though there is only one thread, + // it can share more things with the workers. For uniformity, it uses + // the full size of a worker warp slot. + size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; + if (DefaultSlotSize > NewSize) + NewSize = DefaultSlotSize; + NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc( + sizeof(__kmpc_data_sharing_slot) + NewSize, + "Global memory slot allocation."); - // Make previous slot point to the newly allocated slot. - SlotP->Next = NewSlot; - } + NewSlot->Next = 0; + NewSlot->Prev = SlotP; + NewSlot->PrevSlotStackPtr = StackP; + NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; + // Make previous slot point to the newly allocated slot. + SlotP->Next = NewSlot; // The current slot becomes the new slot. SlotP = NewSlot; // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[PushSize]; + StackP = &NewSlot->Data[0] + PushSize; // The frame pointer always points to the beginning of the frame. FrameP = &NewSlot->Data[0]; } else { @@ -489,37 +456,27 @@ if (IsWarpMasterActiveThread()) { unsigned WID = getWarpId(); + // Current slot __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + + // Pointer to next available stack. void *&StackP = DataSharingState.StackPtr[WID]; - // Pop current frame from slot. - StackP = FrameStart; + // If the current slot is empty, we need to free the slot after the + // pop. + bool SlotEmpty = (StackP == &SlotP->Data[0]); - // If we try to pop the last frame of the current slot we need to - // move to the previous slot if there is one. - const uintptr_t StartAddress = (uintptr_t)FrameStart; - if (StartAddress == (uintptr_t)&SlotP->Data[0]) { - if (SlotP->Prev) { - // The new stack pointer is the end of the data field of the - // previous slot. This will allow the stack pointer to be - // used in the computation of the remaining data space in - // the current slot. - StackP = SlotP->PrevSlotStackPtr; - // Reset SlotP to previous slot. - SlotP = SlotP->Prev; - } + // Pop the frame. + StackP = FrameStart; - // If this will "pop" the last global memory node then it is likely - // that we are at the end of the data sharing region and we can - // de-allocate any existing global memory slots. - if (!SlotP->Prev) { - __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID]; - while(Tail->Prev) { - Tail = Tail->Prev; - free(Tail->Next); - } - Tail->Next=0; - } + if (SlotEmpty && SlotP->Prev) { + // Before removing the slot we need to reset StackP. + StackP = SlotP->PrevSlotStackPtr; + + // Remove the slot. + SlotP = SlotP->Prev; + SafeFree(SlotP->Next, "Free slot."); + SlotP->Next = 0; } } Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h @@ -485,6 +485,7 @@ EXTERN void __kmpc_data_sharing_init_stack(); +EXTERN void __kmpc_data_sharing_init_stack_spmd(); EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); EXTERN void __kmpc_data_sharing_pop_stack(void *a); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -122,7 +122,6 @@ struct DataSharingStateTy { __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; void *StackPtr[DS_Max_Warp_Number]; - __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number]; void *FramePtr[DS_Max_Warp_Number]; int32_t ActiveThreads[DS_Max_Warp_Number]; }; @@ -302,6 +301,16 @@ return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; } + INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) { + worker_rootS[wid].DataEnd = + &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; + // We currently do not have a next slot. + worker_rootS[wid].Next = 0; + worker_rootS[wid].Prev = 0; + worker_rootS[wid].PrevSlotStackPtr = 0; + return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; + } + private: omptarget_nvptx_TaskDescr levelZeroTaskDescr; // icv for team master initial thread @@ -311,7 +320,7 @@ uint64_t lastprivateIterBuffer; __align__(16) - __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE - 1]; + __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; };