Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -129,7 +129,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; int32_t &ActiveT = DataSharingState.ActiveThreads[WID]; DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); @@ -283,7 +283,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; SlotP = *SavedSharedSlot; StackP = *SavedSharedStack; @@ -321,7 +321,7 @@ DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID); - void *P = DataSharingState.FramePtr[SourceWID]; + void * volatile P = DataSharingState.FramePtr[SourceWID]; DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); return P; } @@ -369,96 +369,107 @@ __threadfence_block(); } -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. 
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, - int16_t UseSharedMemory) { +INLINE void* data_sharing_push_stack_common(size_t PushSize) { if (isRuntimeUninitialized()) { ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode with uninitialized runtime."); - return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize); + return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize); } + // Only warp active master threads manage the stack. + bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0; + // Add worst-case padding to DataSize so that future stack allocations are // correctly aligned. const size_t Alignment = 8; - if (DataSize % Alignment != 0) { - DataSize += (Alignment - DataSize % Alignment); - } + PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; // Frame pointer must be visible to all workers in the same warp. unsigned WID = getWarpId(); - void *&FrameP = DataSharingState.FramePtr[WID]; + void *volatile &FrameP = DataSharingState.FramePtr[WID]; - // Only warp active master threads manage the stack. - if (getThreadId() % WARPSIZE == 0) { - // SlotP will point to either the shared memory slot or an existing - // global memory slot. - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - - // Compute the total memory footprint of the requested data. - // The master thread requires a stack only for itself. A worker - // thread (which at this point is a warp master) will require - // space for the variables of each thread in the warp, - // i.e. one DataSize chunk per warp lane. - // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize; + do { + if (IsWarpMaster) { + // SlotP will point to either the shared memory slot or an existing + // global memory slot. 
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; - // Check if we have room for the data in the current slot. - const uintptr_t StartAddress = (uintptr_t)StackP; - const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; - - // If we requested more data than there is room for in the rest - // of the slot then we need to either re-use the next slot, if one exists, - // or create a new slot. - if (EndAddress < RequestedEndAddress) { - __kmpc_data_sharing_slot *NewSlot = 0; - size_t NewSize = PushSize; - - // Allocate at least the default size for each type of slot. - // Master is a special case and even though there is only one thread, - // it can share more things with the workers. For uniformity, it uses - // the full size of a worker warp slot. - size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; - if (DefaultSlotSize > NewSize) - NewSize = DefaultSlotSize; - NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Global memory slot allocation."); + // Check if we have room for the data in the current slot. + const uintptr_t StartAddress = (uintptr_t)StackP; + const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; + const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; + + // If we requested more data than there is room for in the rest + // of the slot then we need to either re-use the next slot, if one exists, + // or create a new slot. + if (EndAddress < RequestedEndAddress) { + __kmpc_data_sharing_slot *NewSlot = 0; + size_t NewSize = PushSize; + + // Allocate at least the default size for each type of slot. + // Master is a special case and even though there is only one thread, + // it can share more things with the workers. For uniformity, it uses + // the full size of a worker warp slot. 
+ size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; + if (DefaultSlotSize > NewSize) + NewSize = DefaultSlotSize; + NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc( + sizeof(__kmpc_data_sharing_slot) + NewSize, + "Global memory slot allocation."); + + NewSlot->Next = 0; + NewSlot->Prev = SlotP; + NewSlot->PrevSlotStackPtr = StackP; + NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; + + // Make previous slot point to the newly allocated slot. + SlotP->Next = NewSlot; + // The current slot becomes the new slot. + SlotP = NewSlot; + // The stack pointer always points to the next free stack frame. + StackP = &NewSlot->Data[0] + PushSize; + // The frame pointer always points to the beginning of the frame. + FrameP = &NewSlot->Data[0]; + } else { + // Add the data chunk to the current slot. The frame pointer is set to + // point to the start of the new frame held in StackP. + FrameP = StackP; + // Reset stack pointer to the requested address. + StackP = (void *)RequestedEndAddress; + } + } + } while (!FrameP); - NewSlot->Next = 0; - NewSlot->Prev = SlotP; - NewSlot->PrevSlotStackPtr = StackP; - NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; + return FrameP; +} - // Make previous slot point to the newly allocated slot. - SlotP->Next = NewSlot; - // The current slot becomes the new slot. - SlotP = NewSlot; - // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[0] + PushSize; - // The frame pointer always points to the beginning of the frame. - FrameP = &NewSlot->Data[0]; - } else { - // Add the data chunk to the current slot. The frame pointer is set to - // point to the start of the new frame held in StackP. - FrameP = StackP; - // Reset stack pointer to the requested address. 
- StackP = (void *)RequestedEndAddress; - } - } +EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize, + int16_t UseSharedMemory) { + return data_sharing_push_stack_common(DataSize); +} - __threadfence_block(); +// Called at the time of the kernel initialization. This is used to initialize +// the list of references to shared variables and to pre-allocate global storage +// for holding the globalized variables. +// +// By default the globalized variables are stored in global memory. If the +// UseSharedMemory is set to true, the runtime will attempt to use shared memory +// as long as the size requested fits the pre-allocated size. +EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, + int16_t UseSharedMemory) { + // Compute the total memory footprint of the requested data. + // The master thread requires a stack only for itself. A worker + // thread (which at this point is a warp master) will require + // space for the variables of each thread in the warp, + // i.e. one DataSize chunk per warp lane. + // TODO: change WARPSIZE to the number of active threads in the warp. + size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ? + DataSize : WARPSIZE * DataSize; // Compute the start address of the frame of each thread in the warp. - uintptr_t FrameStartAddress = (uintptr_t)FrameP; + uintptr_t FrameStartAddress = + (uintptr_t) data_sharing_push_stack_common(PushSize); FrameStartAddress += (uintptr_t) (getLaneId() * DataSize); return (void *)FrameStartAddress; } @@ -475,6 +486,8 @@ return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart); } + __threadfence_block(); + if (getThreadId() % WARPSIZE == 0) { unsigned WID = getWarpId(); @@ -501,8 +514,6 @@ SlotP->Next = 0; } } - - __threadfence_block(); } // Begin a data sharing context. 
Maintain a list of references to shared Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -478,6 +478,8 @@ EXTERN void __kmpc_data_sharing_init_stack(); EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, + int16_t UseSharedMemory); EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); EXTERN void __kmpc_data_sharing_pop_stack(void *a); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -123,7 +123,7 @@ struct DataSharingStateTy { __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; void *StackPtr[DS_Max_Warp_Number]; - void *FramePtr[DS_Max_Warp_Number]; + void * volatile FramePtr[DS_Max_Warp_Number]; int32_t ActiveThreads[DS_Max_Warp_Number]; }; // Additional worker slot type which is initialized with the default worker slot Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -40,8 +40,6 @@ INLINE unsigned smid() { unsigned id; asm("mov.u32 %0, %%smid;" : "=r"(id)); - ASSERT0(LT_FUSSY, nsmid() <= MAX_SM, - "Expected number of SMs is less than reported."); return id; } @@ -156,7 +154,6 @@ // omptarget_nvptx_TaskDescr *newTaskDescr = omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, currTeamDescr.LevelZeroTaskDescr()); 
newTaskDescr->ThreadLimit() = ThreadLimit; Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -188,7 +188,6 @@ { void *ptr = malloc(size); PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); - ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg); return ptr; }