Skip to content

Commit f3de222

Browse files
committed Mar 15, 2018
[OpenMP][libomptarget] Enable multiple frames per global memory slot
Summary: To save on calls to malloc, this patch enables the re-use of pre-allocated global memory slots.
Reviewers: ABataev, grokos, carlo.bertolli, caomhin
Reviewed By: grokos
Subscribers: guansong, openmp-commits
Differential Revision: https://reviews.llvm.org/D44470
llvm-svn: 327637
1 parent 4f4bf7c commit f3de222

File tree

3 files changed

+121
-47
lines changed

3 files changed

+121
-47
lines changed
 

‎openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

Lines changed: 116 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,17 @@ EXTERN void __kmpc_data_sharing_init_stack() {
341341
__kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID);
342342

343343
DataSharingState.SlotPtr[WID] = RootS;
344-
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
344+
DataSharingState.TailPtr[WID] = RootS;
345+
346+
// Initialize the stack pointer to be equal to the end of
347+
// the shared memory slot. This way we ensure that the global
348+
// version of the stack will be used.
349+
// TODO: remove this:
350+
DataSharingState.StackPtr[WID] = RootS->DataEnd;
351+
352+
// TODO: When the use of shared memory is enabled we will have to
353+
// initialize this with the start of the Data region like so:
354+
// DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
345355

346356
// We initialize the list of references to arguments here.
347357
omptarget_nvptx_globalArgs.Init();
@@ -355,12 +365,8 @@ EXTERN void __kmpc_data_sharing_init_stack() {
355365
// UseSharedMemory is set to true, the runtime will attempt to use shared memory
356366
// as long as the size requested fits the pre-allocated size.
357367
//
358-
// TODO: allow more than one push per slot to save on calls to malloc.
359-
// Currently there is only one slot for each push so the data size in the slot
360-
// is the same size as the size being requested.
361-
//
362368
// Called by: master, TODO: call by workers
363-
EXTERN void* __kmpc_data_sharing_push_stack(size_t size,
369+
EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
364370
int16_t UseSharedMemory) {
365371
// TODO: Add shared memory support. For now, use global memory only for
366372
// storing the data sharing slots so ignore the pre-allocated
@@ -374,39 +380,85 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t size,
374380
// global memory slot.
375381
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
376382
__kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID];
383+
void *&StackP = DataSharingState.StackPtr[WID];
384+
void *FrameP = 0;
377385

378-
// The slot for holding the data we are pushing.
379-
__kmpc_data_sharing_slot *NewSlot = 0;
380-
size_t NewSize = size;
381-
382-
// Check if there is a next slot.
383-
if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
384-
// Attempt to re-use an existing slot provided the data fits in the slot.
385-
// The leftover data space will not be used.
386-
ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
387-
(uintptr_t)(&ExistingSlot->Data[0]);
388-
if (ExistingSlotSize >= NewSize)
389-
NewSlot = ExistingSlot;
390-
else
391-
free(ExistingSlot);
392-
}
386+
// Check if we have room for the data in the current slot.
387+
const uintptr_t StartAddress = (uintptr_t)StackP;
388+
const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
389+
const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)DataSize;
393390

394-
if (!NewSlot) {
395-
NewSlot = (__kmpc_data_sharing_slot *)malloc(
396-
sizeof(__kmpc_data_sharing_slot) + NewSize);
397-
NewSlot->Next = 0;
398-
NewSlot->Prev = SlotP;
391+
// If we requested more data than there is room for in the rest
392+
// of the slot then we need to either re-use the next slot, if one exists,
393+
// or create a new slot.
394+
if (EndAddress < RequestedEndAddress) {
395+
size_t NewSize = DataSize;
399396

400-
// This is the last slot, save it.
401-
TailSlotP = NewSlot;
402-
}
397+
// The new or reused slot for holding the data being pushed.
398+
__kmpc_data_sharing_slot *NewSlot = 0;
399+
400+
// Check if there is a next slot.
401+
if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
402+
// Attempt to reuse an existing slot provided the data fits in the slot.
403+
// The leftover data space will not be used.
404+
ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
405+
(uintptr_t)(&ExistingSlot->Data[0]);
406+
407+
// Try to add the data in the next available slot. Search for a slot
408+
// with enough space.
409+
while (ExistingSlotSize < NewSize) {
410+
SlotP->Next = ExistingSlot->Next;
411+
SlotP->Next->Prev = ExistingSlot->Prev;
412+
free(ExistingSlot);
413+
ExistingSlot = SlotP->Next;
414+
if (!ExistingSlot)
415+
break;
416+
ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
417+
(uintptr_t)(&ExistingSlot->Data[0]);
418+
}
419+
420+
// Check if a slot has been found.
421+
if (ExistingSlotSize >= NewSize) {
422+
NewSlot = ExistingSlot;
423+
NewSlot->PrevSlotStackPtr = StackP;
424+
}
425+
}
403426

404-
NewSlot->DataEnd = &NewSlot->Data[NewSize];
427+
if (!NewSlot) {
428+
// Allocate at least the default size.
429+
// TODO: generalize this for workers which need a larger data slot
430+
// i.e. using DS_Worker_Warp_Slot_Size.
431+
if (DS_Slot_Size > DataSize)
432+
NewSize = DS_Slot_Size;
433+
NewSlot = (__kmpc_data_sharing_slot *)malloc(
434+
sizeof(__kmpc_data_sharing_slot) + NewSize);
435+
NewSlot->Next = 0;
436+
NewSlot->Prev = SlotP;
437+
NewSlot->PrevSlotStackPtr = StackP;
438+
NewSlot->DataEnd = &NewSlot->Data[NewSize];
405439

406-
SlotP->Next = NewSlot;
407-
SlotP = NewSlot;
440+
// Newly allocated slots are also tail slots.
441+
TailSlotP = NewSlot;
442+
443+
// Make previous slot point to the newly allocated slot.
444+
SlotP->Next = NewSlot;
445+
}
446+
447+
// The current slot becomes the new slot.
448+
SlotP = NewSlot;
449+
// The stack pointer always points to the next free stack frame.
450+
StackP = &NewSlot->Data[DataSize];
451+
// The frame pointer always points to the beginning of the frame.
452+
FrameP = &NewSlot->Data[0];
453+
} else {
454+
// Add the data chunk to the current slot. The frame pointer is set to
455+
// point to the start of the new frame held in StackP.
456+
FrameP = StackP;
457+
// Reset stack pointer to the requested address.
458+
StackP = (void *)RequestedEndAddress;
459+
}
408460

409-
return (void*)&SlotP->Data[0];
461+
return FrameP;
410462
}
411463

412464
// TODO: add memory fence here when this function can be called by
@@ -422,26 +474,43 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t size,
422474
// When the pop operation removes the last global memory slot,
423475
// reclaim all outstanding global memory slots since it is
424476
// likely we have reached the end of the kernel.
425-
EXTERN void __kmpc_data_sharing_pop_stack(void *a) {
477+
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
426478
if (IsMasterThread()) {
427479
unsigned WID = getWarpId();
428480

429-
__kmpc_data_sharing_slot *S = DataSharingState.SlotPtr[WID];
430-
431-
if (S->Prev)
432-
S = S->Prev;
481+
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
482+
void *&StackP = DataSharingState.StackPtr[WID];
483+
484+
// If we try to pop the last frame of the current slot we need to
485+
// move to the previous slot if there is one.
486+
const uintptr_t StartAddress = (uintptr_t)FrameStart;
487+
if (StartAddress == (uintptr_t)&SlotP->Data[0]) {
488+
if (SlotP->Prev) {
489+
// The new stack pointer is the end of the data field of the
490+
// previous slot. This will allow the stack pointer to be
491+
// used in the computation of the remaining data space in
492+
// the current slot.
493+
StackP = SlotP->PrevSlotStackPtr;
494+
// Reset SlotP to previous slot.
495+
SlotP = SlotP->Prev;
496+
}
433497

434-
// If this will "pop" the last global memory node then it is likely
435-
// that we are at the end of the data sharing region and we can
436-
// de-allocate any existing global memory slots.
437-
if (!S->Prev) {
438-
__kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID];
498+
// If this will "pop" the last global memory node then it is likely
499+
// that we are at the end of the data sharing region and we can
500+
// de-allocate any existing global memory slots.
501+
if (!SlotP->Prev) {
502+
__kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID];
439503

440-
while(Tail && Tail->Prev) {
441-
Tail = Tail->Prev;
442-
free(Tail->Next);
443-
Tail->Next=0;
504+
while(Tail && Tail->Prev) {
505+
Tail = Tail->Prev;
506+
free(Tail->Next);
507+
Tail->Next=0;
508+
}
444509
}
510+
} else {
511+
// This is not the last frame popped from this slot.
512+
// Reset StackP
513+
StackP = FrameStart;
445514
}
446515

447516
return;

‎openmp/libomptarget/deviceRTLs/nvptx/src/interface.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
497497
struct __kmpc_data_sharing_slot {
498498
__kmpc_data_sharing_slot *Next;
499499
__kmpc_data_sharing_slot *Prev;
500+
void *PrevSlotStackPtr;
500501
void *DataEnd;
501502
char Data[];
502503
};

‎openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ struct DataSharingStateTy {
129129
struct __kmpc_data_sharing_worker_slot_static {
130130
__kmpc_data_sharing_slot *Next;
131131
__kmpc_data_sharing_slot *Prev;
132+
void *PrevSlotStackPtr;
132133
void *DataEnd;
133134
char Data[DS_Worker_Warp_Slot_Size];
134135
};
@@ -137,6 +138,7 @@ struct __kmpc_data_sharing_worker_slot_static {
137138
struct __kmpc_data_sharing_master_slot_static {
138139
__kmpc_data_sharing_slot *Next;
139140
__kmpc_data_sharing_slot *Prev;
141+
void *PrevSlotStackPtr;
140142
void *DataEnd;
141143
char Data[DS_Slot_Size];
142144
};
@@ -267,6 +269,7 @@ class omptarget_nvptx_TeamDescr {
267269
// We currently do not have a next slot.
268270
master_rootS[0].Next = 0;
269271
master_rootS[0].Prev = 0;
272+
master_rootS[0].PrevSlotStackPtr = 0;
270273
return (__kmpc_data_sharing_slot *)&master_rootS[0];
271274
}
272275
// Initialize the pointer to the end of the slot given the size of the data
@@ -276,6 +279,7 @@ class omptarget_nvptx_TeamDescr {
276279
// We currently do not have a next slot.
277280
worker_rootS[wid].Next = 0;
278281
worker_rootS[wid].Prev = 0;
282+
worker_rootS[wid].PrevSlotStackPtr = 0;
279283
return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
280284
}
281285

0 commit comments

Comments
 (0)
Please sign in to comment.