@@ -341,7 +341,17 @@ EXTERN void __kmpc_data_sharing_init_stack() {
341
341
__kmpc_data_sharing_slot *RootS = teamDescr->RootS (WID);
342
342
343
343
DataSharingState.SlotPtr [WID] = RootS;
344
- DataSharingState.StackPtr [WID] = (void *)&RootS->Data [0 ];
344
+ DataSharingState.TailPtr [WID] = RootS;
345
+
346
+ // Initialize the stack pointer to be equal to the end of
347
+ // the shared memory slot. This way we ensure that the global
348
+ // version of the stack will be used.
349
+ // TODO: remove this:
350
+ DataSharingState.StackPtr [WID] = RootS->DataEnd ;
351
+
352
+ // TODO: When the use of shared memory is enabled we will have to
353
+ // initialize this with the start of the Data region like so:
354
+ // DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
345
355
346
356
// We initialize the list of references to arguments here.
347
357
omptarget_nvptx_globalArgs.Init ();
@@ -355,12 +365,8 @@ EXTERN void __kmpc_data_sharing_init_stack() {
355
365
// UseSharedMemory is set to true, the runtime will attempt to use shared memory
356
366
// as long as the size requested fits the pre-allocated size.
357
367
//
358
- // TODO: allow more than one push per slot to save on calls to malloc.
359
- // Currently there is only one slot for each push so the data size in the slot
360
- // is the same size as the size being requested.
361
- //
362
368
// Called by: master, TODO: call by workers
363
- EXTERN void * __kmpc_data_sharing_push_stack (size_t size ,
369
+ EXTERN void * __kmpc_data_sharing_push_stack (size_t DataSize ,
364
370
int16_t UseSharedMemory) {
365
371
// TODO: Add shared memory support. For now, use global memory only for
366
372
// storing the data sharing slots so ignore the pre-allocated
@@ -374,39 +380,85 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t size,
374
380
// global memory slot.
375
381
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr [WID];
376
382
__kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr [WID];
383
+ void *&StackP = DataSharingState.StackPtr [WID];
384
+ void *FrameP = 0 ;
377
385
378
- // The slot for holding the data we are pushing.
379
- __kmpc_data_sharing_slot *NewSlot = 0 ;
380
- size_t NewSize = size;
381
-
382
- // Check if there is a next slot.
383
- if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next ) {
384
- // Attempt to re-use an existing slot provided the data fits in the slot.
385
- // The leftover data space will not be used.
386
- ptrdiff_t ExistingSlotSize = (uintptr_t )ExistingSlot->DataEnd -
387
- (uintptr_t )(&ExistingSlot->Data [0 ]);
388
- if (ExistingSlotSize >= NewSize)
389
- NewSlot = ExistingSlot;
390
- else
391
- free (ExistingSlot);
392
- }
386
+ // Check if we have room for the data in the current slot.
387
+ const uintptr_t StartAddress = (uintptr_t )StackP;
388
+ const uintptr_t EndAddress = (uintptr_t )SlotP->DataEnd ;
389
+ const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t )DataSize;
393
390
394
- if (!NewSlot) {
395
- NewSlot = (__kmpc_data_sharing_slot *) malloc (
396
- sizeof (__kmpc_data_sharing_slot) + NewSize);
397
- NewSlot-> Next = 0 ;
398
- NewSlot-> Prev = SlotP ;
391
+ // If we requested more data than there is room for in the rest
392
+ // of the slot then we need to either re-use the next slot, if one exists,
393
+ // or create a new slot.
394
+ if (EndAddress < RequestedEndAddress) {
395
+ size_t NewSize = DataSize ;
399
396
400
- // This is the last slot, save it.
401
- TailSlotP = NewSlot;
402
- }
397
+ // The new or reused slot for holding the data being pushed.
398
+ __kmpc_data_sharing_slot *NewSlot = 0 ;
399
+
400
+ // Check if there is a next slot.
401
+ if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next ) {
402
+ // Attempt to reuse an existing slot provided the data fits in the slot.
403
+ // The leftover data space will not be used.
404
+ ptrdiff_t ExistingSlotSize = (uintptr_t )ExistingSlot->DataEnd -
405
+ (uintptr_t )(&ExistingSlot->Data [0 ]);
406
+
407
+ // Try to add the data in the next available slot. Search for a slot
408
+ // with enough space.
409
+ while (ExistingSlotSize < NewSize) {
410
+ SlotP->Next = ExistingSlot->Next ;
411
+ SlotP->Next ->Prev = ExistingSlot->Prev ;
412
+ free (ExistingSlot);
413
+ ExistingSlot = SlotP->Next ;
414
+ if (!ExistingSlot)
415
+ break ;
416
+ ExistingSlotSize = (uintptr_t )ExistingSlot->DataEnd -
417
+ (uintptr_t )(&ExistingSlot->Data [0 ]);
418
+ }
419
+
420
+ // Check if a slot has been found.
421
+ if (ExistingSlotSize >= NewSize) {
422
+ NewSlot = ExistingSlot;
423
+ NewSlot->PrevSlotStackPtr = StackP;
424
+ }
425
+ }
403
426
404
- NewSlot->DataEnd = &NewSlot->Data [NewSize];
427
+ if (!NewSlot) {
428
+ // Allocate at least the default size.
429
+ // TODO: generalize this for workers which need a larger data slot
430
+ // i.e. using DS_Worker_Warp_Slot_Size.
431
+ if (DS_Slot_Size > DataSize)
432
+ NewSize = DS_Slot_Size;
433
+ NewSlot = (__kmpc_data_sharing_slot *)malloc (
434
+ sizeof (__kmpc_data_sharing_slot) + NewSize);
435
+ NewSlot->Next = 0 ;
436
+ NewSlot->Prev = SlotP;
437
+ NewSlot->PrevSlotStackPtr = StackP;
438
+ NewSlot->DataEnd = &NewSlot->Data [NewSize];
405
439
406
- SlotP->Next = NewSlot;
407
- SlotP = NewSlot;
440
+ // Newly allocated slots are also tail slots.
441
+ TailSlotP = NewSlot;
442
+
443
+ // Make previous slot point to the newly allocated slot.
444
+ SlotP->Next = NewSlot;
445
+ }
446
+
447
+ // The current slot becomes the new slot.
448
+ SlotP = NewSlot;
449
+ // The stack pointer always points to the next free stack frame.
450
+ StackP = &NewSlot->Data [DataSize];
451
+ // The frame pointer always points to the beginning of the frame.
452
+ FrameP = &NewSlot->Data [0 ];
453
+ } else {
454
+ // Add the data chunk to the current slot. The frame pointer is set to
455
+ // point to the start of the new frame held in StackP.
456
+ FrameP = StackP;
457
+ // Reset stack pointer to the requested address.
458
+ StackP = (void *)RequestedEndAddress;
459
+ }
408
460
409
- return ( void *)&SlotP-> Data [ 0 ] ;
461
+ return FrameP ;
410
462
}
411
463
412
464
// TODO: add memory fence here when this function can be called by
@@ -422,26 +474,43 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t size,
422
474
// When the pop operation removes the last global memory slot,
423
475
// reclaim all outstanding global memory slots since it is
424
476
// likely we have reached the end of the kernel.
425
- EXTERN void __kmpc_data_sharing_pop_stack (void *a ) {
477
+ EXTERN void __kmpc_data_sharing_pop_stack (void *FrameStart ) {
426
478
if (IsMasterThread ()) {
427
479
unsigned WID = getWarpId ();
428
480
429
- __kmpc_data_sharing_slot *S = DataSharingState.SlotPtr [WID];
430
-
431
- if (S->Prev )
432
- S = S->Prev ;
481
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr [WID];
482
+ void *&StackP = DataSharingState.StackPtr [WID];
483
+
484
+ // If we try to pop the last frame of the current slot we need to
485
+ // move to the previous slot if there is one.
486
+ const uintptr_t StartAddress = (uintptr_t )FrameStart;
487
+ if (StartAddress == (uintptr_t )&SlotP->Data [0 ]) {
488
+ if (SlotP->Prev ) {
489
+ // Restore the stack pointer that was saved in PrevSlotStackPtr
490
+ // when this slot was entered. It marks the next free address in
491
+ // the previous slot, so it can be used to compute the remaining
492
+ // data space in that slot once it becomes the current slot.
493
+ StackP = SlotP->PrevSlotStackPtr ;
494
+ // Reset SlotP to previous slot.
495
+ SlotP = SlotP->Prev ;
496
+ }
433
497
434
- // If this will "pop" the last global memory node then it is likely
435
- // that we are at the end of the data sharing region and we can
436
- // de-allocate any existing global memory slots.
437
- if (!S ->Prev ) {
438
- __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr [WID];
498
+ // If this will "pop" the last global memory node then it is likely
499
+ // that we are at the end of the data sharing region and we can
500
+ // de-allocate any existing global memory slots.
501
+ if (!SlotP ->Prev ) {
502
+ __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr [WID];
439
503
440
- while (Tail && Tail->Prev ) {
441
- Tail = Tail->Prev ;
442
- free (Tail->Next );
443
- Tail->Next =0 ;
504
+ while (Tail && Tail->Prev ) {
505
+ Tail = Tail->Prev ;
506
+ free (Tail->Next );
507
+ Tail->Next =0 ;
508
+ }
444
509
}
510
+ } else {
511
+ // This is not the last frame popped from this slot.
512
+ // Reset StackP
513
+ StackP = FrameStart;
445
514
}
446
515
447
516
return ;
0 commit comments