Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -369,96 +369,109 @@
   __threadfence_block();
 }
 
-// Called at the time of the kernel initialization. This is used to initilize
-// the list of references to shared variables and to pre-allocate global storage
-// for holding the globalized variables.
-//
-// By default the globalized variables are stored in global memory. If the
-// UseSharedMemory is set to true, the runtime will attempt to use shared memory
-// as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-    int16_t UseSharedMemory) {
+INLINE void* data_sharing_push_stack_common(size_t PushSize) {
   if (isRuntimeUninitialized()) {
     ASSERT0(LT_FUSSY, isSPMDMode(),
             "Expected SPMD mode with uninitialized runtime.");
-    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
   }
 
+  // Only warp active master threads manage the stack.
+  bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+
   // Add worst-case padding to DataSize so that future stack allocations are
   // correctly aligned.
   const size_t Alignment = 8;
-  if (DataSize % Alignment != 0) {
-    DataSize += (Alignment - DataSize % Alignment);
-  }
+  PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
 
   // Frame pointer must be visible to all workers in the same warp.
   unsigned WID = getWarpId();
+  // NOTE(review): FrameP is a non-volatile reference polled by the loop below.
   void *&FrameP = DataSharingState.FramePtr[WID];
 
-  // Only warp active master threads manage the stack.
-  if (getThreadId() % WARPSIZE == 0) {
-    // SlotP will point to either the shared memory slot or an existing
-    // global memory slot.
-    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
-    void *&StackP = DataSharingState.StackPtr[WID];
-
-    // Compute the total memory footprint of the requested data.
-    // The master thread requires a stack only for itself. A worker
-    // thread (which at this point is a warp master) will require
-    // space for the variables of each thread in the warp,
-    // i.e. one DataSize chunk per warp lane.
-    // TODO: change WARPSIZE to the number of active threads in the warp.
-    size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize;
+  do {
+    if (IsWarpMaster) {
+      // SlotP will point to either the shared memory slot or an existing
+      // global memory slot.
+      __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+      void *&StackP = DataSharingState.StackPtr[WID];
 
-    // Check if we have room for the data in the current slot.
-    const uintptr_t StartAddress = (uintptr_t)StackP;
-    const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
-    const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;
-
-    // If we requested more data than there is room for in the rest
-    // of the slot then we need to either re-use the next slot, if one exists,
-    // or create a new slot.
-    if (EndAddress < RequestedEndAddress) {
-      __kmpc_data_sharing_slot *NewSlot = 0;
-      size_t NewSize = PushSize;
-
-      // Allocate at least the default size for each type of slot.
-      // Master is a special case and even though there is only one thread,
-      // it can share more things with the workers. For uniformity, it uses
-      // the full size of a worker warp slot.
-      size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
-      if (DefaultSlotSize > NewSize)
-        NewSize = DefaultSlotSize;
-      NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
-          sizeof(__kmpc_data_sharing_slot) + NewSize,
-          "Global memory slot allocation.");
+      // Check if we have room for the data in the current slot.
+      const uintptr_t StartAddress = (uintptr_t)StackP;
+      const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
+      const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;
+
+      // If we requested more data than there is room for in the rest
+      // of the slot then we need to either re-use the next slot, if one exists,
+      // or create a new slot.
+      if (EndAddress < RequestedEndAddress) {
+        __kmpc_data_sharing_slot *NewSlot = 0;
+        size_t NewSize = PushSize;
+
+        // Allocate at least the default size for each type of slot.
+        // Master is a special case and even though there is only one thread,
+        // it can share more things with the workers. For uniformity, it uses
+        // the full size of a worker warp slot.
+        size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
+        if (DefaultSlotSize > NewSize)
+          NewSize = DefaultSlotSize;
+        NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
+            sizeof(__kmpc_data_sharing_slot) + NewSize,
+            "Global memory slot allocation.");
+
+        NewSlot->Next = 0;
+        NewSlot->Prev = SlotP;
+        NewSlot->PrevSlotStackPtr = StackP;
+        NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
+
+        // Make previous slot point to the newly allocated slot.
+        SlotP->Next = NewSlot;
+        // The current slot becomes the new slot.
+        SlotP = NewSlot;
+        // The stack pointer always points to the next free stack frame.
+        StackP = &NewSlot->Data[0] + PushSize;
+        // The frame pointer always points to the beginning of the frame.
+        FrameP = &NewSlot->Data[0];
+      } else {
+        // Add the data chunk to the current slot. The frame pointer is set to
+        // point to the start of the new frame held in StackP.
+        //atomicExch((unsigned long long *)&FrameP, (unsigned long long)StackP);
+        FrameP = StackP;
+        // Reset stack pointer to the requested address.
+        StackP = (void *)RequestedEndAddress;
+      }
+    }
+  } while (!FrameP);
 
-      NewSlot->Next = 0;
-      NewSlot->Prev = SlotP;
-      NewSlot->PrevSlotStackPtr = StackP;
-      NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
+  return FrameP;
+}
 
-      // Make previous slot point to the newly allocated slot.
-      SlotP->Next = NewSlot;
-      // The current slot becomes the new slot.
-      SlotP = NewSlot;
-      // The stack pointer always points to the next free stack frame.
-      StackP = &NewSlot->Data[0] + PushSize;
-      // The frame pointer always points to the beginning of the frame.
-      FrameP = &NewSlot->Data[0];
-    } else {
-      // Add the data chunk to the current slot. The frame pointer is set to
-      // point to the start of the new frame held in StackP.
-      FrameP = StackP;
-      // Reset stack pointer to the requested address.
-      StackP = (void *)RequestedEndAddress;
-    }
-  }
+EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+    int16_t UseSharedMemory) {
+  return data_sharing_push_stack_common(DataSize);
+}
 
-  __threadfence_block();
+// Called at the time of the kernel initialization. This is used to initialize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
+    int16_t UseSharedMemory) {
+  // Compute the total memory footprint of the requested data.
+  // If the runtime is uninitialized or the caller is the master thread, a
+  // single DataSize chunk is requested. Otherwise (a worker thread, which at
+  // this point is a warp master) space is requested for the variables of
+  // each thread in the warp, i.e. one DataSize chunk per warp lane.
+  // TODO: change WARPSIZE to the number of active threads in the warp.
+  size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ?
+      DataSize : WARPSIZE * DataSize;
 
   // Compute the start address of the frame of each thread in the warp.
-  uintptr_t FrameStartAddress = (uintptr_t)FrameP;
+  uintptr_t FrameStartAddress =
+      (uintptr_t) data_sharing_push_stack_common(PushSize);
   FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
   return (void *)FrameStartAddress;
 }
@@ -475,6 +488,8 @@
     return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
   }
 
+  __threadfence_block();
+
   if (getThreadId() % WARPSIZE == 0) {
     unsigned WID = getWarpId();
 
@@ -501,8 +516,6 @@
       SlotP->Next = 0;
     }
   }
-
-  __threadfence_block();
 }
 
 // Begin a data sharing context. Maintain a list of references to shared
Index: libomptarget/deviceRTLs/nvptx/src/interface.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/interface.h
+++ libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -478,6 +478,8 @@
 
 EXTERN void __kmpc_data_sharing_init_stack();
 EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+    int16_t UseSharedMemory);
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -40,8 +40,6 @@
 INLINE unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
-  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
-          "Expected number of SMs is less than reported.");
   return id;
 }
 
@@ -156,7 +154,6 @@
   //
   omptarget_nvptx_TaskDescr *newTaskDescr =
       omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
-  ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
   newTaskDescr->InitLevelOneTaskDescr(ThreadLimit,
                                       currTeamDescr.LevelZeroTaskDescr());
   newTaskDescr->ThreadLimit() = ThreadLimit;
Index: libomptarget/deviceRTLs/nvptx/src/supporti.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -188,7 +188,6 @@
 {
   void *ptr = malloc(size);
   PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
-  ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
   return ptr;
 }