diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -207,7 +207,8 @@ int8_t __kmpc_is_spmd_exec_mode(); int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD, - bool UseGenericStateMachine, bool); + bool UseGenericStateMachine, + bool RequiresFullRuntime, uint32_t StackSize); void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool); diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h --- a/openmp/libomptarget/DeviceRTL/include/State.h +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -21,10 +21,8 @@ namespace state { -inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE; - /// Initialize the state machinery. Must be called by all threads. -void init(bool IsSPMD); +void init(bool IsSPMD, uint32_t StackSize); /// TODO enum ValueKind { diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp --- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -21,11 +21,11 @@ #pragma omp declare target -static void inititializeRuntime(bool IsSPMD) { +static void inititializeRuntime(bool IsSPMD, uint32_t StackSize) { // Order is important here. synchronize::init(IsSPMD); mapping::init(IsSPMD); - state::init(IsSPMD); + state::init(IsSPMD, StackSize); } /// Simple generic state machine for worker threads. @@ -65,12 +65,13 @@ /// \param Ident Source location identification, can be NULL. 
/// int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD, - bool UseGenericStateMachine, bool) { + bool UseGenericStateMachine, + bool RequiresFullRuntime, uint32_t StackSize) { if (IsSPMD) { - inititializeRuntime(/* IsSPMD */ true); + inititializeRuntime(/* IsSPMD */ true, StackSize); synchronize::threads(); } else { - inititializeRuntime(/* IsSPMD */ false); + inititializeRuntime(/* IsSPMD */ false, StackSize); // No need to wait since only the main threads will execute user // code and workers will run into a barrier right away. } diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -75,7 +75,7 @@ /// struct SharedMemorySmartStackTy { /// Initialize the stack. Must be called by all threads. - void init(bool IsSPMD); + void init(bool IsSPMD, uint32_t StackSize); /// Allocate \p Bytes on the stack for the encountering thread. Each thread /// can call this function. @@ -89,7 +89,7 @@ /// Compute the size of the storage space reserved for a thread. uint32_t computeThreadStorageTotal() { uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements(); - return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock), + return utils::align_down((SharedScratchpadSize / NumLanesInBlock), Alignment); } @@ -99,20 +99,25 @@ return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; } + /// The size of the shared scratchpad, initialized by a single thread. + uint32_t SharedScratchpadSize; + /// The actual storage, shared among all warps. 
- unsigned char Data[state::SharedScratchpadSize] - __attribute__((aligned(Alignment))); - unsigned char Usage[mapping::MaxThreadsPerTeam] - __attribute__((aligned(Alignment))); + unsigned char *Data __attribute__((aligned(Alignment))); + unsigned char *Usage __attribute__((aligned(Alignment))); }; -static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, - "Shared scratchpad of this size not supported yet."); - /// The allocation of a single shared memory scratchpad. static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack); -void SharedMemorySmartStackTy::init(bool IsSPMD) { +void SharedMemorySmartStackTy::init(bool IsSPMD, uint32_t StackSize) { + if (mapping::getThreadIdInBlock() == 0) { + SharedScratchpadSize = StackSize; + Data = static_cast<unsigned char *>(memory::getDynamicBuffer()) + + utils::align_up(config::getDynamicMemorySize(), Alignment); + Usage = Data + utils::align_up(StackSize, Alignment); + } + synchronize::threads(); Usage[mapping::getThreadIdInBlock()] = 0; } @@ -141,7 +146,7 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { uint64_t AlignedBytes = utils::align_up(Bytes, Alignment); - if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) { + if (Ptr >= &Data[0] && Ptr < &Data[SharedScratchpadSize]) { int TId = mapping::getThreadIdInBlock(); Usage[TId] -= AlignedBytes; return; } @@ -357,8 +362,9 @@ __builtin_unreachable(); } -void state::init(bool IsSPMD) { - SharedMemorySmartStack.init(IsSPMD); +void state::init(bool IsSPMD, uint32_t StackSize) { + if (StackSize > 0) + SharedMemorySmartStack.init(IsSPMD, StackSize); if (!mapping::getThreadIdInBlock()) TeamState.init(IsSPMD); diff --git a/openmp/libomptarget/deviceRTLs/common/include/target.h b/openmp/libomptarget/deviceRTLs/common/include/target.h --- a/openmp/libomptarget/deviceRTLs/common/include/target.h +++ b/openmp/libomptarget/deviceRTLs/common/include/target.h @@ -74,7 +74,7 @@ /// int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, bool 
UseGenericStateMachine, - bool RequiresFullRuntime); + bool RequiresFullRuntime, uint32_t); /// De-Initialization /// diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -204,7 +204,7 @@ EXTERN int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, bool UseGenericStateMachine, - bool RequiresFullRuntime) { + bool RequiresFullRuntime, uint32_t) { int TId = __kmpc_get_hardware_thread_id_in_block(); if (IsSPMD) __kmpc_spmd_kernel_init(RequiresFullRuntime); diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -418,7 +418,7 @@ // non standard EXTERN int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, bool UseGenericStateMachine, - bool RequiresFullRuntime); + bool RequiresFullRuntime, uint32_t); EXTERN void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD, bool RequiresFullRuntime); EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn); diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -92,8 +92,11 @@ /// Maximal number of threads per block for this kernel. int MaxThreadsPerBlock = 0; - KernelTy(CUfunction _Func, int8_t _ExecutionMode) - : Func(_Func), ExecutionMode(_ExecutionMode) {} + /// Amount of dynamic shared memory to allocate for the data sharing stack. 
+ uint32_t StackSize = 0; + + KernelTy(CUfunction _Func, int8_t _ExecutionMode, uint32_t _StackSize) + : Func(_Func), ExecutionMode(_ExecutionMode), StackSize(_StackSize) {} }; /// Device environment data @@ -902,7 +905,33 @@ ExecModeName); } - KernelsList.emplace_back(Func, ExecModeVal); + // Find the amount of dynamic shared memory to allocate for the data + // sharing stack in the new device RTL. + uint32_t StackSizeVal = 0; + std::string StackSizeNameStr(E->name); + StackSizeNameStr += "_stack_size"; + const char *StackSizeName = StackSizeNameStr.c_str(); + + CUdeviceptr StackSizePtr; + Err = cuModuleGetGlobal(&StackSizePtr, &CUSize, Module, StackSizeName); + if (Err == CUDA_SUCCESS) { + if (CUSize != sizeof(uint32_t)) { + DP("Loading global stack_size '%s' - size mismatch (%zd != %zd)\n", + StackSizeName, CUSize, sizeof(uint32_t)); + return nullptr; + } + + Err = cuMemcpyDtoH(&StackSizeVal, StackSizePtr, CUSize); + if (Err != CUDA_SUCCESS) { + REPORT("Error when copying data from device to host. Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", + DPxPTR(&StackSizeVal), DPxPTR(StackSizePtr), CUSize); + CUDA_ERR_STRING(Err); + return nullptr; + } + } + + KernelsList.emplace_back(Func, ExecModeVal, StackSizeVal); __tgt_offload_entry Entry = *E; Entry.addr = &KernelsList.back(); @@ -1197,10 +1226,24 @@ : "SPMD")); CUstream Stream = getStream(DeviceId, AsyncInfo); + + // Round up memory sizes to allow for proper pointer alignment. + auto RoundUp = [](uint32_t V, uint32_t Align) { + return ((V + uint32_t(Align) - 1) / uint32_t(Align)) * uint32_t(Align); + }; + + // Amount of shared memory to allocate, must contain enough for the data + // sharing stack. 
+ uint32_t DynamicSharedMemory = RoundUp(DynamicMemorySize, 8); + if (KernelInfo->StackSize > 0) + DynamicSharedMemory += + RoundUp(KernelInfo->StackSize, 8) + + RoundUp(CudaThreadsPerBlock, DeviceData[DeviceId].WarpSize); + Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, /* gridDimZ */ 1, CudaThreadsPerBlock, /* blockDimY */ 1, /* blockDimZ */ 1, - DynamicMemorySize, Stream, &Args[0], nullptr); + DynamicSharedMemory, Stream, &Args[0], nullptr); if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) return OFFLOAD_FAIL;