diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -32,6 +32,7 @@
     * ``LIBOMPTARGET_INFO=<Num>``
     * ``LIBOMPTARGET_HEAP_SIZE=<Num>``
     * ``LIBOMPTARGET_STACK_SIZE=<Num>``
+    * ``LIBOMPTARGET_SHARED_MEMORY_SIZE=<Num>``
 
 LIBOMPTARGET_DEBUG
 """"""""""""""""""
@@ -338,6 +339,14 @@
 for some applications that allocate too much memory either through the user or
 globalization.
 
+LIBOMPTARGET_SHARED_MEMORY_SIZE
+"""""""""""""""""""""""""""""""
+
+This environment variable sets the amount of dynamic shared memory in bytes used 
+by the kernel once it is launched. A pointer to the dynamic memory buffer can 
+currently only be accessed using the ``__kmpc_get_dynamic_shared`` device 
+runtime call.
+
 .. toctree::
    :hidden:
    :maxdepth: 1
diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h
--- a/openmp/libomptarget/DeviceRTL/include/Configuration.h
+++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -31,6 +31,9 @@
 /// Return the user choosen debug level.
 uint32_t getDebugLevel();
 
+/// Return the amount of dynamic shared memory that was allocated at launch.
+uint64_t getDynamicMemorySize();
+
 bool isDebugMode(DebugLevel Level);
 
 } // namespace config
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -174,6 +174,10 @@
 /// allocated by __kmpc_alloc_shared by the same thread.
 void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
 
+/// Get a pointer to the memory buffer containing dynamically allocated shared
+/// memory configured at launch.
+void *__kmpc_get_dynamic_shared();
+
 /// Allocate sufficient space for \p NumArgs sequential `void*` and store the
 /// allocation address in \p GlobalArgs.
 ///
diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h
--- a/openmp/libomptarget/DeviceRTL/include/State.h
+++ b/openmp/libomptarget/DeviceRTL/include/State.h
@@ -188,6 +188,9 @@
 /// Alloca \p Size bytes in global memory, if possible, for \p Reason.
 void *allocGlobal(uint64_t Size, const char *Reason);
 
+/// Return a pointer to the dynamic shared memory buffer.
+void *getDynamicBuffer();
+
 /// Free \p Ptr, alloated via allocGlobal, for \p Reason.
 void freeGlobal(void *Ptr, const char *Reason);
 
diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -21,6 +21,7 @@
   uint32_t DebugLevel;
   uint32_t NumDevices;
   uint32_t DeviceNum;
+  uint64_t DynamicMemSize;
 };
 
 #pragma omp declare target
@@ -43,6 +44,10 @@
   return omptarget_device_environment.DeviceNum;
 }
 
+uint64_t config::getDynamicMemorySize() {
+  return omptarget_device_environment.DynamicMemSize;
+}
+
 bool config::isDebugMode(config::DebugLevel Level) {
   return config::getDebugLevel() > Level;
 }
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -25,6 +25,13 @@
 ///
 ///{
 
+/// Add worst-case padding so that future allocations are properly aligned.
+constexpr const uint32_t Alignment = 8;
+
+/// External symbol to access dynamic shared memory.
+extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
+#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
+
 namespace {
 
 /// Fallback implementations are missing to trigger a link time error.
@@ -57,9 +64,6 @@
 #pragma omp end declare variant
 ///}
 
-/// Add worst-case padding so that future allocations are properly aligned.
-constexpr const uint32_t Alignment = 8;
-
 /// A "smart" stack in shared memory.
 ///
 /// The stack exposes a malloc/free interface but works like a stack internally.
@@ -147,6 +151,8 @@
 
 } // namespace
 
+void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
+
 void *memory::allocShared(uint64_t Bytes, const char *Reason) {
   return SharedMemorySmartStack.push(Bytes);
 }
@@ -497,6 +503,10 @@
   memory::freeShared(Ptr, Bytes, "Frontend free shared");
 }
 
+__attribute__((noinline)) void *__kmpc_get_dynamic_shared() {
+  return memory::getDynamicBuffer();
+}
+
 /// Allocate storage in shared memory to communicate arguments from the main
 /// thread to the workers in generic mode. If we exceed
 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -103,6 +103,7 @@
   int32_t debug_level;
   uint32_t num_devices;
   uint32_t device_num;
+  uint64_t dynamic_shared_size;
 };
 
 namespace {
@@ -346,6 +347,8 @@
   int EnvTeamThreadLimit;
   // OpenMP requires flags
   int64_t RequiresFlags;
+  // Amount of dynamic shared memory to use at launch.
+  uint64_t DynamicMemorySize;
 
   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
   static constexpr const int HardThreadLimit = 1024;
@@ -499,7 +502,8 @@
 
   DeviceRTLTy()
       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
-        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
+        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
+        DynamicMemorySize(0) {
 
     DP("Start initializing CUDA\n");
 
@@ -540,6 +544,11 @@
       EnvNumTeams = std::stoi(EnvStr);
       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
     }
+    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
+      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
+      DynamicMemorySize = std::stoi(EnvStr);
+      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE", DynamicMemorySize);
+    }
 
     StreamManager =
         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
@@ -904,7 +913,7 @@
       // TODO: The device ID used here is not the real device ID used by OpenMP.
       omptarget_device_environmentTy DeviceEnv{
           0, static_cast<uint32_t>(NumberOfDevices),
-          static_cast<uint32_t>(DeviceId)};
+          static_cast<uint32_t>(DeviceId), DynamicMemorySize};
 
 #ifdef OMPTARGET_DEBUG
       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
@@ -1190,7 +1199,7 @@
     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                          /* gridDimZ */ 1, CudaThreadsPerBlock,
                          /* blockDimY */ 1, /* blockDimZ */ 1,
-                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+                         DynamicMemorySize, Stream, &Args[0], nullptr);
     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
       return OFFLOAD_FAIL;
 
diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
@@ -0,0 +1,31 @@
+// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime
+// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=4 \
+// RUN:   %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+void *get_dynamic_shared() { return NULL; }
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+extern void *__kmpc_get_dynamic_shared();
+void *get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
+#pragma omp end declare variant
+
+int main() {
+  int x;
+#pragma omp target parallel map(from : x)
+  {
+    int *buf = get_dynamic_shared();
+#pragma omp barrier
+    if (omp_get_thread_num() == 0)
+      *buf = 1;
+#pragma omp barrier
+    if (omp_get_thread_num() == 1)
+      x = *buf;
+  }
+
+  // CHECK: PASS
+  if (x == 1)
+    printf("PASS\n");
+}