diff --git a/openmp/libomptarget/deviceRTLs/common/allocator.h b/openmp/libomptarget/deviceRTLs/common/allocator.h
--- a/openmp/libomptarget/deviceRTLs/common/allocator.h
+++ b/openmp/libomptarget/deviceRTLs/common/allocator.h
@@ -39,6 +39,12 @@
 #define EXTERN_SHARED(NAME)                                                    \
   NAME;                                                                        \
   OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+
+// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
+// now that's not the case.
+#define THREAD_LOCAL(NAME)                                                     \
+  [[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
+
 #endif
 
 #endif // OMPTARGET_ALLOCATOR_H
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -132,8 +132,6 @@
   INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
   INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                    uint16_t tid, uint16_t tnum);
-  INLINE void SaveLoopData();
-  INLINE void RestoreLoopData() const;
 
 private:
   // bits for flags: (6 used, 2 free)
@@ -147,14 +145,6 @@
   static const uint8_t TaskDescr_IsParConstr = 0x20;
   static const uint8_t TaskDescr_InParL2P = 0x40;
 
-  struct SavedLoopDescr_items {
-    int64_t loopUpperBound;
-    int64_t nextLowerBound;
-    int64_t chunk;
-    int64_t stride;
-    kmp_sched_t schedule;
-  } loopData;
-
   struct TaskDescr_items {
     uint8_t flags; // 6 bit used (see flag above)
     uint8_t unused;
@@ -223,6 +213,7 @@
 // thread private data (struct of arrays for better coalescing)
 // tid refers here to the global thread id
 // do not support multiple concurrent kernel a this time
+
 class omptarget_nvptx_ThreadPrivateContext {
 public:
   // task
@@ -238,13 +229,6 @@
   INLINE uint16_t &NumThreadsForNextParallel(int tid) {
     return nextRegion.tnum[tid];
   }
-  // schedule (for dispatch)
-  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
-  INLINE int64_t &Chunk(int tid) { return chunk[tid];
} - INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } - INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } - INLINE int64_t &Stride(int tid) { return stride[tid]; } - INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } INLINE void InitThreadPrivateContext(int tid); @@ -263,12 +247,6 @@ uint16_t tnum[MAX_THREADS_PER_TEAM]; } nextRegion; // schedule (for dispatch) - kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for - int64_t chunk[MAX_THREADS_PER_TEAM]; - int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; - // state for dispatch with dyn/guided OR static (never use both at a time) - int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; - int64_t stride[MAX_THREADS_PER_TEAM]; uint64_t cnt; }; diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h @@ -116,30 +116,6 @@ items.threadId = tid; } -INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { - loopData.loopUpperBound = - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); - loopData.nextLowerBound = - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); - loopData.schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); - loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); - loopData.stride = - omptarget_nvptx_threadPrivateContext->Stride(items.threadId); -} - -INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { - omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = - loopData.loopUpperBound; - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = - loopData.nextLowerBound; - omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = - 
loopData.stride; - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = - loopData.schedule; -} - //////////////////////////////////////////////////////////////////////////////// // Thread Private Context //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -17,6 +17,15 @@ #include "target/shuffle.h" #include "target_impl.h" +struct DynamicScheduleTracker { + int64_t Chunk; + int64_t LoopUpperBound; + int64_t NextLowerBound; + int64_t Stride; + kmp_sched_t ScheduleType; + DynamicScheduleTracker *NextDST; +}; + //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // template class that encapsulate all the helper functions @@ -203,7 +212,7 @@ INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId, kmp_sched_t schedule, T lb, T ub, ST st, - ST chunk) { + ST chunk, DynamicScheduleTracker *DST) { if (checkRuntimeUninitialized(loc)) { // In SPMD mode no need to check parallelism level - dynamic scheduling // may appear only in L2 parallel regions with lightweight runtime. 
@@ -279,32 +288,29 @@ if (schedule == kmp_sched_static_chunk) { ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + DST->ScheduleType = schedule; // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + DST->LoopUpperBound = ub; // compute static chunk ST stride; int lastiter = 0; ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + DST->Chunk = chunk; + DST->NextLowerBound = lb; + DST->Stride = stride; PRINT(LD_LOOP, "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); + (int)tnum, DST->LoopUpperBound, + (unsigned long long)DST->NextLowerBound, + (unsigned long long)DST->Stride); } else if (schedule == kmp_sched_static_balanced_chunk) { ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + DST->ScheduleType = schedule; // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + DST->LoopUpperBound = ub; // compute static chunk ST stride; int lastiter = 0; @@ -319,49 +325,45 @@ if (ub > oldUb) ub = oldUb; // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + DST->Chunk = chunk; + DST->NextLowerBound = lb; + DST->Stride = stride; PRINT(LD_LOOP, "dispatch init (static chunk) : num threads 
= %d, ub = %" PRId64
           ", next lower bound = %llu, stride = %llu\n",
-          (int)tnum,
-          omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
-          (unsigned long long)
-              omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
-          (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
-              tid));
+          (int)tnum, DST->LoopUpperBound,
+          (unsigned long long)DST->NextLowerBound,
+          (unsigned long long)
+              DST->Stride);
   } else if (schedule == kmp_sched_static_nochunk) {
     ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
     // save sched state
-    omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+    DST->ScheduleType = schedule;
     // save ub
-    omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+    DST->LoopUpperBound = ub;
     // compute static chunk
     ST stride;
     int lastiter = 0;
     ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
     // save computed params
-    omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
-    omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
-    omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+    DST->Chunk = chunk;
+    DST->NextLowerBound = lb;
+    DST->Stride = stride;
     PRINT(LD_LOOP,
           "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
           ", next lower bound = %llu, stride = %llu\n",
-          (int)tnum,
-          omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
-          (unsigned long long)
-              omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
-          (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
-              tid));
+          (int)tnum, DST->LoopUpperBound,
+          (unsigned long long)DST->NextLowerBound,
+          (unsigned long long)
+              DST->Stride);
   } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
     // save data
-    omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+    DST->ScheduleType = schedule;
     if (chunk < 1)
       chunk = 1;
-    omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
-    omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
-    omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+    DST->Chunk = chunk;
+    DST->LoopUpperBound = 
ub; + DST->NextLowerBound = lb; __kmpc_barrier(loc, threadId); if (tid == 0) { omptarget_nvptx_threadPrivateContext->Cnt() = 0; @@ -371,11 +373,8 @@ PRINT(LD_LOOP, "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 ", chunk %" PRIu64 "\n", - (int)tnum, - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid)); + (int)tnum, (unsigned long long)DST->NextLowerBound, + DST->LoopUpperBound, DST->Chunk); } } @@ -440,7 +439,8 @@ } INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride) { + T *plower, T *pupper, ST *pstride, + DynamicScheduleTracker *DST) { if (checkRuntimeUninitialized(loc)) { // In SPMD mode no need to check parallelism level - dynamic scheduling // may appear only in L2 parallel regions with lightweight runtime. @@ -457,14 +457,13 @@ ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)), "current thread is not needed here; error"); // retrieve schedule - kmp_sched_t schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(tid); + kmp_sched_t schedule = DST->ScheduleType; // xxx reduce to one if (schedule == kmp_sched_static_chunk || schedule == kmp_sched_static_nochunk) { - T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); - T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); + T myLb = DST->NextLowerBound; + T ub = DST->LoopUpperBound; // finished? 
if (myLb > ub) { PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", @@ -472,7 +471,7 @@ return DISPATCH_FINISHED; } // not finished, save current bounds - ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); + ST chunk = DST->Chunk; *plower = myLb; T myUb = myLb + chunk - 1; // Clang uses i <= ub if (myUb > ub) @@ -481,8 +480,8 @@ *plast = (int32_t)(myUb == ub); // increment next lower bound by the stride - ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; + ST stride = DST->Stride; + DST->NextLowerBound = myLb + stride; PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", (long long)*plower, (long long)*pupper); return DISPATCH_NOTFINISHED; @@ -491,10 +490,8 @@ schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, "bad sched"); T myLb, myUb; - int finished = DynamicNextChunk( - myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound, + DST->LoopUpperBound); if (finished == FINISHED) return DISPATCH_FINISHED; @@ -527,89 +524,125 @@ // KMP interface implementation (dyn loops) //////////////////////////////////////////////////////////////////////////////// +// TODO: This is a stopgap. We probably want to expand the dispatch API to take +// an DST pointer which can then be allocated properly without malloc. +DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr); + +// Create a new DST, link the current one, and define the new as current. 
+static DynamicScheduleTracker *pushDST() {
+  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
+      SafeMalloc(sizeof(DynamicScheduleTracker), "new DST"));
+  *NewDST = DynamicScheduleTracker({0});
+  NewDST->NextDST = ThreadDSTPtr;
+  ThreadDSTPtr = NewDST;
+  return ThreadDSTPtr;
+}
+
+// Return the current DST.
+static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }
+
+// Pop the current DST and restore the last one.
+static void popDST() {
+  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
+  SafeFree(ThreadDSTPtr, "remove DST");
+  ThreadDSTPtr = OldDST;
+}
+
 // init
 EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid,
                                    int32_t schedule, int32_t lb, int32_t ub,
                                    int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
+  DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
 }
 
 EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid,
                                     int32_t schedule, uint32_t lb, uint32_t ub,
                                     int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
+  DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
 }
 
 EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid,
                                    int32_t schedule, int64_t lb, int64_t ub,
                                    int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
+  DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
 }
 
 EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid,
                                     int32_t schedule, uint64_t lb, uint64_t ub,
                                     int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
+  DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
 }
 
 // next
 EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                   int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+  DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
 }
 
 EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
                                    int32_t *p_last, uint32_t *p_lb,
                                    uint32_t *p_ub, int32_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+  DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
 }
 
 EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                   int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+  DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
 }
 
 EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
                                    int32_t *p_last, uint64_t *p_lb,
                                    uint64_t *p_ub, int64_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+  DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
 }
 
 // fini
 EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) {
   PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+  popDST();
 }
 
 EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) {
   PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+  popDST();
 }
 
 EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t 
tid) {
   PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+  popDST();
 }
 
 EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) {
   PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+  popDST();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -216,7 +216,6 @@
 
   // get current task
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
-  currTaskDescr->SaveLoopData();
 
   // allocate new task descriptor and copy value from current one, set prev to
   // it
@@ -256,7 +255,6 @@
   // free
   SafeFree(currTaskDescr, "new seq parallel task");
   currTaskDescr = getMyTopTaskDescriptor(threadId);
-  currTaskDescr->RestoreLoopData();
 }
 
 EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {