diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1548,7 +1548,7 @@
   kmp_int32 tc;
   kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put
                                      after ub */
-
+  kmp_lock_t *th_steal_lock; // lock used for chunk stealing
   // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on )
   //    a) parm3 is properly aligned and
   //    b) all parm1-4 are in the same cache line.
@@ -1581,7 +1581,7 @@
   kmp_int64 tc; /* trip count (number of iterations) */
   kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put
                                      after ub */
-
+  kmp_lock_t *th_steal_lock; // lock used for chunk stealing
   /* parm[1-4] are used in different ways by different scheduling algorithms */
 
   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
@@ -1722,11 +1722,7 @@
   kmp_int32 th_disp_index;
   kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index
   volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags
-  union { // we can use union here because doacross cannot be used in
-    // nonmonotonic loops
-    kmp_int64 *th_doacross_info; // info on loop bounds
-    kmp_lock_t *th_steal_lock; // lock used for chunk stealing (8-byte variable)
-  };
+  kmp_int64 *th_doacross_info; // info on loop bounds
 #if KMP_USE_INTERNODE_ALIGNMENT
   char more_padding[INTERNODE_CACHE_LINE];
 #endif
diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h
--- a/openmp/runtime/src/kmp_dispatch.h
+++ b/openmp/runtime/src/kmp_dispatch.h
@@ -75,7 +75,7 @@
   ST st; // signed
   UT tc; // unsigned
   T static_steal_counter; // for static_steal only; maybe better to put after ub
-
+  kmp_lock_t *th_steal_lock; // lock used for chunk stealing
   /* parm[1-4] are used in different ways by different scheduling algorithms */
 
   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -372,10 +372,10 @@
         // before spending time on this).
         // For now use dynamically allocated per-thread lock,
         // free memory in __kmp_dispatch_next when status==0.
-        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
-        th->th.th_dispatch->th_steal_lock =
+        KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
+        pr->u.p.th_steal_lock =
             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
-        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
+        __kmp_init_lock(pr->u.p.th_steal_lock);
       }
       break;
     } else {
@@ -968,7 +968,7 @@
   // all parm3 will be the same, it still exists a bad case like using 0 and 1
   // rather than program life-time increment. So the dedicated variable is
   // required. The 'static_steal_counter' is used.
-  if (schedule == kmp_sch_static_steal) {
+  if (pr->schedule == kmp_sch_static_steal) {
     // Other threads will inspect this variable when searching for a victim.
     // This is a flag showing that other threads may steal from this thread
     // since then.
@@ -1195,7 +1195,7 @@
     if (traits_t<T>::type_size > 4) {
       // use lock for 8-byte and CAS for 4-byte induction
       // variable. TODO (optional): check and use 16-byte CAS
-      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
+      kmp_lock_t *lck = pr->u.p.th_steal_lock;
       KMP_DEBUG_ASSERT(lck != NULL);
       if (pr->u.p.count < (UT)pr->u.p.ub) {
         __kmp_acquire_lock(lck, gtid);
@@ -1210,37 +1210,38 @@
         kmp_info_t **other_threads = team->t.t_threads;
         int while_limit = pr->u.p.parm3;
         int while_index = 0;
+        T id = pr->u.p.static_steal_counter; // loop id
+        int idx = (th->th.th_dispatch->th_disp_index - 1) %
+                  __kmp_dispatch_num_buffers; // current loop index
+        // note: victim thread can potentially execute another loop
         // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
         while ((!status) && (while_limit != ++while_index)) {
+          dispatch_private_info_template<T> *victim;
           T remaining;
           T victimIdx = pr->u.p.parm4;
           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
-          dispatch_private_info_template<T> *victim =
-              reinterpret_cast<dispatch_private_info_template<T> *>(
-                  other_threads[victimIdx]
-                      ->th.th_dispatch->th_dispatch_pr_current);
-          while ((victim == NULL || victim == pr ||
-                  (*(volatile T *)&victim->u.p.static_steal_counter !=
-                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
+          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
+              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
+          KMP_DEBUG_ASSERT(victim);
+          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                  oldVictimIdx != victimIdx) {
             victimIdx = (victimIdx + 1) % nproc;
             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
-                other_threads[victimIdx]
-                    ->th.th_dispatch->th_dispatch_pr_current);
+                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
+            KMP_DEBUG_ASSERT(victim);
           }
-          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
-                          *(volatile T *)&pr->u.p.static_steal_counter)) {
+          if (victim == pr || id != victim->u.p.static_steal_counter) {
             continue; // try once more (nproc attempts in total)
             // no victim is ready yet to participate in stealing
-            // because all victims are still in kmp_init_dispatch
+            // because no victim passed kmp_init_dispatch yet
           }
           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
             continue; // not enough chunks to steal, goto next victim
           }
-          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
+          lck = victim->u.p.th_steal_lock;
           KMP_ASSERT(lck != NULL);
           __kmp_acquire_lock(lck, gtid);
           limit = victim->u.p.ub; // keep initial ub
@@ -1268,10 +1269,10 @@
           status = 1;
           while_index = 0;
           // now update own count and ub with stolen range but init chunk
-          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
+          __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
           pr->u.p.count = init + 1;
           pr->u.p.ub = limit;
-          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
+          __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
         } // while (search for victim)
       } // if (try to find victim and steal)
     } else {
@@ -1308,32 +1309,32 @@
         kmp_info_t **other_threads = team->t.t_threads;
         int while_limit = pr->u.p.parm3;
         int while_index = 0;
-
+        T id = pr->u.p.static_steal_counter; // loop id
+        int idx = (th->th.th_dispatch->th_disp_index - 1) %
+                  __kmp_dispatch_num_buffers; // current loop index
+        // note: victim thread can potentially execute another loop
         // TODO: algorithm of searching for a victim
         // should be cleaned up and measured
         while ((!status) && (while_limit != ++while_index)) {
+          dispatch_private_info_template<T> *victim;
           union_i4 vold, vnew;
           kmp_int32 remaining;
           T victimIdx = pr->u.p.parm4;
           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
-          dispatch_private_info_template<T> *victim =
-              reinterpret_cast<dispatch_private_info_template<T> *>(
-                  other_threads[victimIdx]
-                      ->th.th_dispatch->th_dispatch_pr_current);
-          while ((victim == NULL || victim == pr ||
-                  (*(volatile T *)&victim->u.p.static_steal_counter !=
-                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
+          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
+              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
+          KMP_DEBUG_ASSERT(victim);
+          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                  oldVictimIdx != victimIdx) {
             victimIdx = (victimIdx + 1) % nproc;
             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
-                other_threads[victimIdx]
-                    ->th.th_dispatch->th_dispatch_pr_current);
+                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
+            KMP_DEBUG_ASSERT(victim);
           }
-          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
-                          *(volatile T *)&pr->u.p.static_steal_counter)) {
+          if (victim == pr || id != victim->u.p.static_steal_counter) {
             continue; // try once more (nproc attempts in total)
             // no victim is ready yet to participate in stealing
-            // because all victims are still in kmp_init_dispatch
+            // because no victim passed kmp_init_dispatch yet
           }
           pr->u.p.parm4 = victimIdx; // new victim found
           while (1) { // CAS loop if victim has enough chunks to steal
@@ -2068,14 +2069,19 @@
       if (pr->schedule == kmp_sch_static_steal &&
           traits_t<T>::type_size > 4) {
         int i;
+        int idx = (th->th.th_dispatch->th_disp_index - 1) %
+                  __kmp_dispatch_num_buffers; // current loop index
         kmp_info_t **other_threads = team->t.t_threads;
         // loop complete, safe to destroy locks used for stealing
         for (i = 0; i < th->th.th_team_nproc; ++i) {
-          kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
+          dispatch_private_info_template<T> *buf =
+              reinterpret_cast<dispatch_private_info_template<T> *>(
+                  &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
+          kmp_lock_t *lck = buf->u.p.th_steal_lock;
           KMP_ASSERT(lck != NULL);
           __kmp_destroy_lock(lck);
           __kmp_free(lck);
-          other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
+          buf->u.p.th_steal_lock = NULL;
         }
       }
 #endif
diff --git a/openmp/runtime/test/worksharing/for/omp_nonmonotonic_nowait.c b/openmp/runtime/test/worksharing/for/omp_nonmonotonic_nowait.c
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_nonmonotonic_nowait.c
@@ -0,0 +1,34 @@
+// RUN: %libomp-compile-and-run
+
+// The test checks nonmonotonic scheduling works correctly when threads
+// may execute different loops concurrently.
+
+#include <stdio.h>
+#include <omp.h>
+
+#define N 200
+#define C 20
+int main()
+{
+  int i, l0 = 0, l1 = 0;
+  #pragma omp parallel num_threads(8)
+  {
+    #pragma omp for schedule(nonmonotonic:dynamic,C) nowait
+    for (i = 0; i < N; ++i) {
+      #pragma omp atomic
+      l0++;
+    }
+    #pragma omp for schedule(nonmonotonic:dynamic,C) nowait
+    for (i = 0; i < N * N; ++i) {
+      #pragma omp atomic
+      l1++;
+    }
+  }
+  if (l0 != N || l1 != N * N) {
+    printf("failed l0 = %d, l1 = %d, should be %d %d\n", l0, l1, N, N * N);
+    return 1;
+  } else {
+    printf("passed\n");
+    return 0;
+  }
+}