diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1665,14 +1665,12 @@ kmp_int32 lb; kmp_int32 st; kmp_int32 tc; - kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put - after ub */ - kmp_lock_t *th_steal_lock; // lock used for chunk stealing - // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on ) + kmp_lock_t *steal_lock; // lock used for chunk stealing + // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on) // a) parm3 is properly aligned and - // b) all parm1-4 are in the same cache line. + // b) all parm1-4 are on the same cache line. // Because of parm1-4 are used together, performance seems to be better - // if they are in the same line (not measured though). + // if they are on the same cache line (not measured though). struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should @@ -1684,9 +1682,6 @@ kmp_uint32 ordered_lower; kmp_uint32 ordered_upper; #if KMP_OS_WINDOWS - // This var can be placed in the hole between 'tc' and 'parm1', instead of - // 'static_steal_counter'. It would be nice to measure execution times. - // Conditional if/endif can be removed at all. kmp_int32 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info32_t; @@ -1698,9 +1693,7 @@ kmp_int64 lb; /* lower-bound */ kmp_int64 st; /* stride */ kmp_int64 tc; /* trip count (number of iterations) */ - kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put - after ub */ - kmp_lock_t *th_steal_lock; // lock used for chunk stealing + kmp_lock_t *steal_lock; // lock used for chunk stealing /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) @@ -1719,9 +1712,6 @@ kmp_uint64 ordered_lower; kmp_uint64 ordered_upper; #if KMP_OS_WINDOWS - // This var can be placed in the hole between 'tc' and 'parm1', instead of - // 'static_steal_counter'. It would be nice to measure execution times. - // Conditional if/endif can be removed at all. kmp_int64 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info64_t; @@ -1775,9 +1765,8 @@ } u; enum sched_type schedule; /* scheduling algorithm */ kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) 
*/ + std::atomic steal_flag; // static_steal only, state of a buffer kmp_int32 ordered_bumped; - // To retain the structure size after making ordered_iteration scalar - kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3]; // Stack of buffers for nest of serial regions struct dispatch_private_info *next; kmp_int32 type_size; /* the size of types in private_info */ @@ -1792,7 +1781,7 @@ /* chunk index under dynamic, number of idle threads under static-steal; iteration index otherwise */ volatile kmp_uint32 iteration; - volatile kmp_uint32 num_done; + volatile kmp_int32 num_done; volatile kmp_uint32 ordered_iteration; // Dummy to retain the structure size after making ordered_iteration scalar kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 1]; @@ -1802,7 +1791,7 @@ /* chunk index under dynamic, number of idle threads under static-steal; iteration index otherwise */ volatile kmp_uint64 iteration; - volatile kmp_uint64 num_done; + volatile kmp_int64 num_done; volatile kmp_uint64 ordered_iteration; // Dummy to retain the structure size after making ordered_iteration scalar kmp_int64 ordered_dummy[KMP_MAX_ORDERED - 3]; @@ -1838,7 +1827,7 @@ dispatch_private_info_t *th_dispatch_pr_current; dispatch_private_info_t *th_disp_buffer; - kmp_int32 th_disp_index; + kmp_uint32 th_disp_index; kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags kmp_int64 *th_doacross_info; // info on loop bounds diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h --- a/openmp/runtime/src/kmp_dispatch.h +++ b/openmp/runtime/src/kmp_dispatch.h @@ -74,8 +74,7 @@ T lb; ST st; // signed UT tc; // unsigned - T static_steal_counter; // for static_steal only; maybe better to put after ub - kmp_lock_t *th_steal_lock; // lock used for chunk stealing + kmp_lock_t *steal_lock; // lock used for chunk stealing /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) @@ -134,9 +133,8 @@ } u; enum sched_type schedule; /* scheduling algorithm */ kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) 
*/ + std::atomic steal_flag; // static_steal only, state of a buffer kmp_uint32 ordered_bumped; - // to retain the structure size after making order - kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3]; dispatch_private_info *next; /* stack of buffers for nest of serial regions */ kmp_uint32 type_size; #if KMP_USE_HIER_SCHED @@ -153,10 +151,11 @@ // dispatch_shared_info{32,64}_t types template struct dispatch_shared_infoXX_template { typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; /* chunk index under dynamic, number of idle threads under static-steal; iteration index otherwise */ volatile UT iteration; - volatile UT num_done; + volatile ST num_done; volatile UT ordered_iteration; // to retain the structure size making ordered_iteration scalar UT ordered_dummy[KMP_MAX_ORDERED - 3]; diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp --- a/openmp/runtime/src/kmp_dispatch.cpp +++ b/openmp/runtime/src/kmp_dispatch.cpp @@ -90,6 +90,22 @@ return monotonicity; } +#if KMP_STATIC_STEAL_ENABLED +enum { // values for steal_flag (possible states of private per-loop buffer) + UNUSED = 0, + CLAIMED = 1, // owner thread started initialization + READY = 2, // available for stealing + THIEF = 3 // finished by owner, or claimed by thief + // possible state changes: + // 0 -> 1 owner only, sync + // 0 -> 3 thief only, sync + // 1 -> 2 owner only, async + // 2 -> 3 owner only, async + // 3 -> 2 owner only, async + // 3 -> 0 last thread finishing the loop, async +}; +#endif + // Initialize a dispatch_private_info_template buffer for a particular // type of schedule,chunk. The loop description is found in lb (lower bound), // ub (upper bound), and st (stride). nproc is the number of threads relevant @@ -187,6 +203,8 @@ schedule = team->t.t_sched.r_sched_type; monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + if (pr->flags.ordered) // correct monotonicity for ordered loop if needed + monotonicity = SCHEDULE_MONOTONIC; // Detail the schedule if needed (global controls are differentiated // appropriately) if (schedule == kmp_sch_guided_chunked) { @@ -346,7 +364,7 @@ } switch (schedule) { -#if (KMP_STATIC_STEAL_ENABLED) +#if KMP_STATIC_STEAL_ENABLED case kmp_sch_static_steal: { T ntc, init; @@ -359,32 +377,37 @@ KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); T id = tid; T small_chunk, extras; - + kmp_uint32 old = UNUSED; + int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED); + if (traits_t::type_size > 4) { + // AC: TODO: check if 16-byte CAS available and use it to + // improve performance (probably wait for explicit request + // before spending time on this). + // For now use dynamically allocated per-private-buffer lock, + // free memory in __kmp_dispatch_next when status==0. + pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); + __kmp_init_lock(pr->u.p.steal_lock); + } small_chunk = ntc / nproc; extras = ntc % nproc; init = id * small_chunk + (id < extras ? id : extras); pr->u.p.count = init; - pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); - - pr->u.p.parm2 = lb; + if (claimed) { // are we succeeded in claiming own buffer? + pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); + // Other threads will inspect steal_flag when searching for a victim. + // READY means other threads may steal from this thread from now on. 
+ KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + } else { + // other thread has stolen whole our range + KMP_DEBUG_ASSERT(pr->steal_flag == THIEF); + pr->u.p.ub = init; // mark there is no iterations to work on + } + pr->u.p.parm2 = ntc; // save number of chunks // parm3 is the number of times to attempt stealing which is - // proportional to the number of chunks per thread up until - // the maximum value of nproc. - pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc); + // nproc (just a heuristics, could be optimized later on). + pr->u.p.parm3 = nproc; pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid - pr->u.p.st = st; - if (traits_t::type_size > 4) { - // AC: TODO: check if 16-byte CAS available and use it to - // improve performance (probably wait for explicit request - // before spending time on this). - // For now use dynamically allocated per-thread lock, - // free memory in __kmp_dispatch_next when status==0. - KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL); - pr->u.p.th_steal_lock = - (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); - __kmp_init_lock(pr->u.p.th_steal_lock); - } break; } else { /* too few chunks: switching to kmp_sch_dynamic_chunked */ @@ -881,6 +904,18 @@ &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, my_buffer_index)); + if (sh->buffer_index != my_buffer_index) { // too many loops in progress? + KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d" + " sh->buffer_index:%d\n", + gtid, my_buffer_index, sh->buffer_index)); + __kmp_wait(&sh->buffer_index, my_buffer_index, + __kmp_eq USE_ITT_BUILD_ARG(NULL)); + // Note: KMP_WAIT() cannot be used there: buffer index and + // my_buffer_index are *always* 32-bit integers. + KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " + "sh->buffer_index:%d\n", + gtid, my_buffer_index, sh->buffer_index)); + } } __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, @@ -897,24 +932,6 @@ th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo; th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo; } - } - - if (active) { - /* The name of this buffer should be my_buffer_index when it's free to use - * it */ - - KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " - "sh->buffer_index:%d\n", - gtid, my_buffer_index, sh->buffer_index)); - __kmp_wait(&sh->buffer_index, my_buffer_index, - __kmp_eq USE_ITT_BUILD_ARG(NULL)); - // Note: KMP_WAIT() cannot be used there: buffer index and - // my_buffer_index are *always* 32-bit integers. - KMP_MB(); /* is this necessary? */ - KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " - "sh->buffer_index:%d\n", - gtid, my_buffer_index, sh->buffer_index)); - th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; th->th.th_dispatch->th_dispatch_sh_current = CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); @@ -978,21 +995,6 @@ __kmp_str_free(&buff); } #endif -#if (KMP_STATIC_STEAL_ENABLED) - // It cannot be guaranteed that after execution of a loop with some other - // schedule kind all the parm3 variables will contain the same value. Even if - // all parm3 will be the same, it still exists a bad case like using 0 and 1 - // rather than program life-time increment. So the dedicated variable is - // required. The 'static_steal_counter' is used. - if (pr->schedule == kmp_sch_static_steal) { - // Other threads will inspect this variable when searching for a victim. 
- // This is a flag showing that other threads may steal from this thread - // since then. - volatile T *p = &pr->u.p.static_steal_counter; - *p = *p + 1; - } -#endif // ( KMP_STATIC_STEAL_ENABLED ) - #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.ompt_callback_work) { ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); @@ -1200,10 +1202,10 @@ } switch (pr->schedule) { -#if (KMP_STATIC_STEAL_ENABLED) +#if KMP_STATIC_STEAL_ENABLED case kmp_sch_static_steal: { T chunk = pr->u.p.parm1; - + UT nchunks = pr->u.p.parm2; KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", gtid)); @@ -1211,11 +1213,12 @@ trip = pr->u.p.tc - 1; if (traits_t::type_size > 4) { - // use lock for 8-byte and CAS for 4-byte induction - // variable. TODO (optional): check and use 16-byte CAS - kmp_lock_t *lck = pr->u.p.th_steal_lock; + // use lock for 8-byte induction variable. + // TODO (optional): check presence and use 16-byte CAS + kmp_lock_t *lck = pr->u.p.steal_lock; KMP_DEBUG_ASSERT(lck != NULL); if (pr->u.p.count < (UT)pr->u.p.ub) { + KMP_DEBUG_ASSERT(pr->steal_flag == READY); __kmp_acquire_lock(lck, gtid); // try to get own chunk of iterations init = (pr->u.p.count)++; @@ -1225,76 +1228,125 @@ status = 0; // no own chunks } if (!status) { // try to steal - kmp_info_t **other_threads = team->t.t_threads; + kmp_lock_t *lckv; // victim buffer's lock T while_limit = pr->u.p.parm3; T while_index = 0; - T id = pr->u.p.static_steal_counter; // loop id int idx = (th->th.th_dispatch->th_disp_index - 1) % __kmp_dispatch_num_buffers; // current loop index // note: victim thread can potentially execute another loop - // TODO: algorithm of searching for a victim - // should be cleaned up and measured + KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive while ((!status) && (while_limit != ++while_index)) { - dispatch_private_info_template *victim; + dispatch_private_info_template *v; T remaining; - T victimIdx = pr->u.p.parm4; - T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; - victim = reinterpret_cast *>( - &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); - KMP_DEBUG_ASSERT(victim); - while ((victim == pr || id != victim->u.p.static_steal_counter) && - oldVictimIdx != victimIdx) { - victimIdx = (victimIdx + 1) % nproc; - victim = reinterpret_cast *>( - &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); - KMP_DEBUG_ASSERT(victim); + T victimId = pr->u.p.parm4; + T oldVictimId = victimId ? 
victimId - 1 : nproc - 1; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); + while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && + oldVictimId != victimId) { + victimId = (victimId + 1) % nproc; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); } - if (victim == pr || id != victim->u.p.static_steal_counter) { + if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { + // if (nchunks == sh->u.s.iteration) + // break; // nothing to steal + // else continue; // try once more (nproc attempts in total) - // no victim is ready yet to participate in stealing - // because no victim passed kmp_init_dispatch yet } - if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { - pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid - continue; // not enough chunks to steal, goto next victim + if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { + kmp_uint32 old = UNUSED; + // try to steal whole range from inactive victim + status = v->steal_flag.compare_exchange_strong(old, THIEF); + if (status) { + // initialize self buffer with victim's whole range of chunks + T id = victimId; + T small_chunk, extras; + small_chunk = nchunks / nproc; // chunks per thread + extras = nchunks % nproc; + init = id * small_chunk + (id < extras ? id : extras); + __kmp_acquire_lock(lck, gtid); + pr->u.p.count = init + 1; // exclude one we execute immediately + pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); + __kmp_release_lock(lck, gtid); + pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid +// no need to reinitialize other thread invariants: lb, st, etc. +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); + __kmp_str_free(&buff); + } +#endif + // activate non-empty buffer and let others steal from us + if (pr->u.p.count < (UT)pr->u.p.ub) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + break; + } } - - lck = victim->u.p.th_steal_lock; - KMP_ASSERT(lck != NULL); - __kmp_acquire_lock(lck, gtid); - limit = victim->u.p.ub; // keep initial ub - if (victim->u.p.count >= limit || - (remaining = limit - victim->u.p.count) < 2) { - __kmp_release_lock(lck, gtid); - pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim - continue; // not enough chunks to steal + if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY || + v->u.p.count >= (UT)v->u.p.ub) { + pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid + continue; // no chunks to steal, try next victim } - // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or - // by 1 - if (remaining > 3) { + lckv = v->u.p.steal_lock; + KMP_ASSERT(lckv != NULL); + __kmp_acquire_lock(lckv, gtid); + limit = v->u.p.ub; // keep initial ub + if (v->u.p.count >= limit) { + __kmp_release_lock(lckv, gtid); + pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid + continue; // no chunks to steal, try next victim + } + + // stealing succeded, reduce victim's ub by 1/4 of undone chunks + // TODO: is this heuristics good enough?? 
+ remaining = limit - v->u.p.count; + if (remaining > 7) { // steal 1/4 of remaining KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); - init = (victim->u.p.ub -= (remaining >> 2)); + init = (v->u.p.ub -= (remaining >> 2)); } else { - // steal 1 chunk of 2 or 3 remaining + // steal 1 chunk of 1..7 remaining KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); - init = (victim->u.p.ub -= 1); + init = (v->u.p.ub -= 1); } - __kmp_release_lock(lck, gtid); - + __kmp_release_lock(lckv, gtid); +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, victimId, init, limit)); + __kmp_str_free(&buff); + } +#endif KMP_DEBUG_ASSERT(init + 1 <= limit); - pr->u.p.parm4 = victimIdx; // remember victim to steal from + pr->u.p.parm4 = victimId; // remember victim to steal from status = 1; - while_index = 0; - // now update own count and ub with stolen range but init chunk - __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); + // now update own count and ub with stolen range excluding init chunk + __kmp_acquire_lock(lck, gtid); pr->u.p.count = init + 1; pr->u.p.ub = limit; - __kmp_release_lock(pr->u.p.th_steal_lock, gtid); + __kmp_release_lock(lck, gtid); + // activate non-empty buffer and let others steal from us + if (init + 1 < limit) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); } // while (search for victim) } // if (try to find victim and steal) } else { // 4-byte induction variable, use 8-byte CAS for pair (count, ub) + // as all operations on pair (count, ub) must be done atomically typedef union { struct { UT count; @@ -1302,86 +1354,132 @@ } p; kmp_int64 b; } union_i4; - // All operations on 'count' or 'ub' must be combined atomically - // together. - { - union_i4 vold, vnew; + union_i4 vold, vnew; + if (pr->u.p.count < (UT)pr->u.p.ub) { + KMP_DEBUG_ASSERT(pr->steal_flag == READY); vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); - vnew = vold; - vnew.p.count++; - while (!KMP_COMPARE_AND_STORE_ACQ64( + vnew.b = vold.b; + vnew.p.count++; // get chunk from head of self range + while (!KMP_COMPARE_AND_STORE_REL64( (volatile kmp_int64 *)&pr->u.p.count, *VOLATILE_CAST(kmp_int64 *) & vold.b, *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { KMP_CPU_PAUSE(); vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); - vnew = vold; + vnew.b = vold.b; vnew.p.count++; } - vnew = vold; - init = vnew.p.count; - status = (init < (UT)vnew.p.ub); + init = vold.p.count; + status = (init < (UT)vold.p.ub); + } else { + status = 0; // no own chunks } - - if (!status) { - kmp_info_t **other_threads = team->t.t_threads; + if (!status) { // try to steal T while_limit = pr->u.p.parm3; T while_index = 0; - T id = pr->u.p.static_steal_counter; // loop id int idx = (th->th.th_dispatch->th_disp_index - 1) % __kmp_dispatch_num_buffers; // current loop index // note: victim thread can potentially execute another loop - // TODO: algorithm of searching for a victim - // should be cleaned up and measured + KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive while ((!status) && (while_limit != ++while_index)) { - dispatch_private_info_template *victim; - union_i4 vold, vnew; + dispatch_private_info_template *v; T remaining; - T victimIdx = pr->u.p.parm4; - T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; - victim = reinterpret_cast *>( - &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); - KMP_DEBUG_ASSERT(victim); - while ((victim == pr || id != victim->u.p.static_steal_counter) && - oldVictimIdx != victimIdx) { - victimIdx = (victimIdx + 1) % nproc; - victim = reinterpret_cast *>( - &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); - KMP_DEBUG_ASSERT(victim); + T victimId = pr->u.p.parm4; + T oldVictimId = victimId ? victimId - 1 : nproc - 1; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); + while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && + oldVictimId != victimId) { + victimId = (victimId + 1) % nproc; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); } - if (victim == pr || id != victim->u.p.static_steal_counter) { + if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { + // if (nchunks == sh->u.s.iteration) + // break; // nothing to steal + // else continue; // try once more (nproc attempts in total) - // no victim is ready yet to participate in stealing - // because no victim passed kmp_init_dispatch yet } - pr->u.p.parm4 = victimIdx; // new victim found - while (1) { // CAS loop if victim has enough chunks to steal - vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); - vnew = vold; - - KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); - if (vnew.p.count >= (UT)vnew.p.ub || - (remaining = vnew.p.ub - vnew.p.count) < 2) { - pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id - break; // not enough chunks to steal, goto next victim + if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { + kmp_uint32 old = UNUSED; + // try to steal whole range from inactive victim + status = v->steal_flag.compare_exchange_strong(old, THIEF); + if (status) { + // initialize self buffer with victim's whole range of chunks + T id = victimId; + T small_chunk, extras; + small_chunk = nchunks / nproc; // chunks per thread + extras = nchunks % nproc; + init = id * small_chunk + (id < extras ? id : extras); + vnew.p.count = init + 1; + vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0); +// write pair (count, ub) at once atomically +#if KMP_ARCH_X86 + KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b); +#else + *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b; +#endif + pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid +// no need to initialize other thread invariants: lb, st, etc. 
+#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); + __kmp_str_free(&buff); + } +#endif + // activate non-empty buffer and let others steal from us + if (pr->u.p.count < (UT)pr->u.p.ub) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + break; } - if (remaining > 3) { - // try to steal 1/4 of remaining - vnew.p.ub -= remaining >> 2; + } + while (1) { // CAS loop with check if victim still has enough chunks + // many threads may be stealing concurrently from same victim + vold.b = *(volatile kmp_int64 *)(&v->u.p.count); + if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY || + vold.p.count >= (UT)vold.p.ub) { + pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id + break; // no chunks to steal, try next victim + } + vnew.b = vold.b; + remaining = vold.p.ub - vold.p.count; + // try to steal 1/4 of remaining + // TODO: is this heuristics good enough?? + if (remaining > 7) { + vnew.p.ub -= remaining >> 2; // steal from tail of victim's range } else { - vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining + vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining } KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); - // TODO: Should this be acquire or release? - if (KMP_COMPARE_AND_STORE_ACQ64( - (volatile kmp_int64 *)&victim->u.p.count, + if (KMP_COMPARE_AND_STORE_REL64( + (volatile kmp_int64 *)&v->u.p.count, *VOLATILE_CAST(kmp_int64 *) & vold.b, *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { - // stealing succeeded + // stealing succedded +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub)); + __kmp_str_free(&buff); + } +#endif KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub); status = 1; - while_index = 0; + pr->u.p.parm4 = victimId; // keep victim id // now update own count and ub init = vnew.p.ub; vold.p.count = init + 1; @@ -1390,6 +1488,9 @@ #else *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; #endif + // activate non-empty buffer and let others steal from us + if (vold.p.count < (UT)vold.p.ub) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); break; } // if (check CAS result) KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt @@ -1403,13 +1504,16 @@ if (p_st != NULL) *p_st = 0; } else { - start = pr->u.p.parm2; + start = pr->u.p.lb; init *= chunk; limit = chunk + init - 1; incr = pr->u.p.st; KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); KMP_DEBUG_ASSERT(init <= trip); + // keep track of done chunks for possible early exit from stealing + // TODO: count executed chunks locally with rare update of shared location + // test_then_inc((volatile ST *)&sh->u.s.iteration); if ((last = (limit >= trip)) != 0) limit = trip; if (p_st != NULL) @@ -1422,15 +1526,10 @@ *p_lb = start + init * incr; *p_ub = start + limit * incr; } - - if (pr->flags.ordered) { - pr->u.p.ordered_lower = init; - pr->u.p.ordered_upper = limit; - } // if } // if break; } // case -#endif // ( KMP_STATIC_STEAL_ENABLED ) +#endif // KMP_STATIC_STEAL_ENABLED case kmp_sch_static_balanced: { KD_TRACE( 10, @@ -2075,16 +2174,15 @@ th->th.th_info.ds.ds_tid); // status == 0: no more iterations to execute if (status == 0) { - UT 
num_done; - - num_done = test_then_inc((volatile ST *)&sh->u.s.num_done); + ST num_done; + num_done = test_then_inc(&sh->u.s.num_done); #ifdef KMP_DEBUG { char *buff; // create format specifiers before the debug output buff = __kmp_str_format( "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", - traits_t::spec); + traits_t::spec); KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); __kmp_str_free(&buff); } @@ -2093,28 +2191,31 @@ #if KMP_USE_HIER_SCHED pr->flags.use_hier = FALSE; #endif - if ((ST)num_done == th->th.th_team_nproc - 1) { -#if (KMP_STATIC_STEAL_ENABLED) - if (pr->schedule == kmp_sch_static_steal && - traits_t::type_size > 4) { + if (num_done == th->th.th_team_nproc - 1) { +#if KMP_STATIC_STEAL_ENABLED + if (pr->schedule == kmp_sch_static_steal) { int i; int idx = (th->th.th_dispatch->th_disp_index - 1) % __kmp_dispatch_num_buffers; // current loop index - kmp_info_t **other_threads = team->t.t_threads; // loop complete, safe to destroy locks used for stealing for (i = 0; i < th->th.th_team_nproc; ++i) { dispatch_private_info_template *buf = reinterpret_cast *>( - &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); - kmp_lock_t *lck = buf->u.p.th_steal_lock; - KMP_ASSERT(lck != NULL); - __kmp_destroy_lock(lck); - __kmp_free(lck); - buf->u.p.th_steal_lock = NULL; + &team->t.t_dispatch[i].th_disp_buffer[idx]); + KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive + KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED); + if (traits_t::type_size > 4) { + // destroy locks used for stealing + kmp_lock_t *lck = buf->u.p.steal_lock; + KMP_ASSERT(lck != NULL); + __kmp_destroy_lock(lck); + __kmp_free(lck); + buf->u.p.steal_lock = NULL; + } } } #endif - /* NOTE: release this buffer to be reused */ + /* NOTE: release shared buffer to be reused */ KMP_MB(); /* Flush all pending memory write invalidates. */ @@ -2126,8 +2227,6 @@ sh->u.s.ordered_iteration = 0; } - KMP_MB(); /* Flush all pending memory write invalidates. 
*/ - sh->buffer_index += __kmp_dispatch_num_buffers; KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", gtid, sh->buffer_index)); diff --git a/openmp/runtime/src/kmp_dispatch_hier.h b/openmp/runtime/src/kmp_dispatch_hier.h --- a/openmp/runtime/src/kmp_dispatch_hier.h +++ b/openmp/runtime/src/kmp_dispatch_hier.h @@ -924,7 +924,7 @@ T lb, T ub, typename traits_t::signed_t st) { int tid, gtid, num_hw_threads, num_threads_per_layer1, active; - int my_buffer_index; + unsigned int my_buffer_index; kmp_info_t *th; kmp_team_t *team; dispatch_private_info_template *pr; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -3985,8 +3985,11 @@ else if (!__kmp_strcasecmp_with_sentinel("static", ptr, *delim)) sched = kmp_sch_static; #if KMP_STATIC_STEAL_ENABLED - else if (!__kmp_strcasecmp_with_sentinel("static_steal", ptr, *delim)) - sched = kmp_sch_static_steal; + else if (!__kmp_strcasecmp_with_sentinel("static_steal", ptr, *delim)) { + // replace static_steal with dynamic to better cope with ordered loops + sched = kmp_sch_dynamic_chunked; + sched_modifier = sched_type::kmp_sch_modifier_nonmonotonic; + } #endif else { // If there is no proper schedule kind, then this schedule is invalid diff --git a/openmp/runtime/test/env/kmp_set_dispatch_buf.c b/openmp/runtime/test/env/kmp_set_dispatch_buf.c --- a/openmp/runtime/test/env/kmp_set_dispatch_buf.c +++ b/openmp/runtime/test/env/kmp_set_dispatch_buf.c @@ -9,7 +9,7 @@ // RUN: env KMP_DISP_NUM_BUFFERS=3 %libomp-run // RUN: env KMP_DISP_NUM_BUFFERS=4 %libomp-run // RUN: env KMP_DISP_NUM_BUFFERS=7 %libomp-run -// UNSUPPORTED: clang-11, clang-12, clang-13 +// UNSUPPORTED: clang-11, clang-12 #include #include #include @@ -78,5 +78,9 @@ num_failed++; } } + if (num_failed == 0) + printf("passed\n"); + else + printf("failed %d\n", num_failed); return num_failed; } diff --git a/openmp/runtime/test/worksharing/for/kmp_set_dispatch_buf.c b/openmp/runtime/test/worksharing/for/kmp_set_dispatch_buf.c --- a/openmp/runtime/test/worksharing/for/kmp_set_dispatch_buf.c +++ b/openmp/runtime/test/worksharing/for/kmp_set_dispatch_buf.c @@ -3,7 +3,7 @@ // RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5 // RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run 7 // RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5 -// UNSUPPORTED: clang-11, clang-12, clang-13 +// UNSUPPORTED: clang-11, clang-12 #include #include #include @@ -88,5 +88,9 @@ num_failed++; } } + if (num_failed == 0) + printf("passed\n"); + else + printf("failed %d\n", num_failed); return num_failed; } diff --git a/openmp/runtime/test/worksharing/for/omp_for_schedule_runtime.c b/openmp/runtime/test/worksharing/for/omp_for_schedule_runtime.c --- a/openmp/runtime/test/worksharing/for/omp_for_schedule_runtime.c +++ b/openmp/runtime/test/worksharing/for/omp_for_schedule_runtime.c @@ -8,8 +8,8 @@ // RUN: env OMP_SCHEDULE=auto %libomp-run 4 1 // RUN: env OMP_SCHEDULE=trapezoidal %libomp-run 101 1 // RUN: env OMP_SCHEDULE=trapezoidal,13 %libomp-run 101 13 -// RUN: env OMP_SCHEDULE=static_steal %libomp-run 102 1 -// RUN: env OMP_SCHEDULE=static_steal,14 %libomp-run 102 14 +// RUN: env OMP_SCHEDULE=static_steal %libomp-run 2 1 +// RUN: env OMP_SCHEDULE=static_steal,14 %libomp-run 2 14 #include #include diff --git a/openmp/runtime/test/worksharing/for/omp_par_in_loop.c b/openmp/runtime/test/worksharing/for/omp_par_in_loop.c new file mode 100644 --- /dev/null +++ 
b/openmp/runtime/test/worksharing/for/omp_par_in_loop.c
@@ -0,0 +1,28 @@
+// RUN: %libomp-compile-and-run
+//
+#include <stdio.h>
+#include
+#include
+#include <omp.h>
+
+#define TYPE long
+#define MAX_ITER (TYPE)((TYPE)1000000)
+#define EVERY (TYPE)((TYPE)100000)
+
+int main(int argc, char* argv[]) {
+  TYPE x = MAX_ITER;
+  omp_set_max_active_levels(2);
+  omp_set_num_threads(2);
+  #pragma omp parallel for schedule(nonmonotonic:dynamic,1)
+  for (TYPE i = 0; i < x; i++) {
+    int tid = omp_get_thread_num();
+    omp_set_num_threads(1);
+    #pragma omp parallel proc_bind(spread)
+    {
+      if (i % EVERY == (TYPE)0)
+        printf("Outer thread %d at iter %ld\n", tid, i);
+    }
+  }
+  printf("passed\n");
+  return 0;
+}
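
The state machine behind the new steal_flag is easiest to see in isolation. Below is a minimal stand-alone model of the protocol kmp_sch_static_steal uses in this patch; BufferModel and the two helpers are illustrative names only, and the runtime operates on a field of dispatch_private_info through its KMP_ATOMIC_* wrappers rather than through helpers like these.

// Minimal model of the steal_flag life cycle (UNUSED -> CLAIMED -> READY,
// with THIEF marking a drained or taken-over buffer).
#include <atomic>
#include <cstdint>

enum : uint32_t { UNUSED = 0, CLAIMED = 1, READY = 2, THIEF = 3 };

struct BufferModel {
  std::atomic<uint32_t> steal_flag{UNUSED};
  // the (count, ub) chunk range of the owner would live here as well
};

// Owner: claim the buffer (0 -> 1), fill in its chunk range, then publish it
// (1 -> 2) so other threads may start stealing from it.
bool owner_claim(BufferModel &buf) {
  uint32_t old = UNUSED;
  if (!buf.steal_flag.compare_exchange_strong(old, CLAIMED))
    return false; // a thief already took the whole range (flag is THIEF)
  // ... initialize count/ub with the owner's range of chunks ...
  buf.steal_flag.store(READY, std::memory_order_release);
  return true;
}

// Thief: take over a buffer whose owner has not reached the loop yet (0 -> 3).
bool thief_take_unused(BufferModel &victim) {
  uint32_t old = UNUSED;
  return victim.steal_flag.compare_exchange_strong(old, THIEF);
}

// The owner marks its buffer THIEF once it runs dry and starts stealing
// (2 -> 3), reactivates it with READY after a successful steal (3 -> 2), and
// the last thread finishing the loop resets every buffer to UNUSED (3 -> 0).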
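
The init/ub arithmetic used when a thread claims its own buffer (and again when a thief takes over an UNUSED buffer) partitions nchunks chunks into nproc contiguous ranges, the first nchunks % nproc threads receiving one extra chunk. A small self-contained check of that arithmetic, with made-up values for nchunks and nproc:

// Verifies that the per-thread ranges [init, ub) tile [0, nchunks) exactly.
#include <cassert>
#include <cstdio>

int main() {
  const long nchunks = 23, nproc = 5;
  const long small_chunk = nchunks / nproc; // chunks every thread gets
  const long extras = nchunks % nproc;      // first 'extras' threads get +1
  long expected_start = 0;
  for (long id = 0; id < nproc; ++id) {
    long init = id * small_chunk + (id < extras ? id : extras);
    long ub = init + small_chunk + (id < extras ? 1 : 0);
    assert(init == expected_start); // contiguous: no gaps, no overlap
    expected_start = ub;
    printf("thread %ld owns chunks [%ld, %ld)\n", id, init, ub);
  }
  assert(expected_start == nchunks); // every chunk is owned exactly once
  return 0;
}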
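
Victim selection in __kmp_dispatch_next_algorithm starts at the remembered neighbour (parm4), walks the team round-robin skipping the current thread and buffers already drained (marked THIEF), and stops after one full circle; the enclosing loop retries up to parm3 == nproc times. A compact sketch of one scan; select_victim and flag_of are illustrative, while the runtime indexes team->t.t_dispatch[victimId].th_disp_buffer[idx] directly:

// Returns the chosen victim id, or -1 when every candidate is the current
// thread or a drained buffer.
#include <cstdint>

enum : uint32_t { UNUSED = 0, CLAIMED = 1, READY = 2, THIEF = 3 };

template <class FlagOf> // FlagOf: callable mapping a thread id to its steal_flag
int select_victim(int self, int start, int nproc, FlagOf flag_of) {
  int id = start;
  int last = (start == 0) ? nproc - 1 : start - 1; // stop after a full circle
  while ((id == self || flag_of(id) == THIEF) && id != last)
    id = (id + 1) % nproc;
  if (id == self || flag_of(id) == THIEF)
    return -1; // nothing worth stealing from right now; caller may retry
  return id;
}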
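
For 4-byte induction variables the pair (count, ub) stays packed in a single 64-bit word, so taking a chunk from the head of the thread's own range and stealing from the tail of a victim's range are each one compare-and-swap. A simplified sketch of both operations, including the steal-a-quarter-of-the-remainder heuristic; std::atomic stands in for the runtime's volatile fields and KMP_COMPARE_AND_STORE_REL64, and PackedRange mirrors the union_i4 trick:

// count/ub packed so both can be read, compared and written atomically.
#include <atomic>
#include <cstdint>

union PackedRange {
  struct { uint32_t count, ub; } p;
  int64_t b;
};

// Owner: pop the chunk at 'count' from the head of its own range, or fail.
bool pop_own_chunk(std::atomic<int64_t> &range, uint32_t &chunk_idx) {
  PackedRange vold, vnew;
  vold.b = range.load(std::memory_order_relaxed);
  do {
    if (vold.p.count >= vold.p.ub)
      return false;                 // own range is empty
    vnew.b = vold.b;
    vnew.p.count++;                 // claim the chunk at the head
  } while (!range.compare_exchange_weak(vold.b, vnew.b,
                                        std::memory_order_release));
  chunk_idx = vold.p.count;         // the chunk we just claimed
  return true;
}

// Thief: shrink the victim's ub, taking about a quarter of what remains
// (at least one chunk). The stolen chunks are [first, last).
bool steal_from_tail(std::atomic<int64_t> &victim,
                     uint32_t &first, uint32_t &last) {
  PackedRange vold, vnew;
  vold.b = victim.load(std::memory_order_relaxed);
  do {
    if (vold.p.count >= vold.p.ub)
      return false;                 // victim has nothing left to steal
    uint32_t remaining = vold.p.ub - vold.p.count;
    vnew.b = vold.b;
    vnew.p.ub -= (remaining > 7) ? (remaining >> 2) : 1;
  } while (!victim.compare_exchange_weak(vold.b, vnew.b,
                                         std::memory_order_release));
  first = vnew.p.ub;
  last = vold.p.ub;
  return true;
}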
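
For 8-byte induction variables a single CAS cannot cover the (count, ub) pair, so each buffer carries a dynamically allocated steal_lock and both the owner and thieves update the range under it. A sketch with std::mutex standing in for kmp_lock_t; Range64 and the helper names are illustrative:

#include <cstdint>
#include <mutex>

struct Range64 {
  std::mutex lock;    // stands in for *steal_lock
  uint64_t count = 0; // next chunk index in the owned range
  uint64_t ub = 0;    // one past the last owned chunk
};

// Owner: take the next chunk from the head of its own range.
bool pop_own_chunk(Range64 &r, uint64_t &chunk_idx) {
  std::lock_guard<std::mutex> g(r.lock);
  if (r.count >= r.ub)
    return false;
  chunk_idx = r.count++;
  return true;
}

// Thief: move about a quarter of the victim's remaining chunks (at least one)
// from the victim's tail into the thief's own range. The runtime additionally
// pops the first stolen chunk for immediate execution.
bool steal_chunks(Range64 &victim, Range64 &self) {
  uint64_t first, last;
  {
    std::lock_guard<std::mutex> g(victim.lock);
    if (victim.count >= victim.ub)
      return false;                          // nothing left to steal
    uint64_t remaining = victim.ub - victim.count;
    uint64_t take = (remaining > 7) ? (remaining >> 2) : 1;
    victim.ub -= take;                       // cut the victim's tail
    first = victim.ub;
    last = first + take;
  }                                          // victim's lock released here
  std::lock_guard<std::mutex> g(self.lock);
  self.count = first;                        // stolen range becomes our own
  self.ub = last;
  return true;
}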
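
Separately from stealing, the patch moves the wait for a free dispatch buffer into __kmp_dispatch_init, before the buffer is initialized, and makes num_done signed; the recycling protocol itself is unchanged. A compact model of that protocol, with std::atomic standing in for the runtime's volatile fields, __kmp_wait and test_then_inc, and with illustrative names throughout:

#include <atomic>

struct SharedBuffer {
  std::atomic<int> buffer_index{0}; // loop index this slot currently serves
                                    // (slot s starts at s in the runtime)
  std::atomic<int> num_done{0};     // threads that have finished the loop
};

// Loop entry: my_buffer_index is the thread's monotonically growing loop
// counter; the slot used is my_buffer_index % __kmp_dispatch_num_buffers.
void acquire_buffer(SharedBuffer &sh, int my_buffer_index) {
  while (sh.buffer_index.load(std::memory_order_acquire) != my_buffer_index) {
    // too many loops in flight: wait until the previous user releases the slot
  }
}

// Loop exit: the last of nproc threads to finish resets the slot and bumps
// buffer_index by the number of buffers, releasing it for a later loop.
void release_buffer(SharedBuffer &sh, int nproc, int num_buffers) {
  if (sh.num_done.fetch_add(1) == nproc - 1) {
    sh.num_done.store(0);
    // ...reset steal flags / destroy steal locks here, as in the patch...
    sh.buffer_index.fetch_add(num_buffers, std::memory_order_release);
  }
}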