diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -47,7 +47,7 @@ src/critical.cu src/data_sharing.cu src/libcall.cu - src/loop.cu + src/worksharing.cu src/omptarget-nvptx.cu src/parallel.cu src/reduction.cu diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu deleted file mode 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ /dev/null @@ -1,807 +0,0 @@ -//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the KMPC interface -// for the loop construct plus other worksharing constructs that use the same -// interface as loops. -// -//===----------------------------------------------------------------------===// - -#include "omptarget-nvptx.h" - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -// template class that encapsulate all the helper functions -// -// T is loop iteration type (32 | 64) (unsigned | signed) -// ST is the signed version of T -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - -template class omptarget_nvptx_LoopSupport { -public: - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling with chunk - - // Generic implementation of OMP loop scheduling with static policy - /*! \brief Calculate initial bounds for static loop and stride - * @param[in] loc location in code of the call (not used here) - * @param[in] global_tid global thread id - * @param[in] schetype type of scheduling (see omptarget-nvptx.h) - * @param[in] plastiter pointer to last iteration - * @param[in,out] pointer to loop lower bound. it will contain value of - * lower bound of first chunk - * @param[in,out] pointer to loop upper bound. It will contain value of - * upper bound of first chunk - * @param[in,out] pointer to loop stride. It will contain value of stride - * between two successive chunks executed by the same thread - * @param[in] loop increment bump - * @param[in] chunk size - */ - - // helper function for static chunk - INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, - ST chunk, T entityId, T numberOfEntities) { - // each thread executes multiple chunks all of the same size, except - // the last one - - // distance between two successive chunks - stride = numberOfEntities * chunk; - lb = lb + entityId * chunk; - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - // Say ub' is the begining of the last chunk. Then who ever has a - // lower bound plus a multiple of the increment equal to ub' is - // the last one. 
- T beginingLastChunk = inputUb - (inputUb % chunk); - last = ((beginingLastChunk - lb) % stride) == 0; - } - - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling without chunk - - // helper function for static no chunk - INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, - ST &chunk, T entityId, - T numberOfEntities) { - // No chunk size specified. Each thread or warp gets at most one - // chunk; chunks are all almost of equal size - T loopSize = ub - lb + 1; - - chunk = loopSize / numberOfEntities; - T leftOver = loopSize - chunk * numberOfEntities; - - if (entityId < leftOver) { - chunk++; - lb = lb + entityId * chunk; - } else { - lb = lb + entityId * chunk + leftOver; - } - - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - last = lb <= inputUb && inputUb <= ub; - stride = loopSize; // make sure we only do 1 chunk per warp - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for Static Init - - INLINE static void for_static_init(int32_t gtid, int32_t schedtype, - int32_t *plastiter, T *plower, T *pupper, - ST *pstride, ST chunk, - bool IsSPMDExecutionMode) { - // When IsRuntimeUninitialized is true, we assume that the caller is - // in an L0 parallel region and that all worker threads participate. - - // Assume we are in teams region or that we use a single block - // per target region - ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode); - - // All warps that are in excess of the maximum requested, do - // not execute the loop - PRINT(LD_LOOP, - "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " - "%d, num tids %d\n", - (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, - (int)numberOfActiveOMPThreads); - ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, - "current thread is not needed here; error"); - - // copy - int lastiter = 0; - T lb = *plower; - T ub = *pupper; - ST stride = *pstride; - // init - switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { - case kmp_sched_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } // note: if chunk <=0, use nochunk - case kmp_sched_static_balanced_chunk: { - if (chunk > 0) { - // round up to make sure the chunk is enough to cover all iterations - T tripCount = ub - lb + 1; // +1 because ub is inclusive - T span = (tripCount + numberOfActiveOMPThreads - 1) / - numberOfActiveOMPThreads; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - if (ub > oldUb) - ub = oldUb; - break; - } - } // note: if chunk <=0, use nochunk - case kmp_sched_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - case kmp_sched_distr_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), - GetNumberOfOmpTeams()); - break; - } // note: if chunk <=0, use nochunk - } - case kmp_sched_distr_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), - GetNumberOfOmpTeams()); - break; - } - case kmp_sched_distr_static_chunk_sched_static_chunkone: { - ForStaticChunk(lastiter, lb, ub, stride, chunk, - numberOfActiveOMPThreads * GetOmpTeamId() + gtid, - GetNumberOfOmpTeams() * numberOfActiveOMPThreads); 
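As a quick illustration of the arithmetic ForStaticChunk performs, here is a minimal host-side sketch (the helper name and worked numbers are illustrative, not part of the runtime):

#include <cstdint>
#include <cstdio>

// Static, chunked split: every thread owns chunks starting at LB, LB + Stride,
// LB + 2 * Stride, ...
static void forStaticChunk(int &Last, int32_t &LB, int32_t &UB, int32_t &Stride,
                           int32_t Chunk, int32_t Id, int32_t NumEntities) {
  Stride = NumEntities * Chunk;      // distance between two chunks of one thread
  LB = LB + Id * Chunk;              // first chunk owned by this thread
  int32_t InputUB = UB;
  UB = LB + Chunk - 1;               // inclusive bound, Clang emits i <= UB
  int32_t BeginLastChunk = InputUB - (InputUB % Chunk);
  Last = ((BeginLastChunk - LB) % Stride) == 0;
}

int main() {
  // 10 iterations [0, 9], chunk 4, 2 threads: thread 0 owns [0,3] and the last
  // chunk [8,9] (clipped later), thread 1 owns [4,7].
  for (int32_t Id = 0; Id < 2; ++Id) {
    int Last;
    int32_t LB = 0, UB = 9, Stride;
    forStaticChunk(Last, LB, UB, Stride, /*Chunk=*/4, Id, /*NumEntities=*/2);
    std::printf("tid %d: lb %d ub %d stride %d last %d\n", Id, LB, UB, Stride, Last);
  }
  return 0;
}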
- break; - } - default: { - ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", (int)schedtype); - PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", - (int)schedtype); - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } - // copy back - *plastiter = lastiter; - *plower = lb; - *pupper = ub; - *pstride = stride; - PRINT(LD_LOOP, - "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " - "%d\n", - (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), - (long long)(*plower), (long long)(*pupper), (long long)(*pstride), - (int)lastiter); - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch Init - - INLINE static int OrderedSchedule(kmp_sched_t schedule) { - return schedule >= kmp_sched_ordered_first && - schedule <= kmp_sched_ordered_last; - } - - INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId, - kmp_sched_t schedule, T lb, T ub, ST st, - ST chunk) { - if (checkRuntimeUninitialized(loc)) { - // In SPMD mode no need to check parallelism level - dynamic scheduling - // may appear only in L2 parallel regions with lightweight runtime. - ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode."); - return; - } - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); - T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc)); - T tripCount = ub - lb + 1; // +1 because ub is inclusive - ASSERT0(LT_FUSSY, threadId < tnum, - "current thread is not needed here; error"); - - /* Currently just ignore the monotonic and non-monotonic modifiers - * (the compiler isn't producing them * yet anyway). - * When it is we'll want to look at them somewhere here and use that - * information to add to our schedule choice. We shouldn't need to pass - * them on, they merely affect which schedule we can legally choose for - * various dynamic cases. (In paritcular, whether or not a stealing scheme - * is legal). - */ - schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); - - // Process schedule. 
- if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { - if (OrderedSchedule(schedule)) - __kmpc_barrier(loc, threadId); - PRINT(LD_LOOP, - "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", - (long)tnum, (long long)tripCount, (int)schedule); - schedule = kmp_sched_static_chunk; - chunk = tripCount; // one thread gets the whole loop - } else if (schedule == kmp_sched_runtime) { - // process runtime - omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); - chunk = currTaskDescr->RuntimeChunkSize(); - switch (rtSched) { - case omp_sched_static: { - if (chunk > 0) - schedule = kmp_sched_static_chunk; - else - schedule = kmp_sched_static_nochunk; - break; - } - case omp_sched_auto: { - schedule = kmp_sched_static_chunk; - chunk = 1; - break; - } - case omp_sched_dynamic: - case omp_sched_guided: { - schedule = kmp_sched_dynamic; - break; - } - } - PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - } else if (schedule == kmp_sched_auto) { - schedule = kmp_sched_static_chunk; - chunk = 1; - PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - } else { - PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - ASSERT(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "unknown schedule %d & chunk %lld\n", (int)schedule, - (long long)chunk); - } - - // init schedules - if (schedule == kmp_sched_static_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_static_balanced_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - // round up to make sure the chunk is enough to cover all iterations - T span = (tripCount + tnum - 1) / tnum; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - if (ub > oldUb) - ub = oldUb; - // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long 
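The schedule(runtime) handling above maps the user-selected omp_sched_t onto the few schedules this runtime actually implements. A small host-side sketch of that mapping; the kmp_sched_t values below are placeholders, not the runtime's definitions:

#include <cstdio>

enum omp_sched_t { omp_sched_static = 1, omp_sched_dynamic = 2,
                   omp_sched_guided = 3, omp_sched_auto = 4 };
// Placeholder values; the real ones live in the device RTL headers.
enum kmp_sched_t { kmp_sched_static_chunk = 33, kmp_sched_static_nochunk = 34,
                   kmp_sched_dynamic = 35 };

static kmp_sched_t lowerRuntimeSched(omp_sched_t RTSched, long long &Chunk) {
  switch (RTSched) {
  case omp_sched_static:
    return Chunk > 0 ? kmp_sched_static_chunk : kmp_sched_static_nochunk;
  case omp_sched_auto:
    Chunk = 1;
    return kmp_sched_static_chunk;
  case omp_sched_dynamic:
  case omp_sched_guided:
    return kmp_sched_dynamic; // guided is handled like dynamic on this target
  }
  return kmp_sched_static_chunk;
}

int main() {
  long long Chunk = 0;
  std::printf("runtime(static, no chunk) -> %d\n",
              (int)lowerRuntimeSched(omp_sched_static, Chunk));
  std::printf("runtime(guided)           -> %d\n",
              (int)lowerRuntimeSched(omp_sched_guided, Chunk));
  return 0;
}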
long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_static_nochunk) { - ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - // save data - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - if (chunk < 1) - chunk = 1; - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - __kmpc_barrier(loc, threadId); - if (tid == 0) { - omptarget_nvptx_threadPrivateContext->Cnt() = 0; - __threadfence_block(); - } - __kmpc_barrier(loc, threadId); - PRINT(LD_LOOP, - "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 - ", chunk %" PRIu64 "\n", - (int)tnum, - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid)); - } - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch next - - INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) { - int lo, hi; - asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); - hi = __SHFL_SYNC(active, hi, leader); - lo = __SHFL_SYNC(active, lo, leader); - asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); - return val; - } - - INLINE static uint64_t NextIter() { - unsigned int active = __ACTIVEMASK(); - int leader = __ffs(active) - 1; - int change = __popc(active); - unsigned lane_mask_lt; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt)); - unsigned int rank = __popc(active & lane_mask_lt); - uint64_t warp_res; - if (rank == 0) { - warp_res = atomicAdd( - (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(), - change); - } - warp_res = Shuffle(active, warp_res, leader); - return warp_res + rank; - } - - INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, - T loopLowerBound, T loopUpperBound) { - T N = NextIter(); - lb = loopLowerBound + N * chunkSize; - ub = lb + chunkSize - 1; // Clang uses i <= ub - - // 3 result cases: - // a. lb and ub < loopUpperBound --> NOT_FINISHED - // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> - // NOT_FINISHED - // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED - // a. 
- if (lb <= loopUpperBound && ub < loopUpperBound) { - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", - (long long)lb, (long long)ub, (long long)loopUpperBound); - return NOT_FINISHED; - } - // b. - if (lb <= loopUpperBound) { - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", - (long long)lb, (long long)ub, (long long)loopUpperBound); - ub = loopUpperBound; - return LAST_CHUNK; - } - // c. if we are here, we are in case 'c' - lb = loopUpperBound + 2; - ub = loopUpperBound + 1; - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb, - (long long)ub, (long long)loopUpperBound); - return FINISHED; - } - - INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride) { - if (checkRuntimeUninitialized(loc)) { - // In SPMD mode no need to check parallelism level - dynamic scheduling - // may appear only in L2 parallel regions with lightweight runtime. - ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode."); - if (*plast) - return DISPATCH_FINISHED; - *plast = 1; - return DISPATCH_NOTFINISHED; - } - // ID of a thread in its own warp - - // automatically selects thread or warp ID based on selected implementation - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)), - "current thread is not needed here; error"); - // retrieve schedule - kmp_sched_t schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(tid); - - // xxx reduce to one - if (schedule == kmp_sched_static_chunk || - schedule == kmp_sched_static_nochunk) { - T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); - T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); - // finished? 
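DynamicNextChunk reduces every claimed chunk to one of three outcomes; a host-side sketch with a worked example (the enum values here are illustrative):

#include <cstdint>
#include <cstdio>

enum { NOT_FINISHED = 0, LAST_CHUNK = 1, FINISHED = 2 }; // illustrative values

static int classifyChunk(int64_t &Lb, int64_t &Ub, int64_t LoopUb) {
  if (Lb <= LoopUb && Ub < LoopUb)
    return NOT_FINISHED; // whole chunk lies inside the loop
  if (Lb <= LoopUb) {
    Ub = LoopUb;         // partial chunk: clip to the loop upper bound
    return LAST_CHUNK;
  }
  Lb = LoopUb + 2;       // empty chunk: leave bounds past the end
  Ub = LoopUb + 1;
  return FINISHED;
}

int main() {
  const int64_t LoopUb = 9;
  int64_t Lb = 4, Ub = 7;
  std::printf("[4,7]   -> %d\n", classifyChunk(Lb, Ub, LoopUb));
  Lb = 8; Ub = 11;
  int Res = classifyChunk(Lb, Ub, LoopUb);
  std::printf("[8,11]  -> %d, ub clipped to %lld\n", Res, (long long)Ub);
  Lb = 12; Ub = 15;
  std::printf("[12,15] -> %d\n", classifyChunk(Lb, Ub, LoopUb));
  return 0;
}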
- if (myLb > ub) { - PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", - (long long)myLb, (long long)ub); - return DISPATCH_FINISHED; - } - // not finished, save current bounds - ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); - *plower = myLb; - T myUb = myLb + chunk - 1; // Clang uses i <= ub - if (myUb > ub) - myUb = ub; - *pupper = myUb; - *plast = (int32_t)(myUb == ub); - - // increment next lower bound by the stride - ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; - PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", - (long long)*plower, (long long)*pupper); - return DISPATCH_NOTFINISHED; - } - ASSERT0(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "bad sched"); - T myLb, myUb; - int finished = DynamicNextChunk( - myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); - - if (finished == FINISHED) - return DISPATCH_FINISHED; - - // not finished (either not finished or last chunk) - *plast = (int32_t)(finished == LAST_CHUNK); - *plower = myLb; - *pupper = myUb; - *pstride = 1; - - PRINT(LD_LOOP, - "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " - "last %d\n", - (int)GetNumberOfOmpThreads(isSPMDMode()), - (int)GetNumberOfWorkersInTeam(), (long long)*plower, - (long long)*pupper, (long long)*pstride, (int)*plast); - return DISPATCH_NOTFINISHED; - } - - INLINE static void dispatch_fini() { - // nothing - } - - //////////////////////////////////////////////////////////////////////////////// - // end of template class that encapsulate all the helper functions - //////////////////////////////////////////////////////////////////////////////// -}; - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (dyn loops) -//////////////////////////////////////////////////////////////////////////////// - -// init -EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid, - int32_t schedule, int32_t lb, int32_t ub, - int32_t st, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid, - int32_t schedule, uint32_t lb, uint32_t ub, - int32_t st, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid, - int32_t schedule, int64_t lb, int64_t ub, - int64_t st, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid, - int32_t schedule, uint64_t lb, uint64_t ub, - int64_t st, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -// next -EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, - int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, 
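The static-schedule branch of dispatch_next hands out one chunk per call and advances the saved NextLowerBound by the stride until it passes the loop bound. A host-side sketch of that bookkeeping, reusing the numbers from the static-chunk example above:

#include <cstdint>
#include <cstdio>

int main() {
  int64_t NextLowerBound = 0;                     // thread 0 of 2
  const int64_t LoopUpperBound = 9;
  const int64_t Chunk = 4, Stride = 8;            // as computed by ForStaticChunk
  while (NextLowerBound <= LoopUpperBound) {
    int64_t Lb = NextLowerBound;
    int64_t Ub = Lb + Chunk - 1;                  // inclusive bound
    if (Ub > LoopUpperBound)
      Ub = LoopUpperBound;                        // clip the trailing chunk
    std::printf("chunk [%lld, %lld] last=%d\n", (long long)Lb, (long long)Ub,
                (int)(Ub == LoopUpperBound));
    NextLowerBound += Stride;                     // next chunk owned by this thread
  }
  return 0;
}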
p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, - int32_t *p_last, uint32_t *p_lb, - uint32_t *p_ub, int32_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, - int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, - int32_t *p_last, uint64_t *p_lb, - uint64_t *p_ub, int64_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -// fini -EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (static loops) -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN -void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t 
incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, uint32_t *plower, - uint32_t *pupper, int32_t *pstride, - int32_t incr, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, uint64_t *plower, - uint64_t *pupper, int64_t *pstride, - int64_t incr, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_4_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_4u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_8_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_8u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_for_static_fini\n"); -} - -namespace { -INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { - int NumWarps = ((NumThreads + WARPSIZE 
- 1) / WARPSIZE); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp (at least, all present for the kernel launch) participate in the - // barrier. This is enforced when launching the parallel region. An - // exception is when there are < WARPSIZE workers. In this case only 1 worker - // is started, so we don't need a barrier. - if (NumThreads > 1) { -#endif - named_sync(L1_BARRIER, WARPSIZE * NumWarps); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - } -#endif -} -}; // namespace - -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid, - int32_t varNum, void *array) { - PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Expected non-SPMD mode + initialized runtime."); - - omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); - uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc)); - uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); - for (unsigned i = 0; i < varNum; i++) { - // Reset buffer. - if (gtid == 0) - *Buffer = 0; // Reset to minimum loop iteration value. - - // Barrier. - syncWorkersInGenericMode(NumThreads); - - // Atomic max of iterations. - uint64_t *varArray = (uint64_t *)array; - uint64_t elem = varArray[i]; - (void)atomicMax((unsigned long long int *)Buffer, - (unsigned long long int)elem); - - // Barrier. - syncWorkersInGenericMode(NumThreads); - - // Read max value and update thread private array. - varArray[i] = *Buffer; - - // Barrier. - syncWorkersInGenericMode(NumThreads); - } -} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -46,7 +46,7 @@ /// FIXME: What if not all threads (warps) participate to the barrier? We may /// need to implement it differently template -__kmpc_impl_barrier(kmp_Ident *Loc, int32_t TID) { +void __kmpc_impl_barrier(kmp_Ident *Loc, int32_t TID) { // Try to justify SPMD mode first as it allows a simple barrier // implementation. bool InSPMD = IsSPMD || checkRuntimeUninitialized(Loc) || checkSPMDMode(Loc); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/worksharing.cpp b/openmp/libomptarget/deviceRTLs/nvptx/src/worksharing.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/worksharing.cpp @@ -0,0 +1,623 @@ +//===--- worksharing.cpp --- OpenMP worksharing constructs ------+- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the OpenMP runtime calls for +// worksharing constructs that use the same interface. +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +/// Compute the lower bound (\p LB), upper bound (\p UB), stride (\p Stride), +/// and last iteration flag (\p IsLast) for a loop with static scheduling and +/// chunk size \p ChunkSize. The \p EntityId describes the location of the +/// executing thread in the team and \p NumberOfEntities is the number of +/// threads in the team. 
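The target_impl.h hunk above only adds the missing void return type; the template parameter list itself is not visible in this extract. Judging from the body's use of IsSPMD, the fixed declaration presumably has a single non-type bool parameter, roughly:

// Assumed shape of the fixed declaration; the exact parameter list and any
// INLINE/DEVICE annotations are in target_impl.h.
template <bool IsSPMD>
void __kmpc_impl_barrier(kmp_Ident *Loc, int32_t TID);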
+/// +/// IVTy is type of the loop iteration variable and SIVTy is the signed version +/// of IVTy. +template +INLINE static void ForStaticChunk(int &last, IVTy &LB, IVTy &UB, SIVTy &Stride, + SIVTy ChunkSize, IVTy EntityId, + IVTy NumberOfEntities) { + // Each thread executes multiple chunks all of the same size. + + // Distance between two successive chunks + Stride = NumberOfEntities * ChunkSize; + LB = LB + EntityId * ChunkSize; + + // Adjust the upper bound by one to match the less-than (<=) comparison + // Clang uses, e.g., the loop condition will be i <= UB not i < UB. + IVTy InputUb = UB; + UB = LB + ChunkSize - 1; + + // Say UB' is the begining of the last chunk. Then who ever has a + // lower bound plus a multiple of the increment equal to UB' is + // the last one. + IVTy beginingLastChunk = InputUb - (InputUb % ChunkSize); + last = ((beginingLastChunk - LB) % Stride) == 0; +} + +/// Compute the lower bound (\p LB), upper bound (\p UB), stride (\p Stride), +/// and last iteration flag (\p IsLast) for a loop with static scheduling and +/// chunk size \p ChunkSize. The \p EntityId describes the location of the +/// executing thread in the team and \p NumberOfEntities is the number of +/// threads in the team. +/// +/// IVTy is type of the loop iteration variable and SIVTy is the signed version +/// of IVTy. +template +INLINE static void ForStaticNoChunk(int &IsLast, IVTy &LB, IVTy &UB, + SIVTy &Stride, SIVTy &ChunkSize, + IVTy EntityId, IVTy NumberOfEntities) { + // No chunk size specified. Each thread gets at most one chunk; chunks are + // all almost of equal size + IVTy loopSize = UB - LB + 1; + + ChunkSize = loopSize / NumberOfEntities; + IVTy leftOver = loopSize - ChunkSize * NumberOfEntities; + + if (EntityId < leftOver) { + ChunkSize++; + LB = LB + EntityId * ChunkSize; + } else { + LB = LB + EntityId * ChunkSize + leftOver; + } + + // Adjust the upper bound by one to match the less-than (<=) comparison + // Clang uses, e.g., the loop condition will be i <= UB not i < UB. + IVTy InputUb = UB; + UB = LB + ChunkSize - 1; + + IsLast = LB <= InputUb && InputUb <= UB; + + // Make sure we only do 1 chunk per warp. + Stride = loopSize; +} + +/// Calculate initial bounds for static loop and stride +/// +/// \param[in] GlobalTID global thread id +/// \param[in] ScheduleType type of scheduling (see omptarget-nvptx.h) +/// \param[in] IsLastIterPtr pointer to last iteration +/// \param[in,out] LowerBoundPtr to loop lower bound. it will contain value of +/// lower bound of first chunk +/// \param[in,out] UpperBoundPtr to loop upper bound. It will contain value of +/// upper bound of first chunk +/// \param[in,out] StridePtr to loop stride. It will contain value of stride +/// between two successive chunks executed by the same thread +/// \param[in] ChunkSize +/// \param[in] IsSPMDExecutionMode flag to indicate SPMD-mode +/// +/// IVTy is type of the loop iteration variable and SIVTy is the signed version +/// of IVTy. +template +INLINE static void for_static_init(int32_t GlobalTID, int32_t ScheduleType, + int32_t *IsLastIterPtr, IVTy *LowerBoundPtr, + IVTy *UpperBoundPtr, SIVTy *StridePtr, + SIVTy ChunkSize, bool IsSPMDExecutionMode) { + // When IsSPMDExecutionMode is true, we assume that the caller is + // in an L0 parallel region and that all worker threads participate. 
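A host-side sketch of the split ForStaticNoChunk produces: the loop is divided into one near-equal chunk per thread, and the first leftOver threads take one extra iteration (the numbers are a worked example, not runtime defaults):

#include <cstdint>
#include <cstdio>

int main() {
  const int32_t NumThreads = 3;
  const int32_t LoopLB = 0, LoopUB = 9;                // 10 iterations
  const int32_t LoopSize = LoopUB - LoopLB + 1;
  for (int32_t Id = 0; Id < NumThreads; ++Id) {
    int32_t Chunk = LoopSize / NumThreads;             // 3
    int32_t LeftOver = LoopSize - Chunk * NumThreads;  // 1
    int32_t LB = LoopLB;
    if (Id < LeftOver) {
      Chunk++;                                         // first threads take one extra
      LB += Id * Chunk;
    } else {
      LB += Id * Chunk + LeftOver;
    }
    int32_t UB = LB + Chunk - 1;                       // inclusive, Clang emits i <= UB
    std::printf("tid %d: [%d, %d]\n", Id, LB, UB);     // [0,3] [4,6] [7,9]
  }
  return 0;
}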
+ + // Assume we are in teams region or that we use a single block + // per target region + SIVTy numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode); + + // All warps that are in excess of the maximum requested, do + // not execute the loop + PRINT(LD_LOOP, + "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " + "%d, num tids %d\n", + (int)GlobalTID, (int)ScheduleType, (long long)ChunkSize, (int)GlobalTID, + (int)numberOfActiveOMPThreads); + ASSERT0(LT_FUSSY, GlobalTID < numberOfActiveOMPThreads, + "current thread is not needed here; error"); + + // copy + int IsLastIter = 0; + IVTy LB = *LowerBoundPtr; + IVTy UB = *UpperBoundPtr; + SIVTy Stride = *StridePtr; + // init + switch (SCHEDULE_WITHOUT_MODIFIERS(ScheduleType)) { + case kmp_sched_static_chunk: { + if (ChunkSize > 0) { + ForStaticChunk(IsLastIter, LB, UB, Stride, ChunkSize, GlobalTID, + numberOfActiveOMPThreads); + break; + } + } // note: if ChunkSize <=0, use nochunk + case kmp_sched_static_balanced_chunk: { + if (ChunkSize > 0) { + // round up to make sure the chunk is enough to cover all iterations + IVTy tripCount = UB - LB + 1; // +1 because UB is inclusive + IVTy span = + (tripCount + numberOfActiveOMPThreads - 1) / numberOfActiveOMPThreads; + // perform chunk adjustment + ChunkSize = (span + ChunkSize - 1) & ~(ChunkSize - 1); + + ASSERT0(LT_FUSSY, UB >= LB, "ub must be >= lb."); + IVTy oldUb = UB; + ForStaticChunk(IsLastIter, LB, UB, Stride, ChunkSize, GlobalTID, + numberOfActiveOMPThreads); + if (UB > oldUb) + UB = oldUb; + break; + } + } // note: if ChunkSize <=0, use nochunk + case kmp_sched_static_nochunk: { + ForStaticNoChunk(IsLastIter, LB, UB, Stride, ChunkSize, GlobalTID, + numberOfActiveOMPThreads); + break; + } + case kmp_sched_distr_static_chunk: { + if (ChunkSize > 0) { + ForStaticChunk(IsLastIter, LB, UB, Stride, ChunkSize, GetOmpTeamId(), + GetNumberOfOmpTeams()); + break; + } // note: if ChunkSize <=0, use nochunk + } + case kmp_sched_distr_static_nochunk: { + ForStaticNoChunk(IsLastIter, LB, UB, Stride, ChunkSize, GetOmpTeamId(), + GetNumberOfOmpTeams()); + break; + } + case kmp_sched_distr_static_chunk_sched_static_chunkone: { + ForStaticChunk(IsLastIter, LB, UB, Stride, ChunkSize, + numberOfActiveOMPThreads * GetOmpTeamId() + GlobalTID, + GetNumberOfOmpTeams() * numberOfActiveOMPThreads); + break; + } + default: { + ASSERT(LT_FUSSY, FALSE, "unknown ScheduleType %d", (int)ScheduleType); + PRINT(LD_LOOP, "unknown ScheduleType %d, revert back to static chunk\n", + (int)ScheduleType); + ForStaticChunk(IsLastIter, LB, UB, Stride, ChunkSize, GlobalTID, + numberOfActiveOMPThreads); + break; + } + } + // copy back + *IsLastIterPtr = IsLastIter; + *LowerBoundPtr = LB; + *UpperBoundPtr = UB; + *StridePtr = Stride; + PRINT(LD_LOOP, + "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " + "%d\n", + (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), + (long long)(*LowerBoundPtr), (long long)(*UpperBoundPtr), + (long long)(*StridePtr), (int)IsLastIter); +} + +#define FOR_STATIC_GEN(SUFFIX, IVTy, SIVTy, TIDTy, SPMD) \ + EXTERN void __kmpc_for_static_init##SUFFIX( \ + kmp_Ident *Loc, TIDTy TId, TIDTy ScheduleType, TIDTy *IsLast, IVTy *LB, \ + IVTy *UB, SIVTy *Stride, SIVTy Incr, SIVTy ChunkSize) { \ + PRINT0(LD_IO, "call kmpc_for_static_init" #SUFFIX "\n"); \ + for_static_init(TId, ScheduleType, IsLast, LB, UB, Stride, \ + ChunkSize, SPMD); \ + } + +FOR_STATIC_GEN(_4, int32_t, int32_t, int32_t, checkSPMDMode(Loc)) +FOR_STATIC_GEN(_4u, uint32_t, int32_t, int32_t, 
checkSPMDMode(Loc)) +FOR_STATIC_GEN(_8, int64_t, int64_t, int32_t, checkSPMDMode(Loc)) +FOR_STATIC_GEN(_8u, uint64_t, int64_t, int32_t, checkSPMDMode(Loc)) +FOR_STATIC_GEN(_4_simple_spmd, int32_t, int32_t, int32_t, true) +FOR_STATIC_GEN(_4u_simple_spmd, uint32_t, int32_t, int32_t, true) +FOR_STATIC_GEN(_8_simple_spmd, int64_t, int64_t, int32_t, true) +FOR_STATIC_GEN(_8u_simple_spmd, uint64_t, int64_t, int32_t, true) +FOR_STATIC_GEN(_4_simple_generic, int32_t, int32_t, int32_t, false) +FOR_STATIC_GEN(_4u_simple_generic, uint32_t, int32_t, int32_t, false) +FOR_STATIC_GEN(_8_simple_generic, int64_t, int64_t, int32_t, false) +FOR_STATIC_GEN(_8u_simple_generic, uint64_t, int64_t, int32_t, false) +#undef FOR_STATIC_GEN + +EXTERN void __kmpc_for_static_fini(kmp_Ident *Loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_for_static_fini\n"); +} + +/// Return true if \p Schedule guarantees an order for the loop iterations. +INLINE static int isOrderedSchedule(kmp_sched_t Schedule) { + return Schedule >= kmp_sched_ordered_first && + Schedule <= kmp_sched_ordered_last; +} + +/// +/// IVTy is type of the loop iteration variable and SIVTy is the signed version +/// of IVTy. +template +INLINE static void dispatch_init(kmp_Ident *Loc, int32_t threadId, + kmp_sched_t Schedule, IVTy LB, IVTy UB, + SIVTy st, SIVTy ChunkSize) { + bool IsSPMD = checkSPMDMode(Loc); + if (checkRuntimeUninitialized(Loc)) { + // In SPMD mode no need to check parallelism level - dynamic scheduling + // may appear only in L2 parallel regions with lightweight runtime. + ASSERT0(LT_FUSSY, IsSPMD, "Expected non-SPMD mode."); + return; + } + + int tid = GetLogicalThreadIdInBlock(IsSPMD); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); + IVTy tnum = GetNumberOfOmpThreads(IsSPMD); + IVTy tripCount = UB - LB + 1; // +1 because UB is inclusive + ASSERT0(LT_FUSSY, threadId < tnum, + "current thread is not needed here; error"); + + /* Currently just ignore the monotonic and non-monotonic modifiers + * (the compiler isn't producing them * yet anyway). + * When it is we'll want to look at them somewhere here and use that + * information to add to our schedule choice. We shouldn't need to pass + * them on, they merely affect which schedule we can legally choose for + * various dynamic cases. (In paritcular, whether or not a stealing scheme + * is legal). + */ + Schedule = SCHEDULE_WITHOUT_MODIFIERS(Schedule); + + // Process schedule. 
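For readability, this is roughly what one instantiation of the macro above expands to (the _4 variant); the explicit template arguments on for_static_init are assumed here, since the macro instantiates it with its IVTy/SIVTy parameters:

// Illustrative expansion of FOR_STATIC_GEN(_4, int32_t, int32_t, int32_t,
// checkSPMDMode(Loc)); not additional code in the patch.
EXTERN void __kmpc_for_static_init_4(kmp_Ident *Loc, int32_t TId,
                                     int32_t ScheduleType, int32_t *IsLast,
                                     int32_t *LB, int32_t *UB, int32_t *Stride,
                                     int32_t Incr, int32_t ChunkSize) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
  for_static_init<int32_t, int32_t>(TId, ScheduleType, IsLast, LB, UB, Stride,
                                    ChunkSize, checkSPMDMode(Loc));
}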
+ if (tnum == 1 || tripCount <= 1 || isOrderedSchedule(Schedule)) { + if (isOrderedSchedule(Schedule)) + __kmpc_barrier(Loc, threadId); + PRINT(LD_LOOP, + "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", + (long)tnum, (long long)tripCount, (int)Schedule); + Schedule = kmp_sched_static_chunk; + ChunkSize = tripCount; // one thread gets the whole loop + } else if (Schedule == kmp_sched_runtime) { + // process runtime + omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); + ChunkSize = currTaskDescr->RuntimeChunkSize(); + switch (rtSched) { + case omp_sched_static: { + if (ChunkSize > 0) + Schedule = kmp_sched_static_chunk; + else + Schedule = kmp_sched_static_nochunk; + break; + } + case omp_sched_auto: { + Schedule = kmp_sched_static_chunk; + ChunkSize = 1; + break; + } + case omp_sched_dynamic: + case omp_sched_guided: { + Schedule = kmp_sched_dynamic; + break; + } + } + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)Schedule, + (long long)ChunkSize); + } else if (Schedule == kmp_sched_auto) { + Schedule = kmp_sched_static_chunk; + ChunkSize = 1; + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)Schedule, + (long long)ChunkSize); + } else { + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)Schedule, + (long long)ChunkSize); + ASSERT(LT_FUSSY, + Schedule == kmp_sched_dynamic || Schedule == kmp_sched_guided, + "unknown schedule %d & chunk %lld\n", (int)Schedule, + (long long)ChunkSize); + } + + // init schedules + if (Schedule == kmp_sched_static_chunk) { + ASSERT0(LT_FUSSY, ChunkSize > 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = Schedule; + // save UB + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = UB; + // compute static chunk + SIVTy Stride; + int IsLastIter = 0; + ForStaticChunk(IsLastIter, LB, UB, Stride, ChunkSize, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = ChunkSize; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = LB; + omptarget_nvptx_threadPrivateContext->Stride(tid) = Stride; + PRINT( + LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(tid)); + } else if (Schedule == kmp_sched_static_balanced_chunk) { + ASSERT0(LT_FUSSY, ChunkSize > 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = Schedule; + // save UB + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = UB; + // compute static chunk + SIVTy Stride; + int IsLastIter = 0; + // round up to make sure the chunk is enough to cover all iterations + IVTy span = (tripCount + tnum - 1) / tnum; + // perform chunk adjustment + ChunkSize = (span + ChunkSize - 1) & ~(ChunkSize - 1); + + IVTy oldUb = UB; + ForStaticChunk(IsLastIter, LB, UB, Stride, ChunkSize, threadId, tnum); + ASSERT0(LT_FUSSY, UB >= LB, "ub must be >= lb."); + if (UB > oldUb) + UB = oldUb; + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = ChunkSize; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = LB; + omptarget_nvptx_threadPrivateContext->Stride(tid) = Stride; + PRINT( + LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = 
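The balanced-chunk branch above first computes span, the ceiling of tripCount over the thread count, and then rounds it up to a multiple of ChunkSize with a bitmask (the mask form is exact when ChunkSize is a power of two). A host-side sketch with worked numbers:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t TripCount = 1000, NumThreads = 6, ChunkSize = 8;
  int64_t Span = (TripCount + NumThreads - 1) / NumThreads;      // 167
  int64_t Balanced = (Span + ChunkSize - 1) & ~(ChunkSize - 1);  // 168
  std::printf("span %lld -> balanced chunk %lld\n", (long long)Span,
              (long long)Balanced);
  return 0;
}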
%llu\n", + (int)tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(tid)); + } else if (Schedule == kmp_sched_static_nochunk) { + ASSERT0(LT_FUSSY, ChunkSize == 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = Schedule; + // save UB + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = UB; + // compute static chunk + SIVTy Stride; + int IsLastIter = 0; + ForStaticNoChunk(IsLastIter, LB, UB, Stride, ChunkSize, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = ChunkSize; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = LB; + omptarget_nvptx_threadPrivateContext->Stride(tid) = Stride; + PRINT( + LD_LOOP, + "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(tid)); + } else if (Schedule == kmp_sched_dynamic || Schedule == kmp_sched_guided) { + // save data + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = Schedule; + if (ChunkSize < 1) + ChunkSize = 1; + omptarget_nvptx_threadPrivateContext->Chunk(tid) = ChunkSize; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = UB; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = LB; + __kmpc_barrier(Loc, threadId); + if (tid == 0) { + omptarget_nvptx_threadPrivateContext->Cnt() = 0; + __threadfence_block(); + } + __kmpc_barrier(Loc, threadId); + PRINT(LD_LOOP, + "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 + ", chunk %" PRIu64 "\n", + (int)tnum, + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + omptarget_nvptx_threadPrivateContext->Chunk(tid)); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Support for dispatch next + +INLINE static int64_t shuffle(unsigned active, int64_t val, int leader) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); + hi = __SHFL_SYNC(active, hi, leader); + lo = __SHFL_SYNC(active, lo, leader); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); + return val; +} + +INLINE static uint64_t nextIter() { + unsigned int active = __ACTIVEMASK(); + int leader = __ffs(active) - 1; + int change = __popc(active); + unsigned lane_mask_lt; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt)); + unsigned int rank = __popc(active & lane_mask_lt); + uint64_t warp_res; + if (rank == 0) { + warp_res = atomicAdd( + (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(), + change); + } + warp_res = shuffle(active, warp_res, leader); + return warp_res + rank; +} + +INLINE static int DynamicNextChunk(IVTy &LB, IVTy &UB, IVTy chunkSize, + IVTy loopLowerBound, IVTy loopUpperBound) { + IVTy N = nextIter(); + LB = loopLowerBound + N * chunkSize; + UB = LB + chunkSize - 1; // Clang uses i <= UB + + // 3 result cases: + // a. LB and UB < loopUpperBound --> NOT_FINISHED + // b. LB < loopUpperBound and UB >= loopUpperBound: last chunk --> + // NOT_FINISHED + // c. LB and UB >= loopUpperBound: empty chunk --> FINISHED + // a. 
+ if (LB <= loopUpperBound && UB < loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", + (long long)LB, (long long)UB, (long long)loopUpperBound); + return NOT_FINISHED; + } + // b. + if (LB <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", + (long long)LB, (long long)UB, (long long)loopUpperBound); + UB = loopUpperBound; + return LAST_CHUNK; + } + // c. if we are here, we are in case 'c' + LB = loopUpperBound + 2; + UB = loopUpperBound + 1; + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)LB, + (long long)UB, (long long)loopUpperBound); + return FINISHED; +} + +INLINE static int dispatch_next(kmp_Ident *Loc, int32_t GlobalTID, + int32_t *IsLastIterPtr, IVTy *LowerBoundPtr, + IVTy *UpperBoundPtr, SIVTy *StridePtr) { + bool IsSPMD = checkSPMDMode(Loc); + if (checkRuntimeUninitialized(Loc)) { + // In SPMD mode no need to check parallelism level - dynamic scheduling + // may appear only in L2 parallel regions with lightweight runtime. + ASSERT0(LT_FUSSY, IsSPMD, "Expected non-SPMD mode."); + if (*IsLastIterPtr) + return DISPATCH_FINISHED; + *IsLastIterPtr = 1; + return DISPATCH_NOTFINISHED; + } + // ID of a thread in its own warp + + // automatically selects thread or warp ID based on selected implementation + int tid = GetLogicalThreadIdInBlock(IsSPMD); + ASSERT0(LT_FUSSY, GlobalTID < GetNumberOfOmpThreads(IsSPMD), + "current thread is not needed here; error"); + // retrieve schedule + kmp_sched_t Schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(tid); + + // xxx reduce to one + if (Schedule == kmp_sched_static_chunk || + Schedule == kmp_sched_static_nochunk) { + IVTy myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); + IVTy UB = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); + // finished? 
+ if (myLb > UB) { + PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", + (long long)myLb, (long long)UB); + return DISPATCH_FINISHED; + } + // not finished, save current bounds + SIVTy ChunkSize = omptarget_nvptx_threadPrivateContext->Chunk(tid); + *LowerBoundPtr = myLb; + IVTy myUb = myLb + ChunkSize - 1; // Clang uses i <= ub + if (myUb > UB) + myUb = UB; + *UpperBoundPtr = myUb; + *IsLastIterPtr = (int32_t)(myUb == UB); + + // increment next lower bound by the stride + SIVTy Stride = omptarget_nvptx_threadPrivateContext->Stride(tid); + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + Stride; + PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", + (long long)*LowerBoundPtr, (long long)*UpperBoundPtr); + return DISPATCH_NOTFINISHED; + } + ASSERT0(LT_FUSSY, + Schedule == kmp_sched_dynamic || Schedule == kmp_sched_guided, + "bad sched"); + IVTy myLb, myUb; + int finished = DynamicNextChunk( + myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + + if (finished == FINISHED) + return DISPATCH_FINISHED; + + // not finished (either not finished or last chunk) + *IsLastIterPtr = (int32_t)(finished == LAST_CHUNK); + *LowerBoundPtr = myLb; + *UpperBoundPtr = myUb; + *StridePtr = 1; + + PRINT(LD_LOOP, + "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " + "last %d\n", + (int)GetNumberOfOmpThreads(isSPMDMode()), + (int)GetNumberOfWorkersInTeam(), (long long)*LowerBoundPtr, + (long long)*UpperBoundPtr, (long long)*StridePtr, (int)*IsLastIterPtr); + return DISPATCH_NOTFINISHED; +} + +#define DISPATCH_GEN(SUFFIX, IVTy, SIVTy, TIDTy) \ + EXTERN void __kmpc_dispatch_init##SUFFIX(kmp_Ident *Loc, TIDTy TID, \ + TIDTy Schedule, IVTy LB, IVTy UB, \ + SIVTy Stride, SIVTy ChunkSize) { \ + PRINT0(LD_IO, "call kmpc_dispatch_init" #SUFFIX "\n"); \ + dispatch_init(Loc, TID, (kmp_sched_t)Schedule, LB, UB, \ + Stride, ChunkSize); \ + } \ + EXTERN void __kmpc_dispatch_next##SUFFIX(kmp_Ident *Loc, TIDTy TID, \ + TIDTy *IsLast, IVTy LB, IVTy UB, \ + SIVTy Stride) { \ + PRINT0(LD_IO, "call kmpc_dispatch_next" #SUFFIX "\n"); \ + dispatch_next(Loc, TID, IsLast, LB, UB, Stride); \ + } \ + EXTERN void __kmpc_dispatch_fini##SUFFIX(kmp_Ident *Loc, TIDTy TID) { \ + PRINT0(LD_IO, "call kmpc_dispatch_fini" #SUFFIX "\n"); \ + } + +DISPATCH_GEN(_4, int32_t, int32_t, int32_t) +DISPATCH_GEN(_4u, uint32_t, int32_t, int32_t) +DISPATCH_GEN(_8, int64_t, int64_t, int32_t) +DISPATCH_GEN(_8u, uint64_t, int64_t, int32_t) +#undef DISPATCH_GEN + +static INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { + int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < WARPSIZE workers. In this case only 1 worker + // is started, so we don't need a barrier. 
+ if (NumThreads > 1) { +#endif + named_sync(L1_BARRIER, WARPSIZE * NumWarps); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + } +#endif +} + +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *Loc, + int32_t GlobalTID, + int32_t varNum, void *array) { + PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(Loc), + "Expected non-SPMD mode + initialized runtime."); + + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(Loc)); + uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); + for (unsigned i = 0; i < varNum; i++) { + // Reset buffer. + if (GlobalTID == 0) + *Buffer = 0; // Reset to minimum loop iteration value. + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Atomic max of iterations. + uint64_t *varArray = (uint64_t *)array; + uint64_t elem = varArray[i]; + (void)atomicMax((unsigned long long int *)Buffer, + (unsigned long long int)elem); + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Read max value and update thread private array. + varArray[i] = *Buffer; + + // Barrier. + syncWorkersInGenericMode(NumThreads); + } +}
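__kmpc_reduce_conditional_lastprivate combines, per conditional-lastprivate variable, the largest loop iteration any thread wrote, using an atomicMax on a team buffer between worker barriers. A standalone CUDA sketch of that pattern, with __syncthreads() standing in for the runtime's named worker barrier and a __shared__ variable standing in for the team descriptor's buffer:

#include <cstdint>
#include <cstdio>

__global__ void conditionalLastprivateMax(uint64_t *PerThreadIter) {
  __shared__ unsigned long long Buffer;           // needs sm_35+ for 64-bit atomicMax
  if (threadIdx.x == 0)
    Buffer = 0;                                   // reset to minimum iteration
  __syncthreads();
  atomicMax(&Buffer, (unsigned long long)PerThreadIter[threadIdx.x]);
  __syncthreads();
  PerThreadIter[threadIdx.x] = Buffer;            // every thread reads the team max
  __syncthreads();
}

int main() {
  const int N = 32;
  uint64_t Host[N];
  for (int i = 0; i < N; ++i)
    Host[i] = (uint64_t)(i * 3 % 17);             // arbitrary per-thread iterations
  uint64_t *Dev;
  cudaMalloc(&Dev, N * sizeof(uint64_t));
  cudaMemcpy(Dev, Host, N * sizeof(uint64_t), cudaMemcpyHostToDevice);
  conditionalLastprivateMax<<<1, N>>>(Dev);
  cudaMemcpy(Host, Dev, N * sizeof(uint64_t), cudaMemcpyDeviceToHost);
  std::printf("team max iteration: %llu\n", (unsigned long long)Host[0]);
  cudaFree(Dev);
  return 0;
}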