Index: libomptarget/CMakeLists.txt =================================================================== --- libomptarget/CMakeLists.txt +++ libomptarget/CMakeLists.txt @@ -67,6 +67,7 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) +add_subdirectory(deviceRTLs) # Add tests. add_subdirectory(test) Index: libomptarget/deviceRTLs/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/CMakeLists.txt @@ -0,0 +1,14 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +# ##===----------------------------------------------------------------------===## +# +# Build a device RTL for each available machine available. +# +##===----------------------------------------------------------------------===## + +add_subdirectory(nvptx) Index: libomptarget/deviceRTLs/nvptx/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -0,0 +1,203 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available +# +##===----------------------------------------------------------------------===## + +set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING + "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.") + +if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER) + find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER}) + if(NOT ALTERNATE_CUDA_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.") + endif() + set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE) +endif() + +# We can't use clang as nvcc host preprocessor, so we attempt to replace it with +# gcc. +if(CUDA_HOST_COMPILER MATCHES clang) + + find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc) + + if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.") + libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of of valid compiler.") + return() + endif() + set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE) +endif() + +if(LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_say("Building CUDA offloading device RTL.") + + # We really don't have any host code, so we don't need to care about + # propagating host flags. + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + + set(cuda_src_files + src/cancel.cu + src/critical.cu + src/data_sharing.cu + src/libcall.cu + src/loop.cu + src/omptarget-nvptx.cu + src/parallel.cu + src/reduction.cu + src/sync.cu + src/task.cu + ) + + set(omp_data_objects src/omp_data.cu) + + # Get the compute capability the user requested or use SM_35 by default. + # SM_35 is what clang uses by default. 
+ set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY 35 CACHE STRING + "CUDA Compute Capability to be used to compile the NVPTX device RTL.") + set(CUDA_ARCH -arch sm_${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) + + # Activate RTL message dumps if requested by the user. + set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL + "Activate NVPTX device RTL debug messages.") + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v) + endif() + + # NVPTX runtime library has to be statically linked. Dynamic linking is not + # yet supported by the CUDA toolchain on the device. + set(BUILD_SHARED_LIBS OFF) + set(CUDA_SEPARABLE_COMPILATION ON) + + cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} + OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) + + # Install device RTL under the lib destination folder. + install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "lib") + + target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) + + # Check if we can create an LLVM bitcode implementation of the runtime library + # that could be inlined in the user implementation. + set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB FALSE CACHE BOOL + "Enable CUDA LLVM bitcode offloading device RTL.") + if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) + + # Find a clang compiler capable of compiling cuda files to LLVM bitcode and + # an LLVM linker. + # We use the one provided by the user, attempt to use the one used to build + # libomptarget, attempt to use clang in the PATH, or just fail. + + set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") + set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + + if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) + elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) + else() + find_program(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER clang++) + if(NOT LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER) + libomptarget_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.") + libomptarget_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER") + endif() + endif() + + if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) + elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + # Assuming that if we are building using clang then llvm-link is also present + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER llvm-link) + else() + find_program(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER llvm-link) + if(NOT LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER) + libomptarget_say("Cannot find a linker capable of linking LLVM bitcode objects.") + libomptarget_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER") + endif() + endif() + + if(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER AND LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER) + libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") + + # Decide which ptx version to use. Same choices as Clang. + if(CUDA_VERSION_MAJOR GREATER 9 OR CUDA_VERSION_MAJOR EQUAL 9) + set(CUDA_PTX_VERSION ptx60) + else() + set(CUDA_PTX_VERSION ptx42) + endif() + + # Set flags for Clang cuda compilation. Only Clang is supported because there is + # no other compiler capable of generating bitcode from cuda sources. 
+ set(CUDA_FLAGS + -emit-llvm + -O1 + -Xclang -target-feature + -Xclang +${CUDA_PTX_VERSION} + --cuda-device-only + -DOMPTARGET_NVPTX_TEST=0 -DOMPTARGET_NVPTX_DEBUG=0 + ) + + # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared + # to handle. Therefore, we use 'weak' instead. We are compiling only for the + # device, so it should be equivalent. + if(CUDA_VERSION_MAJOR EQUAL 9) + set(CUDA_FLAGS ${CUDA_FLAGS} -Dnv_weak=weak) + endif() + + # Get the compute capability the user requested or use SM_35 by default. + set(CUDA_ARCH "") + set(CUDA_ARCH --cuda-gpu-arch=sm_${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) + + # Compile cuda files to bitcode. + set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + + add_custom_command(OUTPUT ${outfile}.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${CUDA_FLAGS} ${CUDA_ARCH} ${CUDA_INCLUDES} + -c ${infile} -o ${outfile}.bc + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}.bc" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}.bc) + + list(APPEND bc_files ${outfile}.bc) + endforeach() + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode libomptarget-nvptx.bc" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx.bc) + + add_custom_target(omptarget-nvptx-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc) + + # Copy library to destination. + add_custom_command(TARGET omptarget-nvptx-bc POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc + $) + + # Install device RTL under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc DESTINATION "lib") + + endif() + endif() + +else() + libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.") +endif() Index: libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt @@ -0,0 +1,502 @@ + +**Design document for OpenMP reductions on the GPU** + +//Abstract: //In this document we summarize the new design for an OpenMP +implementation of reductions on NVIDIA GPUs. This document comprises +* a succinct background review, +* an introduction to the decoupling of reduction algorithm and + data-structure-specific processing routines, +* detailed illustrations of reduction algorithms used and +* a brief overview of steps we have made beyond the last implementation. + +**Problem Review** + +Consider a typical OpenMP program with reduction pragma. + +``` + double foo, bar; + #pragma omp parallel for reduction(+:foo, bar) + for (int i = 0; i < N; i++) { + foo+=A[i]; bar+=B[i]; + } +``` +where 'foo' and 'bar' are reduced across all threads in the parallel region. +Our primary goal is to efficiently aggregate the values of foo and bar in +such manner that +* makes the compiler logically concise. +* efficiently reduces within warps, threads, blocks and the device. 
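+
+For reference, here is a minimal stand-alone CUDA sketch (illustrative only;
+the helper name warp_sum_full is hypothetical and not part of the runtime
+sources) of the shuffle-based full-warp summation of a single double. The
+algorithms described below generalize exactly this pattern to arbitrary
+ReduceData structures and reduction operators:
+
+```
+__device__ double warp_sum_full(double val) {
+  // Assumes all 32 lanes of the warp are active (the "full warp" case).
+  for (int offset = 32 / 2; offset > 0; offset /= 2)
+    val += __shfl_down_sync(0xffffffffu, val, offset); // CUDA 9+ intrinsic
+  return val; // lane 0 ends up holding the warp-wide sum
+}
+```
+
+In the design below, the hard-coded addition above is replaced by a
+compiler-generated, data-structure-agnostic reduction function so that the
+same loop structure works for any element type and operator.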
+
+**Introduction to Decoupling**
+In this section we address the problem of making the compiler
+//logically concise// by partitioning the task of reduction into two broad
+categories: data-structure-specific routines and algorithmic routines.
+
+The previous reduction implementation was tightly coupled to the specifics
+of the reduction element data structures (e.g., sizes, data types) and the
+operators of the reduction (e.g., addition, multiplication). In our
+implementation we strive to decouple them; in the final implementation we
+were able to remove all template functions from our runtime system.
+
+The (simplified) pseudo code generated by LLVM is as follows:
+
+```
+ 1. Create private copies of variables: foo_p, bar_p
+ 2. Each thread reduces the chunk of A and B assigned to it and writes
+    to foo_p and bar_p respectively.
+ 3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn,
+    interWarpCpyFn)
+    where:
+      struct ReduceData {
+        double *foo;
+        double *bar;
+      } reduceData
+      reduceData.foo = &foo_p
+      reduceData.bar = &bar_p
+
+    shuffleReduceFn and interWarpCpyFn are two auxiliary functions
+    generated to aid the runtime in performing the algorithmic steps
+    while remaining agnostic about the layout of ReduceData.
+
+    In particular, shuffleReduceFn is a function that takes the following
+    inputs:
+    a. a local copy of ReduceData
+    b. its lane_id
+    c. the offset, relative to the current lane, of the lane_id which
+       hosts a remote ReduceData
+    d. an algorithm version parameter determining which reduction
+       algorithm to use.
+    shuffleReduceFn retrieves the remote ReduceData through shuffle
+    intrinsics and, using the algorithm specified by the 4th parameter,
+    reduces the local ReduceData with the remote ReduceData element wise,
+    placing the resulting values into the local ReduceData.
+
+    Different reduction algorithms are implemented by different runtime
+    functions, but they all call this same shuffleReduceFn to perform the
+    essential reduction step. Based on the 4th parameter, shuffleReduceFn
+    therefore behaves slightly differently so as to cooperate with the
+    runtime function and ensure correctness under different circumstances.
+
+    InterWarpCpyFn, as the name suggests, is a function that copies data
+    across warps. Its job is to tunnel all the thread-private ReduceData
+    that has already been reduced within a warp to a lane in the first warp
+    with minimal shared memory footprint. This is an essential step in
+    preparing for the final step of a block reduction.
+
+    (Warp, block, and device level reduction routines that use these
+    auxiliary functions are discussed in the next section.)
+
+ 4. if ret == 1:
+    The master thread stores the reduced result in the globals.
+    foo += reduceData.foo; bar += reduceData.bar
+```
+
+**Reduction Algorithms**
+
+On the warp level, we have three versions of the algorithm:
+
+1. Full Warp Reduction
+
+```
+gpu_regular_warp_reduce(void *reduce_data,
+                        kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
+    ShuffleReduceFn(reduce_data, 0, offset, 0);
+}
+```
+ShuffleReduceFn is called here with lane_id set to 0 because lane_id is not
+used by this algorithm; we thereby save the instructions needed to retrieve
+lane_id from the corresponding special registers. The 4th parameter, which
+represents the version of the algorithm being used, is set to 0 to signify
+full warp reduction.
+
+When this version (=0) is specified, ShuffleReduceFn behaves, per element,
+as follows:
+
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+reduce_elem = reduce_elem @ remote_elem;
+```
+
+An illustration of this algorithm operating on a hypothetical 8-lane full
+warp would be:
+{F74}
+The coloring invariant is that elements with the same color are combined
+and reduced in the next reduction step. As can be observed, no overhead is
+present; exactly log(2, N) steps are needed.
+
+2. Contiguous Partial Warp Reduction
+```
+gpu_irregular_warp_reduce(void *reduce_data,
+                          kmp_ShuffleReductFctPtr ShuffleReduceFn,
+                          int size, int lane_id) {
+  int curr_size;
+  int offset;
+  curr_size = size;
+  offset = curr_size/2;
+  while (offset>0) {
+    ShuffleReduceFn(reduce_data, lane_id, offset, 1);
+    curr_size = (curr_size+1)/2;
+    offset = curr_size/2;
+  }
+}
+```
+
+When this version (=1) is specified, ShuffleReduceFn behaves, per element,
+as follows:
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+if (lane_id < offset) {
+  reduce_elem = reduce_elem @ remote_elem
+} else {
+  reduce_elem = remote_elem
+}
+```
+
+An important invariant (also a restriction on the starting state of the
+reduction) is that this algorithm assumes that all unused ReduceData are
+located in a contiguous subset of threads in a warp starting from lane 0.
+
+If there is a trailing active lane with an odd-numbered lane id, its value
+will not be aggregated with any other lane. Therefore, in order to preserve
+the invariant, such ReduceData is copied to the first lane whose
+thread-local ReduceData has already been used in a previous reduction step
+and would therefore be useless otherwise.
+
+An illustration of this algorithm operating on a hypothetical 8-lane partial
+warp would be:
+{F75}
+
+As illustrated, this version of the algorithm introduces overhead to copy
+data between lanes whenever a reduction step has an odd number of
+participating lanes.
+
+3. Dispersed Partial Warp Reduction
+```
+gpu_irregular_simt_reduce(void *reduce_data,
+                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  int size, remote_id;
+  int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2;
+  do {
+    remote_id = find_the_next_active_lane_id_right_after_me();
+    // the above function returns 0 if no active lane
+    // is present right after the current thread.
+    size = get_number_of_active_lanes_in_this_warp();
+    logical_lane_id /= 2;
+    ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2);
+  } while (logical_lane_id % 2 == 0 && size > 1);
+}
+```
+
+No assumption is made about the initial state of the reduction: any number
+of lanes (>=1) may be active at any position. The reduction result is kept
+in the first active lane.
+
+When this version (=2) is specified, ShuffleReduceFn behaves, per element,
+as follows:
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+if (LaneId % 2 == 0 && Offset > 0) {
+  reduce_elem = reduce_elem @ remote_elem
+} else {
+  reduce_elem = remote_elem
+}
+```
+We now briefly explain some of the arguments passed in. Note that this
+section introduces the concept of a logical_lane_id, which must be
+distinguished from the physical lane_id as defined by nvidia.
+1. //logical_lane_id//: as the name suggests, it refers to the calculated
+   lane_id (instead of the physical one defined by nvidia) that makes
+   our algorithm logically concise. A thread with logical_lane_id k means
+   there are (k-1) threads before it.
+2. //remote_id-1-threadIdx.x//: remote_id is the nvidia-defined lane id of
+   the remote lane from which we retrieve the ReduceData. We subtract
+   (threadIdx.x+1) from it because we would like to maintain only one
+   underlying shuffle intrinsic (which is used to communicate among lanes
+   in a warp). The particular shuffle intrinsic we use accepts only offsets,
+   not absolute lane ids, so the subtraction converts the absolute lane id
+   we calculated into an offset.
+
+This algorithm differs from the previous ones in two ways and is not,
+conceptually, a generalization of them.
+1. It reduces elements that are close to each other. For instance, the value
+   in the 0th lane is combined with that of the 1st lane, and the value in
+   the 2nd lane is combined with that of the 3rd lane. We did not use the
+   previous scheme, where the first half of the (partial) warp is reduced
+   with the second half, because the mapping f(x): logical_lane_id ->
+   physical_lane_id can be easily calculated, whereas its inverse f^-1(x):
+   physical_lane_id -> logical_lane_id cannot, and performing such a
+   reduction requires the inverse to be known.
+2. Because this algorithm is agnostic about the positions of the active
+   lanes, we do not need to perform the copying step used in the second
+   algorithm.
+An illustrative run would look like:
+{F76}
+As observed, overhead is high because in each and every reduction step both
+logical_lane_id and remote_id are recalculated.
+
+On a block level, we have implemented the following block reduce algorithm:
+
+```
+gpu_irregular_block_reduce(void *reduce_data,
+                           kmp_ShuffleReductFctPtr shuflReduceFn,
+                           kmp_InterWarpCopyFctPtr interWarpCpyFn,
+                           int size) {
+
+  int wid = threadIdx.x/WARPSIZE;
+  int lane_id = threadIdx.x%WARPSIZE;
+
+  int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division
+
+  unsigned tnum = __ballot(1);
+  int thread_num = __popc(tnum);
+
+  //full warp reduction
+  if (thread_num == WARPSIZE) {
+    gpu_regular_warp_reduce(reduce_data, shuflReduceFn);
+  }
+  //partial warp reduction
+  if (thread_num < WARPSIZE) {
+    gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num,
+                              lane_id);
+  }
+  //Gather all the reduced values from each warp
+  //to the first warp.
+  //named_barrier is used inside this function to ensure
+  //correctness. It is effectively a sync_threads
+  //that won't deadlock.
+  interWarpCpyFn(reduce_data, warp_needed);
+
+  //This is to reduce the data gathered from each "warp master".
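+  //After interWarpCpyFn, lanes [0, warp_needed) of the first warp each hold
+  //one warp-reduced ReduceData, so a single partial-warp reduce completes
+  //the block-level reduction.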
+  if (wid==0) {
+    gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed,
+                              lane_id);
+  }
+
+  return;
+}
+```
+In this function, ShuffleReduceFn is not called directly; instead, the
+function calls the various versions of the warp-reduction routines. It first
+reduces ReduceData warp by warp; at that point we are left with one
+ReduceData per warp present in the thread block. We then gather all such
+ReduceData into the first warp.
+
+As observed, this algorithm makes use of the function InterWarpCpyFn, which
+copies data from each "warp master" (the 0th lane of each warp, where a
+warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a
+mathematical sense) the problem of reduction across warp masters in a block
+to the problem of warp reduction, which we already have solutions for.
+
+We can thus completely avoid the use of atomics to reduce within a
+threadblock.
+
+**Efficient Cross Block Reduce**
+
+The next challenge is to reduce values across threadblocks. We aim to do
+this without atomics or critical sections.
+
+Let a kernel be started with TB threadblocks.
+Let the GPU have S SMs.
+There can be at most N active threadblocks per SM at any time.
+
+Consider a threadblock tb (tb < TB) running on SM s (s < S). 'tb' is one of
+at most 'N' active threadblocks on SM s. Let each threadblock active on an
+SM be given an instance identifier id (0 <= id < N). Therefore, the tuple
+(s, id) uniquely identifies an active threadblock on the GPU.
+
+To efficiently implement cross block reduce, we first allocate, for each
+value to be reduced, an array of size S*N (which is the maximum number of
+active threadblocks at any time on the device).
+
+Each threadblock reduces its value into slot [s][id]. This can be done
+without locking since no other threadblock can write to the same slot
+concurrently.
+
+As a final stage, we reduce the values in the array as follows:
+
+```
+// Compiler generated wrapper function for each target region with a
+// reduction clause.
+target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1 thread.
+  // Use dynamic parallelism to launch M teams, N threads as requested by
+  // the user to execute the target region.
+
+  target_function<>(map_args)
+
+  Reduce values in reduction_array
+```
+
+**Comparison with Last Version**
+
+The (simplified) pseudo code generated by LLVM on the host is as follows:
+
+```
+ 1. Create private copies of variables: foo_p, bar_p
+ 2. Each thread reduces the chunk of A and B assigned to it and writes
+    to foo_p and bar_p respectively.
+ 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
+    where:
+      struct ReduceData {
+        double *foo;
+        double *bar;
+      } reduceData
+      reduceData.foo = &foo_p
+      reduceData.bar = &bar_p
+
+    reduceFn is a pointer to a function that takes in two inputs
+    of type ReduceData, "reduces" them element wise, and places the
+    result in the first input:
+      reduceFn(ReduceData *a, ReduceData *b)
+        a = a @ b
+
+    Every thread in the parallel region calls kmpc_reduce_nowait with
+    its private copy of reduceData. The runtime reduces across the
+    threads (using tree reduction on the operator 'reduceFn') and stores
+    the final result in the master thread if successful.
+ 4. if ret == 1:
+    The master thread stores the reduced result in the globals.
+    foo += reduceData.foo; bar += reduceData.bar
+ 5.
else if ret == 2: + In this case kmpc_reduce_nowait() could not use tree reduction, + so use atomics instead: + each thread atomically writes to foo + each thread atomically writes to bar +``` + +On a GPU, a similar reduction may need to be performed across SIMT threads, +warps, and threadblocks. The challenge is to do so efficiently in a fashion +that is compatible with the LLVM OpenMP implementation. + +In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs, +the salient steps of the code generated are as follows: + + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) + status = can_block_reduce() + if status == 1: + reduce efficiently to thread 0 using shuffles and shared memory. + return 1 + else + cannot use efficient block reduction, fallback to atomics + return 2 + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 2: + In this case kmpc_reduce_nowait() could not use tree reduction, + so use atomics instead: + each thread atomically writes to foo + each thread atomically writes to bar +``` + +The function can_block_reduce() is defined as follows: + + +``` +int32_t can_block_reduce() { + int tid = GetThreadIdInTeam(); + int nt = GetNumberOfOmpThreads(tid); + if (nt != blockDim.x) + return 0; + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { + return 0; + } + return 1; +} +``` + +This function permits the use of the efficient block reduction algorithm +using shuffles and shared memory (return 1) only if (a) all SIMT threads in +a warp are active (i.e., number of threads in the parallel region is a +multiple of 32) and (b) the number of threads in the parallel region +(set by the num_threads clause) equals blockDim.x. + +If either of these preconditions is not true, each thread in the threadblock +updates the global value using atomics. + +Atomics and compare-and-swap operations are expensive on many threaded +architectures such as GPUs and we must avoid them completely. + + +**Appendix: Implementation Details** + + +``` +// Compiler generated function. +reduceFn(ReduceData *a, ReduceData *b) + a->foo = a->foo + b->foo + a->bar = a->bar + b->bar + +// Compiler generated function. +swapAndReduceFn(ReduceData *thread_private, int lane) + ReduceData *remote = new ReduceData() + remote->foo = shuffle_double(thread_private->foo, lane) + remote->bar = shuffle_double(thread_private->bar, lane) + reduceFn(thread_private, remote) + +// OMP runtime function. +warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn): + offset = 16 + while (offset > 0) + swapAndReduceFn(thread_private, offset) + offset /= 2 + +// OMP runtime function. +warpReduce_irregular(): + ... + +// OMP runtime function. +kmpc_reduce_warp(reduceData, swapAndReduceFn) + if all_lanes_active: + warpReduce_regular(reduceData, swapAndReduceFn) + else: + warpReduce_irregular(reduceData, swapAndReduceFn) + if in_simd_region: + // all done, reduce to global in simd lane 0 + return 1 + else if in_parallel_region: + // done reducing to one value per warp, now reduce across warps + return 3 + +// OMP runtime function; one for each basic type. 
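+// (sketch for the double case; shared[] is a shared-memory staging buffer
+// with one slot per warp, indexed by wid)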
+kmpc_reduce_block_double(double *a) + if lane == 0: + shared[wid] = *a + named_barrier(1, num_threads) + if wid == 0 + block_reduce(shared) + if lane == 0 + *a = shared[0] + named_barrier(1, num_threads) + if wid == 0 and lane == 0 + return 1 // write back reduced result + else + return 0 // don't do anything + +``` + + + +``` +// Compiler generated code. + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn) + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 3: + ret = block_reduce_double(reduceData.foo) + if ret == 1: + foo += reduceData.foo + ret = block_reduce_double(reduceData.bar) + if ret == 1: + bar += reduceData.bar +``` + +**Notes** + + 1. This scheme requires that the CUDA OMP runtime can call llvm generated functions. This functionality now works. + 2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery (including calls through function pointers) are optimized away. + 3. If we are reducing multiple to multiple variables in a parallel region, the reduce operations are all performed in warpReduce_[ir]regular(). This results in more instructions in the loop and should result in fewer stalls due to data dependencies. Unfortunately we cannot do the same in kmpc_reduce_block_double() without increasing shared memory usage. + Index: libomptarget/deviceRTLs/nvptx/src/cancel.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/cancel.cu @@ -0,0 +1,28 @@ +//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to be used in the implementation of OpenMP cancel. +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid, + int32_t cancelVal) { + PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal); + // disabled + return FALSE; +} + +EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid, + int32_t cancelVal) { + PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal); + // disabled + return FALSE; +} Index: libomptarget/deviceRTLs/nvptx/src/counter_group.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/counter_group.h @@ -0,0 +1,51 @@ +//===------ counter_group.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// Interface to implement OpenMP loop scheduling +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_NVPTX_COUNTER_GROUP_H_ +#define _OMPTARGET_NVPTX_COUNTER_GROUP_H_ + +#include "option.h" + +// counter group type for synchronizations +class omptarget_nvptx_CounterGroup { +public: + // getters and setters + INLINE Counter &Event() { return v_event; } + INLINE volatile Counter &Start() { return v_start; } + INLINE Counter &Init() { return v_init; } + + // Synchronization Interface + + INLINE void Clear(); // first time start=event + INLINE void Reset(); // init = first + INLINE void Init(Counter &priv); // priv = init + INLINE Counter Next(); // just counts number of events + + // set priv to n, to be used in later waitOrRelease + INLINE void Complete(Counter &priv, Counter n); + + // check priv and decide if we have to wait or can free the other warps + INLINE void Release(Counter priv, Counter current_event_value); + INLINE void WaitOrRelease(Counter priv, Counter current_event_value); + +private: + Counter v_event; // counter of events (atomic) + + // volatile is needed to force loads to read from global + // memory or L2 cache and see the write by the last master + volatile Counter v_start; // signal when events registered are finished + + Counter v_init; // used to initialize local thread variables +}; + +#endif /* SRC_COUNTER_GROUP_H_ */ Index: libomptarget/deviceRTLs/nvptx/src/counter_groupi.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/counter_groupi.h @@ -0,0 +1,83 @@ +//===----- counter_groupi.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// Interface implementation for OpenMP loop scheduling +// +//===----------------------------------------------------------------------===// + +#include +#include "option.h" + +INLINE void omptarget_nvptx_CounterGroup::Clear() { + PRINT0(LD_SYNCD, "clear counters\n") + v_event = 0; + v_start = 0; + // v_init does not need to be reset (its value is dead) +} + +INLINE void omptarget_nvptx_CounterGroup::Reset() { + // done by master before entering parallel + ASSERT(LT_FUSSY, v_event == v_start, + "error, entry %lld !=start %lld at reset\n", P64(v_event), + P64(v_start)); + v_init = v_start; +} + +INLINE void omptarget_nvptx_CounterGroup::Init(Counter &priv) { + PRINT(LD_SYNCD, "init priv counter 0x%llx with val %lld\n", P64(&priv), + P64(v_start)); + priv = v_start; +} + +// just counts number of events +INLINE Counter omptarget_nvptx_CounterGroup::Next() { + Counter oldVal = atomicAdd(&v_event, (Counter)1); + PRINT(LD_SYNCD, "next event counter 0x%llx with val %lld->%lld\n", + P64(&v_event), P64(oldVal), P64(oldVal + 1)); + + return oldVal; +} + +// set priv to n, to be used in later waitOrRelease +INLINE void omptarget_nvptx_CounterGroup::Complete(Counter &priv, Counter n) { + PRINT(LD_SYNCD, "complete priv counter 0x%llx with val %lld->%lld (+%d)\n", + P64(&priv), P64(priv), P64(priv + n), n); + priv += n; +} + +INLINE void omptarget_nvptx_CounterGroup::Release(Counter priv, + Counter current_event_value) { + if (priv - 1 == current_event_value) { + PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n", + P64(&v_start), P64(v_start), P64(priv)); + v_start = priv; + } +} + +// check priv and decide if we have to wait or can free the other warps +INLINE void +omptarget_nvptx_CounterGroup::WaitOrRelease(Counter priv, + Counter current_event_value) { + if (priv - 1 == current_event_value) { + PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n", + P64(&v_start), P64(v_start), P64(priv)); + v_start = priv; + } else { + PRINT(LD_SYNCD, + "Start waiting while start counter 0x%llx with val %lld < %lld\n", + P64(&v_start), P64(v_start), P64(priv)); + while (priv > v_start) { + // IDLE LOOP + // start is volatile: it will be re-loaded at each while loop + } + PRINT(LD_SYNCD, + "Done waiting as start counter 0x%llx with val %lld >= %lld\n", + P64(&v_start), P64(v_start), P64(priv)); + } +} Index: libomptarget/deviceRTLs/nvptx/src/critical.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/critical.cu @@ -0,0 +1,32 @@ +//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of critical with KMPC interface +// +//===----------------------------------------------------------------------===// + +#include + +#include "omptarget-nvptx.h" + +EXTERN +void __kmpc_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *lck) { + PRINT0(LD_IO, "call to kmpc_critical()\n"); + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + omp_set_lock(teamDescr.CriticalLock()); +} + +EXTERN +void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *lck) { + PRINT0(LD_IO, "call to kmpc_end_critical()\n"); + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + omp_unset_lock(teamDescr.CriticalLock()); +} Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -0,0 +1,311 @@ +//===----- data_sharing.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of data sharing environments/ +// +//===----------------------------------------------------------------------===// +#include "omptarget-nvptx.h" +#include + +// Number of threads in the CUDA block. +__device__ static unsigned getNumThreads() { + return blockDim.x; +} +// Thread ID in the CUDA block +__device__ static unsigned getThreadId() { + return threadIdx.x; +} +// Warp ID in the CUDA block +__device__ static unsigned getWarpId() { + return threadIdx.x >> DS_Max_Worker_Warp_Size_Bits; +} + +// The CUDA thread ID of the master thread. +__device__ static unsigned getMasterThreadId() { + unsigned Mask = DS_Max_Worker_Warp_Size - 1; + return (getNumThreads() - 1) & (~Mask); +} + +// Find the active threads in the warp - return a mask whose n-th bit is set if +// the n-th thread in the warp is active. +__device__ static unsigned getActiveThreadsMask() { + return __BALLOT_SYNC(0xFFFFFFFF, true); +} + +// Return true if this is the first active thread in the warp. +__device__ static bool IsWarpMasterActiveThread() { + unsigned long long Mask = getActiveThreadsMask(); + unsigned long long ShNum = DS_Max_Worker_Warp_Size - + (getThreadId() & DS_Max_Worker_Warp_Size_Bit_Mask); + unsigned long long Sh = Mask << ShNum; + return Sh == 0; +} +// Return true if this is the master thread. +__device__ static bool IsMasterThread() { + return getMasterThreadId() == getThreadId(); +} + +/// Return the provided size aligned to the size of a pointer. +__device__ static size_t AlignVal(size_t Val) { + const size_t Align = (size_t)sizeof(void*); + if (Val & (Align-1)) { + Val += Align; + Val &= ~(Align-1); + } + return Val; +} + + +#define DSFLAG 0 +#define DSFLAG_INIT 0 +#define DSPRINT(_flag, _str, _args...) \ + { \ + if (_flag) { \ + /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \ + } \ + } +#define DSPRINT0(_flag, _str) \ + { \ + if (_flag) { \ + /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \ + } \ + } + +// Initialize the shared data structures. This is expected to be called for the master thread and warp masters. +// \param RootS: A pointer to the root of the data sharing stack. 
+// \param InitialDataSize: The initial size of the data in the slot. +EXTERN void __kmpc_initialize_data_sharing_environment( + __kmpc_data_sharing_slot *rootS, + size_t InitialDataSize){ + + DSPRINT0(DSFLAG_INIT,"Entering __kmpc_initialize_data_sharing_environment\n"); + + unsigned WID = getWarpId(); + DSPRINT(DSFLAG_INIT,"Warp ID: %d\n", WID); + + omptarget_nvptx_TeamDescr *teamDescr = &omptarget_nvptx_threadPrivateContext->TeamContext(); + __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID); + + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void*)&RootS->Data[0]; + + // We don't need to initialize the frame and active threads. + + DSPRINT(DSFLAG_INIT,"Initial data size: %08x \n", InitialDataSize); + DSPRINT(DSFLAG_INIT,"Root slot at: %016llx \n", (long long)RootS); + DSPRINT(DSFLAG_INIT,"Root slot data-end at: %016llx \n", (long long)RootS->DataEnd); + DSPRINT(DSFLAG_INIT,"Root slot next at: %016llx \n", (long long)RootS->Next); + DSPRINT(DSFLAG_INIT,"Shared slot ptr at: %016llx \n", (long long)DataSharingState.SlotPtr[WID]); + DSPRINT(DSFLAG_INIT,"Shared stack ptr at: %016llx \n", (long long)DataSharingState.StackPtr[WID]); + + DSPRINT0(DSFLAG_INIT,"Exiting __kmpc_initialize_data_sharing_environment\n"); +} + +EXTERN void* __kmpc_data_sharing_environment_begin( + __kmpc_data_sharing_slot **SavedSharedSlot, + void **SavedSharedStack, + void **SavedSharedFrame, + int32_t *SavedActiveThreads, + size_t SharingDataSize, + size_t SharingDefaultDataSize, + int16_t IsOMPRuntimeInitialized + ){ + + DSPRINT0(DSFLAG,"Entering __kmpc_data_sharing_environment_begin\n"); + + // If the runtime has been elided, used __shared__ memory for master-worker + // data sharing. + if (!IsOMPRuntimeInitialized) return (void *) &DataSharingState; + + DSPRINT(DSFLAG,"Data Size %016llx\n", SharingDataSize); + DSPRINT(DSFLAG,"Default Data Size %016llx\n", SharingDefaultDataSize); + + unsigned WID = getWarpId(); + unsigned CurActiveThreads = getActiveThreadsMask(); + + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; + void *&FrameP = DataSharingState.FramePtr[WID]; + int32_t &ActiveT = DataSharingState.ActiveThreads[WID]; + + DSPRINT0(DSFLAG,"Save current slot/stack values.\n"); + // Save the current values. + *SavedSharedSlot = SlotP; + *SavedSharedStack = StackP; + *SavedSharedFrame = FrameP; + *SavedActiveThreads = ActiveT; + + DSPRINT(DSFLAG,"Warp ID: %d\n", WID); + DSPRINT(DSFLAG,"Saved slot ptr at: %016llx \n", (long long)SlotP); + DSPRINT(DSFLAG,"Saved stack ptr at: %016llx \n", (long long)StackP); + DSPRINT(DSFLAG,"Saved frame ptr at: %016llx \n", (long long)FrameP); + DSPRINT(DSFLAG,"Active threads: %08x \n", ActiveT); + + // Only the warp active master needs to grow the stack. + if (IsWarpMasterActiveThread()) { + // Save the current active threads. + ActiveT = CurActiveThreads; + + // Make sure we use aligned sizes to avoid rematerialization of data. + SharingDataSize = AlignVal(SharingDataSize); + // FIXME: The default data size can be assumed to be aligned? + SharingDefaultDataSize = AlignVal(SharingDefaultDataSize); + + // Check if we have room for the data in the current slot. 
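+    // There is room iff the current stack pointer plus the (aligned)
+    // sharing size does not run past the end of the slot's data area
+    // (SlotP->DataEnd).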
+ const uintptr_t CurrentStartAddress = (uintptr_t)StackP; + const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd; + const uintptr_t RequiredEndAddress =CurrentStartAddress + (uintptr_t)SharingDataSize; + + DSPRINT(DSFLAG,"Data Size %016llx\n", SharingDataSize); + DSPRINT(DSFLAG,"Default Data Size %016llx\n", SharingDefaultDataSize); + DSPRINT(DSFLAG,"Current Start Address %016llx\n", CurrentStartAddress); + DSPRINT(DSFLAG,"Current End Address %016llx\n", CurrentEndAddress); + DSPRINT(DSFLAG,"Required End Address %016llx\n", RequiredEndAddress); + DSPRINT(DSFLAG,"Active Threads %08x\n", ActiveT); + + // If we require a new slot, allocate it and initialize it (or attempt to reuse one). Also, set the shared stack and slot pointers to the new place. If we do not need to grow the stack, just adapt the stack and frame pointers. + if (CurrentEndAddress < RequiredEndAddress) { + size_t NewSize = (SharingDataSize > SharingDefaultDataSize) ? SharingDataSize : SharingDefaultDataSize; + __kmpc_data_sharing_slot *NewSlot = 0; + + // Attempt to reuse an existing slot. + if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { + uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - (uintptr_t)(&ExistingSlot->Data[0]); + if (ExistingSlotSize >= NewSize) { + DSPRINT(DSFLAG,"Reusing stack slot %016llx\n", (long long)ExistingSlot); + NewSlot = ExistingSlot; + } else { + DSPRINT(DSFLAG,"Cleaning up -failed reuse - %016llx\n", (long long)SlotP->Next); + free(ExistingSlot); + } + } + + if (!NewSlot) { + NewSlot = ( __kmpc_data_sharing_slot *)malloc(sizeof(__kmpc_data_sharing_slot) + NewSize); + DSPRINT(DSFLAG,"New slot allocated %016llx (data size=%016llx)\n", (long long)NewSlot, NewSize); + } + + NewSlot->Next = 0; + NewSlot->DataEnd = &NewSlot->Data[NewSize]; + + SlotP->Next = NewSlot; + SlotP = NewSlot; + StackP = &NewSlot->Data[SharingDataSize]; + FrameP = &NewSlot->Data[0]; + } else { + + // Clean up any old slot that we may still have. The slot producers, do not eliminate them because that may be used to return data. + if (SlotP->Next) { + DSPRINT(DSFLAG,"Cleaning up - old not required - %016llx\n", (long long)SlotP->Next); + free(SlotP->Next); + SlotP->Next = 0; + } + + FrameP = StackP; + StackP = (void*)RequiredEndAddress; + } + } + + // FIXME: Need to see the impact of doing it here. + __threadfence_block(); + + DSPRINT0(DSFLAG,"Exiting __kmpc_data_sharing_environment_begin\n"); + + // All the threads in this warp get the frame they should work with. + return FrameP; +} + +EXTERN void __kmpc_data_sharing_environment_end( + __kmpc_data_sharing_slot **SavedSharedSlot, + void **SavedSharedStack, + void **SavedSharedFrame, + int32_t *SavedActiveThreads, + int32_t IsEntryPoint + ){ + + DSPRINT0(DSFLAG,"Entering __kmpc_data_sharing_environment_end\n"); + + unsigned WID = getWarpId(); + + if (IsEntryPoint){ + if (IsWarpMasterActiveThread()) { + DSPRINT0(DSFLAG,"Doing clean up\n"); + + // The master thread cleans the saved slot, because this is an environment only for the master. + __kmpc_data_sharing_slot *S = IsMasterThread() ? *SavedSharedSlot : DataSharingState.SlotPtr[WID]; + + if (S->Next) { + free(S->Next); + S->Next = 0; + } + } + + DSPRINT0(DSFLAG,"Exiting Exiting __kmpc_data_sharing_environment_end\n"); + return; + } + + int32_t CurActive = getActiveThreadsMask(); + + // Only the warp master can restore the stack and frame information, and only if there are no other threads left behind in this environment (i.e. the warp diverged and returns in different places). 
This only works if we assume that threads will converge right after the call site that started the environment. + if (IsWarpMasterActiveThread()) { + int32_t &ActiveT = DataSharingState.ActiveThreads[WID]; + + DSPRINT0(DSFLAG,"Before restoring the stack\n"); + // Zero the bits in the mask. If it is still different from zero, then we have other threads that will return after the current ones. + ActiveT &= ~CurActive; + + DSPRINT(DSFLAG,"Active threads: %08x; New mask: %08x\n", CurActive, ActiveT); + + + if (!ActiveT) { + // No other active threads? Great, lets restore the stack. + + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; + void *&FrameP = DataSharingState.FramePtr[WID]; + + SlotP = *SavedSharedSlot; + StackP = *SavedSharedStack; + FrameP = *SavedSharedFrame; + ActiveT = *SavedActiveThreads; + + DSPRINT(DSFLAG,"Restored slot ptr at: %016llx \n",(long long)SlotP); + DSPRINT(DSFLAG,"Restored stack ptr at: %016llx \n",(long long)StackP); + DSPRINT(DSFLAG,"Restored frame ptr at: %016llx \n", (long long)FrameP); + DSPRINT(DSFLAG,"Active threads: %08x \n", ActiveT); + + } + } + + // FIXME: Need to see the impact of doing it here. + __threadfence_block(); + + DSPRINT0(DSFLAG,"Exiting __kmpc_data_sharing_environment_end\n"); + return; +} + +EXTERN void* __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, + int16_t IsOMPRuntimeInitialized){ + DSPRINT0(DSFLAG,"Entering __kmpc_get_data_sharing_environment_frame\n"); + + // If the runtime has been elided, use __shared__ memory for master-worker + // data sharing. We're reusing the statically allocated data structure + // that is used for standard data sharing. + if (!IsOMPRuntimeInitialized) return (void *) &DataSharingState; + + // Get the frame used by the requested thread. + + unsigned SourceWID = SourceThreadID >> DS_Max_Worker_Warp_Size_Bits; + + DSPRINT(DSFLAG,"Source warp: %d\n", SourceWID); + + void *P = DataSharingState.FramePtr[SourceWID]; + DSPRINT0(DSFLAG,"Exiting __kmpc_get_data_sharing_environment_frame\n"); + return P; +} Index: libomptarget/deviceRTLs/nvptx/src/debug.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/debug.h @@ -0,0 +1,276 @@ +//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// Usage guide +// +// PRINT0(flag, str) : if debug flag is on, print (no arguments) +// PRINT(flag, str, args) : if debug flag is on, print (arguments) +// DON(flag) : return true if debug flag is on +// +// ASSERT(flag, cond, str, args): if test flag is on, test the condition +// if the condition is false, print str+args +// and assert. 
+// CAUTION: cond may be evaluate twice +// AON(flag) : return true if test flag is on +// +// WARNING(flag, str, args) : if warning flag is on, print the warning +// WON(flag) : return true if warning flag is on +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_NVPTX_DEBUG_H_ +#define _OMPTARGET_NVPTX_DEBUG_H_ + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of debugging +//////////////////////////////////////////////////////////////////////////////// + +#define LD_SET_NONE 0ULL /* none */ +#define LD_SET_ALL -1ULL /* all */ + +// pos 1 +#define LD_SET_LOOP 0x1ULL /* basic loop */ +#define LD_SET_LOOPD 0x2ULL /* basic loop */ +#define LD_SET_PAR 0x4ULL /* basic parallel */ +#define LD_SET_PARD 0x8ULL /* basic parallel */ + +// pos 2 +#define LD_SET_SYNC 0x10ULL /* sync info */ +#define LD_SET_SYNCD 0x20ULL /* sync info */ +#define LD_SET_WAIT 0x40ULL /* state when waiting */ +#define LD_SET_TASK 0x80ULL /* print task info (high level) */ + +// pos 3 +#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */ +#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */ +#define LD_SET_ENV 0x400ULL /* env info */ +#define LD_SET_CANCEL 0x800ULL /* print cancel info */ + +// pos 4 +#define LD_SET_MEM 0x1000ULL /* malloc / free */ + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags to print selected output. + +// these are some examples of possible definitions that can be used for +// debugging. +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save +// on cuda buffer +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR) + +#ifndef OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE +#elif OMPTARGET_NVPTX_DEBUG +#warning debug is used, not good for measurements +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of asserts +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LT_SET_NONE 0x0 /* unsafe */ +#define LT_SET_SAFETY \ + 0x1 /* check malloc type of stuff, input at creation, cheap */ +#define LT_SET_INPUT 0x2 /* check also all runtime inputs */ +#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */ + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#ifndef OMPTARGET_NVPTX_TEST +#if OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY) +#else +#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY) +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of warnings +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LW_SET_ALL -1 +#define LW_SET_NONE 0x0 +#define LW_SET_ENV 0x1 +#define LW_SET_INPUT 0x2 +#define LW_SET_FUSSY 0x4 + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#if OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE) +#else +#define OMPTARGET_NVPTX_WARNING 
(LW_SET_FUSSY) +#endif + +//////////////////////////////////////////////////////////////////////////////// +// implemtation for debug +//////////////////////////////////////////////////////////////////////////////// + +#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING +#include +#endif +#if OMPTARGET_NVPTX_TEST +#include +#endif + +// set flags that are tested (inclusion properties) + +#define LD_ALL (LD_SET_ALL) + +#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD) +#define LD_LOOPD (LD_SET_LOOPD) +#define LD_PAR (LD_SET_PAR | LD_SET_PARD) +#define LD_PARD (LD_SET_PARD) + +// pos 2 +#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD) +#define LD_SYNCD (LD_SET_SYNCD) +#define LD_WAIT (LD_SET_WAIT) +#define LD_TASK (LD_SET_TASK) + +// pos 3 +#define LD_IO (LD_SET_IO | LD_SET_IOD) +#define LD_IOD (LD_SET_IOD) +#define LD_ENV (LD_SET_ENV) +#define LD_CANCEL (LD_SET_CANCEL) + +// pos 3 +#define LD_MEM (LD_SET_MEM) + +// implement +#if OMPTARGET_NVPTX_DEBUG + +#define DON(_flag) ((OMPTARGET_NVPTX_DEBUG) & (_flag)) + +#define PRINT0(_flag, _str) \ + { \ + if (DON(_flag)) { \ + printf(": " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F); \ + } \ + } + +#define PRINT(_flag, _str, _args...) \ + { \ + if (DON(_flag)) { \ + printf(": " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F, _args); \ + } \ + } +#else + +#define DON(_flag) (FALSE) +#define PRINT0(flag, str) +#define PRINT(flag, str, _args...) + +#endif + +// for printing without worring about precision, pointers... +#define P64(_x) ((unsigned long long)(_x)) + +//////////////////////////////////////////////////////////////////////////////// +// early defs for test +//////////////////////////////////////////////////////////////////////////////// + +#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY) +#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) +#define LT_FUSSY (LT_SET_FUSSY) + +#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY + +#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) +#define ASSERT0(_flag, _cond, _str) \ + { \ + if (TON(_flag)) { \ + assert(_cond); \ + } \ + } +#define ASSERT(_flag, _cond, _str, _args...) \ + { \ + if (TON(_flag)) { \ + assert(_cond); \ + } \ + } + +#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT + +#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) +#define ASSERT0(_flag, _cond, _str) \ + { \ + if (TON(_flag) && !(_cond)) { \ + printf(" ASSERT: " _str "\n", blockIdx.x, \ + threadIdx.x, threadIdx.x / warpSize, threadIdx.x & 0x1F); \ + assert(_cond); \ + } \ + } +#define ASSERT(_flag, _cond, _str, _args...) \ + { \ + if (TON(_flag) && !(_cond)) { \ + printf(" ASSERT: " _str "\n", blockIdx.x, \ + threadIdx.x, threadIdx.x / warpSize, threadIdx.x & 0x1F, _args); \ + assert(_cond); \ + } \ + } + +#else + +#define TON(_flag) (FALSE) +#define ASSERT0(_flag, _cond, _str) +#define ASSERT(_flag, _cond, _str, _args...) + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// early defs for warning + +#define LW_ALL (LW_SET_ALL) +#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV) +#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) +#define LW_FUSSY (LW_SET_FUSSY) + +#if OMPTARGET_NVPTX_WARNING + +#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) +#define WARNING0(_flag, _str) \ + { \ + if (WON(_flag)) { \ + printf(" WARNING: " _str, blockIdx.x, \ + threadIdx.x, threadIdx.x / warpSize, threadIdx.x & 0x1F); \ + } \ + } +#define WARNING(_flag, _str, _args...) 
\ + { \ + if (WON(_flag)) { \ + printf(" WARNING: " _str, blockIdx.x, \ + threadIdx.x, threadIdx.x / warpSize, threadIdx.x & 0x1F, _args); \ + } \ + } + +#else + +#define WON(_flag) (FALSE) +#define WARNING0(_flag, _str) +#define WARNING(_flag, _str, _args...) + +#endif + +#endif Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -0,0 +1,503 @@ +//===------- interface.h - NVPTX OpenMP interface definitions ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// This file contains all the definitions that are relevant to +// the interface. The first section contains the interface as +// declared by OpenMP. A second section includes library private calls +// (mostly debug, temporary?) The third section includes the compiler +// specific interfaces. +// +//===----------------------------------------------------------------------===// + +#ifndef _INTERFACES_H_ +#define _INTERFACES_H_ + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP interface +//////////////////////////////////////////////////////////////////////////////// + +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ +typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ + +typedef enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +} omp_sched_t; + +typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +EXTERN double omp_get_wtick(void); +EXTERN double omp_get_wtime(void); + +EXTERN void omp_set_num_threads(int num); +EXTERN int omp_get_num_threads(void); +EXTERN int omp_get_max_threads(void); +EXTERN int omp_get_thread_limit(void); +EXTERN int omp_get_thread_num(void); +EXTERN int omp_get_num_procs(void); +EXTERN int omp_in_parallel(void); +EXTERN int omp_in_final(void); +EXTERN void omp_set_dynamic(int flag); +EXTERN int omp_get_dynamic(void); +EXTERN void omp_set_nested(int flag); +EXTERN int omp_get_nested(void); +EXTERN void omp_set_max_active_levels(int level); +EXTERN int omp_get_max_active_levels(void); +EXTERN int omp_get_level(void); +EXTERN int omp_get_active_level(void); +EXTERN int omp_get_ancestor_thread_num(int level); +EXTERN int omp_get_team_size(int level); + +EXTERN void omp_init_lock(omp_lock_t *lock); +EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_destroy_lock(omp_lock_t *lock); +EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_set_lock(omp_lock_t *lock); +EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_unset_lock(omp_lock_t *lock); +EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); +EXTERN int omp_test_lock(omp_lock_t *lock); +EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); + +EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier); +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier); +EXTERN omp_proc_bind_t 
omp_get_proc_bind(void); +EXTERN int omp_get_cancellation(void); +EXTERN void omp_set_default_device(int deviceId); +EXTERN int omp_get_default_device(void); +EXTERN int omp_get_num_devices(void); +EXTERN int omp_get_num_teams(void); +EXTERN int omp_get_team_num(void); +EXTERN int omp_is_initial_device(void); +EXTERN int omp_get_initial_device(void); +EXTERN int omp_get_max_task_priority(void); + +//////////////////////////////////////////////////////////////////////////////// +// OMPTARGET_NVPTX private (debug / temportary?) interface +//////////////////////////////////////////////////////////////////////////////// + +// for debug +EXTERN void __kmpc_print_str(char *title); +EXTERN void __kmpc_print_title_int(char *title, int data); +EXTERN void __kmpc_print_index(char *title, int i); +EXTERN void __kmpc_print_int(int data); +EXTERN void __kmpc_print_double(double data); +EXTERN void __kmpc_print_address_int64(int64_t data); + +//////////////////////////////////////////////////////////////////////////////// +// file below is swiped from kmpc host interface +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// kmp specifc types +//////////////////////////////////////////////////////////////////////////////// + +typedef enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + kmp_sched_dynamic_ordered = 67, + kmp_sched_guided_ordered = 68, + kmp_sched_runtime_ordered = 69, + kmp_sched_auto_ordered = 70, + + kmp_sched_distr_static_chunk = 91, + kmp_sched_distr_static_nochunk = 92, + kmp_sched_distr_static_chunk_sched_static_chunkone = 93, + + kmp_sched_default = kmp_sched_static_nochunk, + kmp_sched_unordered_first = kmp_sched_static_chunk, + kmp_sched_unordered_last = kmp_sched_auto, + kmp_sched_ordered_first = kmp_sched_static_ordered, + kmp_sched_ordered_last = kmp_sched_auto_ordered, + kmp_sched_distribute_first = kmp_sched_distr_static_chunk, + kmp_sched_distribute_last = kmp_sched_distr_static_chunk_sched_static_chunkone, + + /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. + * Since we need to distinguish the three possible cases (no modifier, + * monotonic modifier, nonmonotonic modifier), we need separate bits for + * each modifier. The absence of monotonic does not imply nonmonotonic, + * especially since 4.5 says that the behaviour of the "no modifier" case + * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. + * + * Since we're passing a full 32 bit value, we can use a couple of high + * bits for these flags; out of paranoia we avoid the sign bit. + * + * These modifiers can be or-ed into non-static schedules by the compiler + * to pass the additional information. They will be stripped early in the + * processing in __kmp_dispatch_init when setting up schedules, so + * most of the code won't ever see schedules with these bits set. 
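+   *
+   * Illustrative example (not part of the original comment): with
+   * s = kmp_sched_guided | kmp_sched_modifier_nonmonotonic, i.e. 36 | (1<<30),
+   * SCHEDULE_WITHOUT_MODIFIERS(s) is kmp_sched_guided (36),
+   * SCHEDULE_HAS_NONMONOTONIC(s) is 1, SCHEDULE_HAS_MONOTONIC(s) is 0, and
+   * SCHEDULE_HAS_NO_MODIFIERS(s) is 0.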
+ */ + kmp_sched_modifier_monotonic = (1<<29), + /**< Set if the monotonic schedule modifier was present */ + kmp_sched_modifier_nonmonotonic = (1<<30), + /**< Set if the nonmonotonic schedule modifier was present */ + +# define SCHEDULE_WITHOUT_MODIFIERS(s) (enum kmp_sched_t)((s) & ~ (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) +# define SCHEDULE_HAS_MONOTONIC(s) (((s) & kmp_sched_modifier_monotonic) != 0) +# define SCHEDULE_HAS_NONMONOTONIC(s) (((s) & kmp_sched_modifier_nonmonotonic) != 0) +# define SCHEDULE_HAS_NO_MODIFIERS(s) (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == 0) + +} kmp_sched_t; + +// parallel defs +typedef void kmp_Indent; +typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); +typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData); +typedef void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num); +typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t lane_offset, int16_t shortCircuit); +typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void * scratchpad, int32_t index, int32_t width); +typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void * scratchpad, int32_t index, int32_t width, int32_t reduce); + +// task defs +typedef struct kmp_TaskDescr kmp_TaskDescr; +typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr); +typedef struct kmp_TaskDescr { + void *sharedPointerTable; // ptr to a table of shared var ptrs + kmp_TaskFctPtr sub; // task subroutine + int32_t partId; // unused + kmp_TaskFctPtr destructors; // destructor of c++ first private +} kmp_TaskDescr; +// task dep defs +#define KMP_TASKDEP_IN 0x1u +#define KMP_TASKDEP_OUT 0x2u +typedef struct kmp_TaskDep_Public { + void *addr; + size_t len; + uint8_t flags; // bit 0: in, bit 1: out +} kmp_TaskDep_Public; + +// flags that interpret the interface part of tasking flags +#define KMP_TASK_IS_TIED 0x1 +#define KMP_TASK_FINAL 0x2 +#define KMP_TASK_MERGED_IF0 0x4 /* unused */ +#define KMP_TASK_DESTRUCTOR_THUNK 0x8 + +// flags for task setup return +#define KMP_CURRENT_TASK_NOT_SUSPENDED 0 +#define KMP_CURRENT_TASK_SUSPENDED 1 + +// sync defs +typedef int32_t kmp_CriticalName[8]; + +//////////////////////////////////////////////////////////////////////////////// +// flags for kstate (all bits initially off) +//////////////////////////////////////////////////////////////////////////////// + +// first 2 bits used by kmp_Reduction (defined in kmp_reduction.cpp) +#define KMP_REDUCTION_MASK 0x3 +#define KMP_SKIP_NEXT_CALL 0x4 +#define KMP_SKIP_NEXT_CANCEL_BARRIER 0x8 + +//////////////////////////////////////////////////////////////////////////////// +// data +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// external interface +//////////////////////////////////////////////////////////////////////////////// + +// query +EXTERN int32_t __kmpc_global_num_threads(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_bound_thread_num(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_bound_num_threads(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_in_parallel(kmp_Indent *loc); // missing + +// parallel +EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc); +EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t global_tid, + int32_t num_threads); +// simd +EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t global_tid, + int32_t 
simd_limit); +// aee ... not supported +// EXTERN void __kmpc_fork_call(kmp_Indent *loc, int32_t argc, kmp_ParFctPtr +// microtask, ...); +EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, + uint32_t global_tid); +EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, + uint32_t global_tid); + +// proc bind +EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t global_tid, int proc_bind); +EXTERN int omp_get_num_places(void); +EXTERN int omp_get_place_num_procs(int place_num); +EXTERN void omp_get_place_proc_ids(int place_num, int *ids); +EXTERN int omp_get_place_num(void); +EXTERN int omp_get_partition_num_places(void); +EXTERN void omp_get_partition_place_nums(int *place_nums); + +// for static (no chunk or chunk) +EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_4_simple_generic(kmp_Indent *loc, + int32_t global_tid, int32_t sched, + int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_4u_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_8_simple_generic(kmp_Indent *loc, + int32_t global_tid, int32_t sched, + int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_8u_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk); + +EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid); + +// for dynamic +EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t 
lower, int32_t upper, + int32_t incr, int32_t chunk); +EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, uint32_t lower, + uint32_t upper, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int64_t lower, int64_t upper, + int64_t incr, int64_t chunk); +EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, uint64_t lower, + uint64_t upper, int64_t incr, + int64_t chunk); + +EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, int32_t *plower, + int32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, uint32_t *plower, + uint32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, int64_t *plower, + int64_t *pupper, int64_t *pstride); +EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride); + +EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t global_tid); + +// Support for reducing conditional lastprivate variables +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, + int32_t global_tid, int32_t varNum, void *array); + +// reduction +EXTERN int32_t __kmpc_reduce41(kmp_Indent *loc, int32_t global_tid, + int32_t varNum, size_t reduceSize, + void *reduceData, void *reduceArraySize, + kmp_ReductFctPtr *reductFct, + kmp_CriticalName *lock); +EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); +EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait(int32_t global_tid, + int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, + int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + 
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); +EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); + +// sync barrier +EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid); +EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid); +EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid); +EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc, int32_t global_tid); + +// single +EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid); + +// sync +EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_flush(kmp_Indent *loc); + +// vote +EXTERN int32_t __kmpc_warp_active_thread_mask(); + +// tasks +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Indent *loc, + uint32_t global_tid, int32_t flag, + size_t sizeOfTaskInclPrivate, + size_t sizeOfSharedTable, + kmp_TaskFctPtr sub); +EXTERN int32_t __kmpc_omp_task(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); +EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, uint32_t global_tid, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); +EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid); +EXTERN int32_t __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid, + int end_part); +EXTERN int32_t __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_taskloop(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, int if_val, uint64_t *lb, uint64_t *ub, + int64_t st, int nogroup, int32_t sched, uint64_t grainsize, + void *task_dup); + +// cancel +EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid, + int32_t cancelVal); +EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid, + int32_t cancelVal); + +// non standard +EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr); +EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); +EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, + int16_t RequiresOMPRuntime, + int16_t RequiresDataSharing); +EXTERN void __kmpc_spmd_kernel_deinit(); +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized); +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_kernel_end_parallel(); +EXTERN bool __kmpc_kernel_convergent_parallel(void 
*buffer, uint32_t Mask, bool *IsFinal, int32_t *LaneSource); +EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer); +EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask, bool *IsFinal, int32_t *LaneSource, + int32_t *LaneId, int32_t *NumLanes); +EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer); + +// The slot used for data sharing by the master and worker threads. We use a complete (default size version and an incomplete one so that we allow sizes greater than the default). +struct __kmpc_data_sharing_slot { + __kmpc_data_sharing_slot *Next; + void *DataEnd; + char Data[]; +}; +EXTERN void __kmpc_initialize_data_sharing_environment( + __kmpc_data_sharing_slot *RootS, + size_t InitialDataSize); +EXTERN void* __kmpc_data_sharing_environment_begin( + __kmpc_data_sharing_slot **SavedSharedSlot, + void **SavedSharedStack, + void **SavedSharedFrame, + int32_t *SavedActiveThreads, + size_t SharingDataSize, + size_t SharingDefaultDataSize, + int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_data_sharing_environment_end( + __kmpc_data_sharing_slot **SavedSharedSlot, + void **SavedSharedStack, + void **SavedSharedFrame, + int32_t *SavedActiveThreads, + int32_t IsEntryPoint ); + +EXTERN void* __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, + int16_t IsOMPRuntimeInitialized); +#endif Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -0,0 +1,461 @@ +//===------------ libcall.cu - NVPTX OpenMP user calls ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the OpenMP runtime functions that can be +// invoked by the user in an OpenMP region +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +// Timer precision is 1ns +#define TIMER_PRECISION ((double) 1E-9) + +EXTERN double omp_get_wtick(void) { + PRINT(LD_IO, "omp_get_wtick() returns %g\n", TIMER_PRECISION); + return TIMER_PRECISION; +} + +EXTERN double omp_get_wtime(void) { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + double rc = (double) nsecs * TIMER_PRECISION; + PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc); + return rc; +} + +EXTERN void omp_set_num_threads(int num) { + PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); + if (num <= 0) { + WARNING0(LW_INPUT, "expected positive num; ignore\n"); + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + currTaskDescr->NThreads() = num; + } +} + +EXTERN int omp_get_num_threads(void) { + int tid = GetLogicalThreadIdInBlock(); + int rc = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_max_threads(void) { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + int rc = 1; // default is 1 thread avail + if (!currTaskDescr->InParallelRegion()) { + // not currently in a parallel region... 
all are available + rc = GetNumberOfProcsInTeam(); + ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); + } + PRINT(LD_IO, "call omp_get_max_threads() return %\n", rc); + return rc; +} + +EXTERN int omp_get_thread_limit(void) { + // per contention group.. meaning threads in current team + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + int rc = currTaskDescr->ThreadLimit(); + PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_thread_num() { + int tid = GetLogicalThreadIdInBlock(); + int rc = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); + PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_num_procs(void) { + int rc = GetNumberOfProcsInDevice(); + PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); + return rc; +} + +EXTERN int omp_in_parallel(void) { + int rc = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + if (currTaskDescr->InParallelRegion()) { + rc = 1; + } + PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); + return rc; +} + +EXTERN int omp_in_final(void) { + // treat all tasks as final... Specs may expect runtime to keep + // track more precisely if a task was actively set by users... This + // is not explicitely specified; will treat as if runtime can + // actively decide to put a non-final task into a final one. + int rc = 1; + PRINT(LD_IO, "call omp_in_final() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_dynamic(int flag) { + PRINT(LD_IO, "call omp_set_dynamic(%d)\n", flag); + + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + if (flag) { + currTaskDescr->SetDynamic(); + } else { + currTaskDescr->ClearDynamic(); + } +} + +EXTERN int omp_get_dynamic(void) { + int rc = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + if (currTaskDescr->IsDynamic()) { + rc = 1; + } + PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_nested(int flag) { + PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", + flag); +} + +EXTERN int omp_get_nested(void) { + int rc = 0; + PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_max_active_levels(int level) { + PRINT(LD_IO, + "call omp_set_max_active_levels(%d) is ignored (no nested support)\n", + level); +} + +EXTERN int omp_get_max_active_levels(void) { + int rc = 1; + PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_level(void) { + int level = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (currTaskDescr->IsParallelConstruct()) { + level++; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + PRINT(LD_IO, "call omp_get_level() returns %d\n", level); + return level; +} + +EXTERN int omp_get_active_level(void) { + int level = 0; // no active level parallelism + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (currTaskDescr->ThreadsInTeam() > 1) { + // has a parallel with more than one thread in team + level = 1; + break; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) + 
return level; +} + +EXTERN int omp_get_ancestor_thread_num(int level) { + int rc = 0; // default at level 0 + if (level >= 0) { + int totLevel = omp_get_level(); + if (level <= totLevel) { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + int steps = totLevel - level; + PRINT(LD_IO, "backtrack %d steps\n", steps); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (DON(LD_IOD)) { + // print current state + omp_sched_t sched = currTaskDescr->GetRuntimeSched(); + PRINT(LD_ALL, "task descr %s %d: %s, in par %d, dyn %d, rt sched %d," + " chunk %lld; tid %d, tnum %d, nthreads %d\n", "ancestor", + steps, (currTaskDescr->IsParallelConstruct() ? "par" : "task"), + currTaskDescr->InParallelRegion(), currTaskDescr->IsDynamic(), + sched, + currTaskDescr->RuntimeChunkSize(), currTaskDescr->ThreadId(), + currTaskDescr->ThreadsInTeam(), currTaskDescr->NThreads()); + } + + if (currTaskDescr->IsParallelConstruct()) { + // found the level + if (!steps) { + rc = currTaskDescr->ThreadId(); + break; + } + steps--; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); + } + } + PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, + rc) + return rc; +} + +EXTERN int omp_get_team_size(int level) { + int rc = 1; // default at level 0 + if (level >= 0) { + int totLevel = omp_get_level(); + if (level <= totLevel) { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + int steps = totLevel - level; + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (currTaskDescr->IsParallelConstruct()) { + if (!steps) { + // found the level + rc = currTaskDescr->ThreadsInTeam(); + break; + } + steps--; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); + } + } + PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) + return rc; +} + +EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + *kind = currTaskDescr->GetRuntimeSched(); + *modifier = currTaskDescr->RuntimeChunkSize(); + PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n", + (int)*kind, *modifier); +} + +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { + PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind, + modifier); + if (kind >= omp_sched_static && kind < omp_sched_auto) { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + currTaskDescr->SetRuntimeSched(kind); + currTaskDescr->RuntimeChunkSize() = modifier; + PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %d\n", + (int)currTaskDescr->GetRuntimeSched(), + currTaskDescr->RuntimeChunkSize()); + } +} + +EXTERN omp_proc_bind_t omp_get_proc_bind(void) { + PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n"); + return omp_proc_bind_true; +} + +EXTERN int omp_get_num_places(void) { + PRINT0(LD_IO, "call omp_get_num_places() returns 0\n"); + return 0; +} + +EXTERN int omp_get_place_num_procs(int place_num) { + PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n"); + return 0; +} + +EXTERN void omp_get_place_proc_ids(int place_num, int *ids) { + PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n"); +} + +EXTERN int omp_get_place_num(void) { + PRINT0(LD_IO, "call to 
omp_get_place_num() returns 0\n"); + return 0; +} + +EXTERN int omp_get_partition_num_places(void) { + PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n"); + return 0; +} + +EXTERN void omp_get_partition_place_nums(int *place_nums) { + PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n"); +} + +EXTERN int omp_get_cancellation(void) { + int rc = FALSE; // currently false only + PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_default_device(int deviceId) { + PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n"); +} + +EXTERN int omp_get_default_device(void) { + PRINT0(LD_IO, + "call omp_get_default_device() is undef on device, returns 0\n"); + return 0; +} + +EXTERN int omp_get_num_devices(void) { + PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n"); + return 0; +} + +EXTERN int omp_get_num_teams(void) { + int rc = GetNumberOfOmpTeams(); + PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_team_num() { + int rc = GetOmpTeamId(); + PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc); + return rc; +} + +EXTERN int omp_is_initial_device(void) { + PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); + return 0; // 0 by def on device +} + +// Unspecified on the device. +EXTERN int omp_get_initial_device(void) { + PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n"); + return 0; +} + +// Unused for now. +EXTERN int omp_get_max_task_priority(void) { + PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n"); + return 0; +} + +//////////////////////////////////////////////////////////////////////////////// +// locks +//////////////////////////////////////////////////////////////////////////////// + +#define __OMP_SPIN 1000 +#define UNSET 0 +#define SET 1 + +EXTERN void omp_init_lock(omp_lock_t *lock) { + *lock = UNSET; + PRINT0(LD_IO, "call omp_init_lock()\n"); +} + +EXTERN void omp_destroy_lock(omp_lock_t *lock) { + PRINT0(LD_IO, "call omp_destroy_lock()\n"); +} + +EXTERN void omp_set_lock(omp_lock_t *lock) { + // int atomicCAS(int* address, int compare, int val); + // (old == compare ? val : old) + int compare = UNSET; + int val = SET; + + // TODO: not sure spinning is a good idea here.. + while (atomicCAS(lock, compare, val) != UNSET) { + + clock_t start = clock(); + clock_t now; + for (;;) { + now = clock(); + clock_t cycles = now > start ? now - start : now + (0xffffffff - start); + if (cycles >= __OMP_SPIN * blockIdx.x) { + break; + } + } + } // wait for 0 to be the read value + + PRINT0(LD_IO, "call omp_set_lock()\n"); +} + +EXTERN void omp_unset_lock(omp_lock_t *lock) { + int compare = SET; + int val = UNSET; + int old = atomicCAS(lock, compare, val); + + PRINT0(LD_IO, "call omp_unset_lock()\n"); +} + +EXTERN int omp_test_lock(omp_lock_t *lock) { + // int atomicCAS(int* address, int compare, int val); + // (old == compare ? 
val : old)
+  int compare = UNSET;
+  int val = SET;
+
+  int ret = atomicCAS(lock, compare, val);
+
+  PRINT(LD_IO, "call omp_test_lock() return %d\n", ret);
+
+  return ret;
+}
+
+// for xlf Fortran
+// Fortran, the return is LOGICAL type
+
+#define FLOGICAL long
+EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() {
+  int ret = omp_is_initial_device();
+  if (ret == 0)
+    return (FLOGICAL)0;
+  else
+    return (FLOGICAL)1;
+}
+
+EXTERN int __xlf_omp_is_initial_device_i4() {
+  int ret = omp_is_initial_device();
+  if (ret == 0)
+    return 0;
+  else
+    return 1;
+}
+
+EXTERN long __xlf_omp_get_team_num_i4() {
+  int ret = omp_get_team_num();
+  return (long)ret;
+}
+
+EXTERN long __xlf_omp_get_num_teams_i4() {
+  int ret = omp_get_num_teams();
+  return (long)ret;
+}
+
+EXTERN void xlf_debug_print_int(int *p) {
+  printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
+}
+
+EXTERN void xlf_debug_print_long(long *p) {
+  printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
+}
+
+EXTERN void xlf_debug_print_float(float *p) {
+  printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
+}
+
+EXTERN void xlf_debug_print_double(double *p) {
+  printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
+}
+
+EXTERN void xlf_debug_print_addr(void *p) {
+  printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p);
+}
+
Index: libomptarget/deviceRTLs/nvptx/src/loop.cu
===================================================================
--- /dev/null
+++ libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -0,0 +1,753 @@
+//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the KMPC interface
+// for the loop construct plus other worksharing constructs that use the same
+// interface as loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// template class that encapsulates all the helper functions
+//
+// T is loop iteration type (32 | 64) (unsigned | signed)
+// ST is the signed version of T
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
+public:
+  ////////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling with chunk
+
+  // Generic implementation of OMP loop scheduling with static policy
+  /*! \brief Calculate initial bounds for static loop and stride
+   *  @param[in] loc location in code of the call (not used here)
+   *  @param[in] global_tid global thread id
+   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
+   *  @param[in] plastiter pointer to last iteration
+   *  @param[in,out] pointer to loop lower bound. it will contain value of
+   *  lower bound of first chunk
+   *  @param[in,out] pointer to loop upper bound. It will contain value of
+   *  upper bound of first chunk
+   *  @param[in,out] pointer to loop stride.
It will contain value of stride + * between two successive chunks executed by the same thread + * @param[in] loop increment bump + * @param[in] chunk size + */ + + // helper function for static chunk + INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk, + T entityId, T numberOfEntities) { + // each thread executes multiple chunks all of the same size, except + // the last one + + // distance between two successive chunks + stride = numberOfEntities * chunk; + lb = lb + entityId * chunk; + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + // Say ub' is the begining of the last chunk. Then who ever has a + // lower bound plus a multiple of the increment equal to ub' is + // the last one. + T beginingLastChunk = inputUb - (inputUb % chunk); + last = ((beginingLastChunk - lb) % stride) == 0; + } + + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling without chunk + + // helper function for static no chunk + INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk, + T entityId, T numberOfEntities) { + // No chunk size specified. Each thread or warp gets at most one + // chunk; chunks are all almost of equal size + T loopSize = ub - lb + 1; + + chunk = loopSize / numberOfEntities; + T leftOver = loopSize - chunk * numberOfEntities; + + if (entityId < leftOver) { + chunk++; + lb = lb + entityId * chunk; + } else { + lb = lb + entityId * chunk + leftOver; + } + + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + last = ub == inputUb; + stride = loopSize; // make sure we only do 1 chunk per warp + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for Static Init + + INLINE static void for_static_init(int32_t schedtype, int32_t *plastiter, + T *plower, T *pupper, + ST *pstride, ST chunk, + bool IsSPMDExecutionMode, + bool IsOMPRuntimeUnavailable = false) { + // When IsOMPRuntimeUnavailable is true, we assume that the caller is + // in an L0 parallel region and that all worker threads participate. 
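+    //
+    // Worked example (illustrative numbers): for kmp_sched_static_chunk with
+    // lb=0, ub=99, chunk=10 and 4 threads, ForStaticChunk hands thread 0 the
+    // bounds [0,9], thread 1 [10,19], ..., with stride = 4 * 10 = 40, so
+    // thread 0 later covers [40,49] and [80,89]; the thread whose chunk
+    // starts at 90 is the one that reports lastiter.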
+ + int tid = GetLogicalThreadIdInBlock(); + + // Assume we are in teams region or that we use a single block + // per target region + ST numberOfActiveOMPThreads = GetNumberOfOmpThreads( + tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + + // All warps that are in excess of the maximum requested, do + // not execute the loop + PRINT(LD_LOOP, "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " + "%d, num tids %d\n", + GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable), + schedtype, P64(chunk), + GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable), + GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable)); + ASSERT0(LT_FUSSY, + (GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable)) < + (GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable)), + "current thread is not needed here; error"); + + // copy + int lastiter = 0; + T lb = *plower; + T ub = *pupper; + ST stride = *pstride; + T entityId, numberOfEntities; + // init + switch (schedtype) { + case kmp_sched_static_chunk: { + if (chunk > 0) { + entityId = GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_nochunk: { + entityId = GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } + case kmp_sched_distr_static_chunk: { + if (chunk > 0) { + entityId = GetOmpTeamId(); + numberOfEntities = GetNumberOfOmpTeams(); + ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } // note: if chunk <=0, use nochunk + } + case kmp_sched_distr_static_nochunk: { + entityId = GetOmpTeamId(); + numberOfEntities = GetNumberOfOmpTeams(); + + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } + case kmp_sched_distr_static_chunk_sched_static_chunkone: { + entityId = + GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsOMPRuntimeUnavailable) * + GetOmpTeamId() + + GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + numberOfEntities = GetNumberOfOmpTeams() * + GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsOMPRuntimeUnavailable); + ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } + default: { + ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype); + PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", + schedtype); + entityId = GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); + } + } + // copy back + *plastiter = lastiter; + *plower = lb; + *pupper = ub; + *pstride = stride; + PRINT(LD_LOOP, + "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n", + GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable), + GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride)); + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch Init + + INLINE static int 
OrderedSchedule(kmp_sched_t schedule) { + return schedule >= kmp_sched_ordered_first && + schedule <= kmp_sched_ordered_last; + } + + INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, ST st, + ST chunk) { + int tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); + T tnum = currTaskDescr->ThreadsInTeam(); + T tripCount = ub - lb + 1; // +1 because ub is inclusive + ASSERT0( + LT_FUSSY, + GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) < + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + "current thread is not needed here; error"); + + /* Currently just ignore the monotonic and non-monotonic modifiers + * (the compiler isn't producing them * yet anyway). + * When it is we'll want to look at them somewhere here and use that + * information to add to our schedule choice. We shouldn't need to pass + * them on, they merely affect which schedule we can legally choose for + * various dynamic cases. (In paritcular, whether or not a stealing scheme + * is legal). + */ + schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + + // Process schedule. + if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { + PRINT(LD_LOOP, + "go sequential as tnum=%d, trip count %lld, ordered sched=%d\n", + tnum, P64(tripCount), schedule); + schedule = kmp_sched_static_chunk; + chunk = tripCount; // one thread gets the whole loop + + } else if (schedule == kmp_sched_runtime) { + // process runtime + omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); + chunk = currTaskDescr->RuntimeChunkSize(); + switch (rtSched) { + case omp_sched_static: { + if (chunk > 0) + schedule = kmp_sched_static_chunk; + else + schedule = kmp_sched_static_nochunk; + break; + } + case omp_sched_auto: { + schedule = kmp_sched_static_chunk; + chunk = 1; + break; + } + case omp_sched_dynamic: + case omp_sched_guided: { + schedule = kmp_sched_dynamic; + break; + } + } + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule, + P64(chunk)); + } else if (schedule == kmp_sched_auto) { + schedule = kmp_sched_static_chunk; + chunk = 1; + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule, + P64(chunk)); + } else { + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk)); + ASSERT(LT_FUSSY, + schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "unknown schedule %d & chunk %lld\n", schedule, P64(chunk)); + } + + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + + // init schedules + if (schedule == kmp_sched_static_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); + int lastiter = 0; + ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %lld," + "next lower bound = %lld, stride = %lld\n", + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + 
omptarget_nvptx_threadPrivateContext->Stride(tid)); + + } else if (schedule == kmp_sched_static_nochunk) { + ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); + int lastiter = 0; + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static nochunk) : num threads = %d, ub = %lld," + "next lower bound = %lld, stride = %lld\n", + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->Stride(tid)); + + } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { + if (chunk < 1) + chunk = 1; + Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks + // but each thread (but one) must discover that it is last + eventNum += tnum; + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum; + PRINT(LD_LOOP, + "dispatch init (dyn) : num threads = %d, ub = %lld, chunk %lld, " + "events number = %lld\n", + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + omptarget_nvptx_threadPrivateContext->Chunk(tid), + omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); + } + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch next + + INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg, + Counter priv, T &lb, T &ub, + Counter &chunkId, Counter ¤tEvent, + T chunkSize, T loopUpperBound) { + // get next event atomically + Counter nextEvent = cg.Next(); + // calculate chunk Id (priv was initialized upon entering the loop to + // 'start' == 'event') + chunkId = nextEvent - priv; + // calculate lower bound for all lanes in the warp + lb = chunkId * chunkSize; // this code assume normalization of LB + ub = lb + chunkSize - 1; // Clang uses i <= ub + + // 3 result cases: + // a. lb and ub < loopUpperBound --> NOT_FINISHED + // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> + // NOT_FINISHED + // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED + currentEvent = nextEvent; + // a. + if (ub <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb), + P64(ub), P64(loopUpperBound)); + return NOT_FINISHED; + } + // b. + if (lb <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", + P64(lb), P64(ub), P64(loopUpperBound)); + ub = loopUpperBound; + return LAST_CHUNK; + } + // c. if we are here, we are in case 'c' + lb = loopUpperBound + 1; + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb), + P64(ub), P64(loopUpperBound)); + return FINISHED; + } + + // On Pascal, with inlining of the runtime into the user application, + // this code deadlocks. This is probably because different threads + // in a warp cannot make independent progress. 
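+  //
+  // Contract: dispatch_next fills *plower/*pupper/*pstride with the bounds of
+  // the calling thread's next chunk and sets *plast when that chunk contains
+  // the last iteration; it returns DISPATCH_NOTFINISHED while chunks remain
+  // and DISPATCH_FINISHED once this thread has no more work.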
+ NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper, + ST *pstride) { + // ID of a thread in its own warp + + // automatically selects thread or warp ID based on selected implementation + int tid = GetLogicalThreadIdInBlock(); + ASSERT0(LT_FUSSY, + GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) < + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + "current thread is not needed here; error"); + // retrieve schedule + kmp_sched_t schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(tid); + + // xxx reduce to one + if (schedule == kmp_sched_static_chunk || + schedule == kmp_sched_static_nochunk) { + T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); + T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); + // finished? + if (myLb > ub) { + PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", + P64(myLb), P64(ub)); + return DISPATCH_FINISHED; + } + // not finished, save current bounds + ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); + *plower = myLb; + T myUb = myLb + chunk - 1; // Clang uses i <= ub + if (myUb > ub) + myUb = ub; + *pupper = myUb; + *plast = (int32_t) (myUb == ub); + + // increment next lower bound by the stride + ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = + myLb + stride; + PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", + P64(*plower), P64(*pupper)); + return DISPATCH_NOTFINISHED; + } + ASSERT0(LT_FUSSY, + schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "bad sched"); + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + T myLb, myUb; + Counter chunkId; + // xxx current event is now local + omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup(); + int finished = DynamicNextChunk( + cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb, + chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid), + omptarget_nvptx_threadPrivateContext->Chunk(tid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + + if (finished == FINISHED) { + cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid), + omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); + cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid), + omptarget_nvptx_threadPrivateContext->CurrentEvent(tid)); + + return DISPATCH_FINISHED; + } + + // not finished (either not finished or last chunk) + *plast = (int32_t) (myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + *plower = myLb; + *pupper = myUb; + *pstride = 1; + + PRINT(LD_LOOP, + "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n", + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), GetNumberOfWorkersInTeam(), + P64(*plower), P64(*pupper), P64(*pstride)); + return DISPATCH_NOTFINISHED; + } + + INLINE static void dispatch_fini() { + // nothing + } + + //////////////////////////////////////////////////////////////////////////////// + // end of template class that encapsulate all the helper functions + //////////////////////////////////////////////////////////////////////////////// +}; + +//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (dyn loops) +//////////////////////////////////////////////////////////////////////////////// + +// init +EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid, + int32_t schedule, int32_t lb, int32_t ub, + 
int32_t st, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
+      (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
+                                    int32_t schedule, uint32_t lb, uint32_t ub,
+                                    int32_t st, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
+      (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
+                                   int32_t schedule, int64_t lb, int64_t ub,
+                                   int64_t st, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
+      (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
+                                    int32_t schedule, uint64_t lb, uint64_t ub,
+                                    int64_t st, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
+      (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+// next
+EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid,
+                                  int32_t *p_last, int32_t *p_lb, int32_t *p_ub,
+                                  int32_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
+      p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t tid,
+                                   int32_t *p_last, uint32_t *p_lb,
+                                   uint32_t *p_ub, int32_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
+      p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t tid,
+                                  int32_t *p_last, int64_t *p_lb, int64_t *p_ub,
+                                  int64_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
+      p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t tid,
+                                   int32_t *p_last, uint64_t *p_lb,
+                                   uint64_t *p_ub, int64_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
+      p_last, p_lb, p_ub, p_st);
+}
+
+// fini
+EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid,
+                                     int32_t schedtype, int32_t *plastiter,
+                                     int32_t *plower, int32_t *pupper,
+                                     int32_t *pstride, int32_t incr,
+                                     int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode());
+}
+
+EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid,
+                                      int32_t schedtype, int32_t *plastiter,
+                                      uint32_t *plower, uint32_t
*pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); +} + +EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); +} + +EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); +} + +EXTERN +void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/true, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, uint32_t *plower, + uint32_t *pupper, int32_t *pstride, + int32_t incr, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/true, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/true, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride, + int64_t incr, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/true, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_4_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/false, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_4u_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call 
kmpc_for_static_init_4u_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/false, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_8_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/false, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN +void __kmpc_for_static_init_8u_simple_generic( + kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + schedtype, plastiter, plower, pupper, pstride, chunk, + /*isSPMDExecutionMode=*/false, + /*IsOMPRuntimeUnavailable=*/true); +} + +EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_for_static_fini\n"); +} + +namespace { +INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { + int NumWarps = ((NumThreads + warpSize - 1) / warpSize); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < warpSize workers. In this case only 1 worker + // is started, so we don't need a barrier. + if (NumThreads > 1) { +#endif + named_sync(L1_BARRIER, warpSize * NumWarps); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + } +#endif +} +}; + +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, int32_t gtid, + int32_t varNum, void *array) { + PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); + + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), isSPMDMode(), + isRuntimeUninitialized()); + uint32_t NumThreads = GetNumberOfOmpThreads(GetLogicalThreadIdInBlock(), + isSPMDMode(), isRuntimeUninitialized()); + uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); + for (unsigned i = 0; i < varNum; i++) { + // Reset buffer. + if (tid == 0) + *Buffer = 0; // Reset to minimum loop iteration value. + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Atomic max of iterations. + uint64_t *varArray = (uint64_t *) array; + uint64_t elem = varArray[i]; + (void) atomicMax((unsigned long long int *) Buffer, (unsigned long long int) elem); + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Read max value and update thread private array. + varArray[i] = *Buffer; + + // Barrier. + syncWorkersInGenericMode(NumThreads); + } +} Index: libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -0,0 +1,50 @@ +//===------------ omp_data.cu - NVPTX OpenMP GPU objects --------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the data objects used on the GPU device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// global data holding OpenMP state information
+////////////////////////////////////////////////////////////////////////////////
+
+__device__
+omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
+    omptarget_nvptx_device_State[MAX_SM];
+
+// Pointer to this team's OpenMP state object
+__device__ __shared__ omptarget_nvptx_ThreadPrivateContext
+    *omptarget_nvptx_threadPrivateContext;
+
+////////////////////////////////////////////////////////////////////////////////
+// The team master sets the outlined parallel function in this variable to
+// communicate with the workers. Since it is in shared memory, there is one
+// copy of these variables for each kernel, instance, and team.
+////////////////////////////////////////////////////////////////////////////////
+volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
+
+////////////////////////////////////////////////////////////////////////////////
+// OpenMP kernel execution parameters
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ uint32_t execution_param;
+
+////////////////////////////////////////////////////////////////////////////////
+// Data sharing state
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ DataSharingStateTy DataSharingState;
+
+////////////////////////////////////////////////////////////////////////////////
+// Scratchpad for teams reduction.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ void *ReductionScratchpadPtr;
+
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===================================================================
--- /dev/null
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -0,0 +1,361 @@
+//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of all library macros, types,
+// and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPTARGET_NVPTX_H
+#define __OMPTARGET_NVPTX_H
+
+// std includes
+#include <stdlib.h>
+#include <stdint.h>
+
+// cuda includes
+#include <cuda.h>
+#include <math.h>
+
+// local includes
+#include "option.h" // choices we have
+#include "counter_group.h"
+#include "debug.h" // debug
+#include "interface.h" // interfaces with omp, compiler, and user
+#include "state-queue.h"
+#include "support.h"
+
+#define OMPTARGET_NVPTX_VERSION 1.1
+
+// used by the library for the interface with the app
+#define DISPATCH_FINISHED 0
+#define DISPATCH_NOTFINISHED 1
+
+// used by dynamic scheduling
+#define FINISHED 0
+#define NOT_FINISHED 1
+#define LAST_CHUNK 2
+
+#define BARRIER_COUNTER 0
+#define ORDERED_COUNTER 1
+
+// Macros for Cuda intrinsics
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
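A minimal usage sketch for the portability macros defined just below, assuming all 32 lanes of the warp are active; the helper name warpSum is an illustration only and is not part of the runtime:

    // Sketch: sum 'val' across a fully active warp using the version-agnostic
    // shuffle macros. On CUDA >= 9.0 the mask is forwarded to the *_sync
    // intrinsics; on older toolkits the macros fall back to the legacy forms.
    INLINE int32_t warpSum(int32_t val) {
      uint32_t mask = __ACTIVEMASK(); // assumed here to be a full warp
      for (int offset = WARPSIZE / 2; offset > 0; offset /= 2)
        val += __SHFL_DOWN_SYNC(mask, val, offset, WARPSIZE);
      return val; // lane 0 ends up holding the warp total
    }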
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 +#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane)) +#define __SHFL_DOWN_SYNC(mask, var, delta, width) \ + __shfl_down_sync((mask), (var), (delta), (width)) +#define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate)) +#define __ACTIVEMASK() __activemask() +#else +#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) +#define __SHFL_DOWN_SYNC(mask, var, delta, width) __shfl_down((var), (delta), (width)) +#define __BALLOT_SYNC(mask, predicate) __ballot((predicate)) +#define __ACTIVEMASK() __ballot(1) +#endif + +// Data sharing related quantities, need to match what is used in the compiler. +enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 992, + // The size reserved for data in a shared memory slot. + DS_Slot_Size = 256, + // The number of bits required to represent the maximum number of threads in a + // warp. + DS_Max_Worker_Warp_Size_Bits = 5, + // The maximum number of threads in a worker warp. + DS_Max_Worker_Warp_Size = 1 << DS_Max_Worker_Warp_Size_Bits, + // Mask for the 5 (DS_Max_Worker_Warp_Size_Bits) LSBs of a thread ID, + // can be used to retrieve threadID % Warp_Size. + DS_Max_Worker_Warp_Size_Bit_Mask = DS_Max_Worker_Warp_Size - 1, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = DS_Max_Worker_Warp_Size * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 32, +}; + +// Data structure to keep in shared memory that traces the current slot, stack, and frame pointer as well as the active threads that didn't exit the current environment. +struct DataSharingStateTy { + __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; + void *StackPtr[DS_Max_Warp_Number]; + void *FramePtr[DS_Max_Warp_Number]; + int32_t ActiveThreads[DS_Max_Warp_Number]; +}; +// Additional worker slot type which is initialized with the default worker slot size of 4*32 bytes. +struct __kmpc_data_sharing_worker_slot_static { + __kmpc_data_sharing_slot *Next; + void *DataEnd; + char Data[DS_Worker_Warp_Slot_Size]; +}; +// Additional master slot type which is initialized with the default master slot size of 4 bytes. 
+struct __kmpc_data_sharing_master_slot_static { + __kmpc_data_sharing_slot *Next; + void *DataEnd; + char Data[DS_Slot_Size]; +}; +extern __device__ __shared__ DataSharingStateTy DataSharingState; + +//////////////////////////////////////////////////////////////////////////////// +// task ICV and (implicit & explicit) task state + +class omptarget_nvptx_TaskDescr { +public: + // methods for flags + INLINE omp_sched_t GetRuntimeSched(); + INLINE void SetRuntimeSched(omp_sched_t sched); + INLINE int IsDynamic() { return data.items.flags & TaskDescr_IsDynamic; } + INLINE void SetDynamic() { + data.items.flags = data.items.flags | TaskDescr_IsDynamic; + } + INLINE void ClearDynamic() { + data.items.flags = data.items.flags & (~TaskDescr_IsDynamic); + } + INLINE int InParallelRegion() { return data.items.flags & TaskDescr_InPar; } + INLINE int InL2OrHigherParallelRegion() { return data.items.flags & TaskDescr_InParL2P; } + INLINE int IsParallelConstruct() { + return data.items.flags & TaskDescr_IsParConstr; + } + INLINE int IsTaskConstruct() { return !IsParallelConstruct(); } + // methods for other fields + INLINE uint16_t &NThreads() { return data.items.nthreads; } + INLINE uint16_t &ThreadLimit() { return data.items.threadlimit; } + INLINE uint16_t &ThreadId() { return data.items.threadId; } + INLINE uint16_t &ThreadsInTeam() { return data.items.threadsInTeam; } + INLINE uint64_t &RuntimeChunkSize() { return data.items.runtimeChunkSize; } + INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; } + INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { + prev = taskDescr; + } + // init & copy + INLINE void InitLevelZeroTaskDescr(); + INLINE void InitLevelOneTaskDescr( + uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, + uint16_t tnum); + INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); + INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, + uint16_t tid, uint16_t tnum); + +private: + // bits for flags: (7 used, 1 free) + // 3 bits (SchedMask) for runtime schedule + // 1 bit (IsDynamic) for dynamic schedule (false = static) + // 1 bit (InPar) if this thread has encountered one or more parallel region + // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) + // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel region + static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); + static const uint8_t TaskDescr_IsDynamic = 0x8; + static const uint8_t TaskDescr_InPar = 0x10; + static const uint8_t TaskDescr_IsParConstr = 0x20; + static const uint8_t TaskDescr_InParL2P = 0x40; + + union { // both have same size + uint64_t vect[2]; + struct TaskDescr_items { + uint8_t flags; // 6 bit used (see flag above) + uint8_t unused; + uint16_t nthreads; // thread num for subsequent parallel regions + uint16_t threadlimit; // thread limit ICV + uint16_t threadId; // thread id + uint16_t threadsInTeam; // threads in current team + uint64_t runtimeChunkSize; // runtime chunk size + } items; + } data; + omptarget_nvptx_TaskDescr *prev; +}; + +// build on kmp +typedef struct omptarget_nvptx_ExplicitTaskDescr { + 
omptarget_nvptx_TaskDescr + taskDescr; // omptarget_nvptx task description (must be first) + kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) +} omptarget_nvptx_ExplicitTaskDescr; + +//////////////////////////////////////////////////////////////////////////////// +// Descriptor of a parallel region (worksharing in general) + +class omptarget_nvptx_WorkDescr { + +public: + // access to data + INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; } + INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } + // init + INLINE void InitWorkDescr(); + +private: + omptarget_nvptx_CounterGroup cg; // for barrier (no other needed) + omptarget_nvptx_TaskDescr masterTaskICV; + bool hasCancel; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class omptarget_nvptx_TeamDescr { +public: + // access to data + INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { + return &levelZeroTaskDescr; + } + INLINE omptarget_nvptx_WorkDescr &WorkDescr() { + return workDescrForActiveParallel; + } + INLINE omp_lock_t *CriticalLock() { return &criticalLock; } + INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } + + // init + INLINE void InitTeamDescr(); + + INLINE __kmpc_data_sharing_slot *RootS(int wid) { + // If this is invoked by the master thread of the master warp then intialize it with a smaller slot. + if (wid == DS_Max_Worker_Warp_Size - 1){ + // Initialize the pointer to the end of the slot given the size of the data section. DataEnd is non-inclusive. + master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; + // We currently do not have a next slot. + master_rootS[0].Next = 0; + return (__kmpc_data_sharing_slot *) &master_rootS[0]; + } + // Initialize the pointer to the end of the slot given the size of the data section. DataEnd is non-inclusive. + worker_rootS[wid].DataEnd = &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; + // We currently do not have a next slot. 
+    worker_rootS[wid].Next = 0;
+    return (__kmpc_data_sharing_slot *) &worker_rootS[wid];
+  }
+
+private:
+  omptarget_nvptx_TaskDescr
+      levelZeroTaskDescr; // icv for team master initial thread
+  omptarget_nvptx_WorkDescr
+      workDescrForActiveParallel; // one, ONLY for the active par
+  omp_lock_t criticalLock;
+  uint64_t lastprivateIterBuffer;
+
+  __align__(16) __kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Worker_Warp_Size - 1];
+  __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// thread private data (struct of arrays for better coalescing)
+// tid refers here to the global thread id
+// do not support multiple concurrent kernels at this time
+class omptarget_nvptx_ThreadPrivateContext {
+public:
+  // task
+  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
+    return &levelOneTaskDescr[tid];
+  }
+  INLINE void SetTopLevelTaskDescr(int tid,
+                                   omptarget_nvptx_TaskDescr *taskICV) {
+    topTaskDescr[tid] = taskICV;
+  }
+  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid);
+  // parallel
+  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
+    return nextRegion.tnum[tid];
+  }
+  // simd
+  INLINE uint16_t &SimdLimitForNextSimd(int tid) {
+    return nextRegion.slim[tid];
+  }
+  // sync
+  INLINE Counter &Priv(int tid) { return priv[tid]; }
+  INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; }
+  // schedule (for dispatch)
+  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
+  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
+  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
+  // state for dispatch with dyn/guided
+  INLINE Counter &CurrentEvent(int tid) {
+    return currEvent_or_nextLowerBound[tid];
+  }
+  INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; }
+  // state for dispatch with static
+  INLINE Counter &NextLowerBound(int tid) {
+    return currEvent_or_nextLowerBound[tid];
+  }
+  INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; }
+
+  INLINE omptarget_nvptx_TeamDescr &TeamContext() {
+    return teamContext;
+  }
+
+  INLINE void InitThreadPrivateContext(int tid);
+  INLINE void SetSourceQueue(uint64_t Src) {
+    SourceQueue = Src;
+  }
+  INLINE uint64_t GetSourceQueue() {
+    return SourceQueue;
+  }
+private:
+  // team context for this team
+  omptarget_nvptx_TeamDescr teamContext;
+  // task ICV for implicit threads in the only parallel region
+  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
+  // pointer where to find the current task ICV (top of the stack)
+  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
+  union {
+    // Only one of the two is live at the same time.
+    // parallel
+    uint16_t tnum[MAX_THREADS_PER_TEAM];
+    // simd limit
+    uint16_t slim[MAX_THREADS_PER_TEAM];
+  } nextRegion;
+  // sync
+  Counter priv[MAX_THREADS_PER_TEAM];
+  // schedule (for dispatch)
+  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
+  int64_t chunk[MAX_THREADS_PER_TEAM];
+  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
+  // state for dispatch with dyn/guided OR static (never use both at a time)
+  Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM];
+  Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM];
+  // Queue to which this object must be returned.
+  uint64_t SourceQueue;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// global data tables
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext
+    *omptarget_nvptx_threadPrivateContext;
+extern __device__ __shared__ uint32_t execution_param;
+extern __device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// work function (outlined parallel/simd functions) and arguments.
+// needed for L1 parallelism only.
+////////////////////////////////////////////////////////////////////////////////
+
+typedef void * omptarget_nvptx_WorkFn;
+extern volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
+
+////////////////////////////////////////////////////////////////////////////////
+// get private data structures
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
+INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor();
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
+
+////////////////////////////////////////////////////////////////////////////////
+// inlined implementation
+////////////////////////////////////////////////////////////////////////////////
+
+#include "supporti.h"
+#include "omptarget-nvptxi.h"
+#include "counter_groupi.h"
+
+#endif
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===================================================================
--- /dev/null
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -0,0 +1,192 @@
+//===--- omptarget-nvptx.cu - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the initialization code for the GPU
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// global data tables
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__
+omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
+    omptarget_nvptx_device_State[MAX_SM];
+
+extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext
+    *omptarget_nvptx_threadPrivateContext;
+
+//
+// The team master sets the outlined function and its arguments in these
+// variables to communicate with the workers. Since they are in shared memory,
+// there is one copy of these variables for each kernel, instance, and team.
+// +extern volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn; +extern __device__ __shared__ uint32_t execution_param; + +//////////////////////////////////////////////////////////////////////////////// +// init entry points +//////////////////////////////////////////////////////////////////////////////// + +INLINE unsigned smid() { + unsigned id; + asm("mov.u32 %0, %%smid;" : "=r"(id)); + return id; +} + +INLINE unsigned n_sm() { + unsigned n_sm; + asm("mov.u32 %0, %%nsmid;" : "=r"(n_sm)); + return n_sm; +} + +EXTERN void __kmpc_kernel_init_params(void *Ptr) { + PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n", + OMPTARGET_NVPTX_VERSION); + + SetTeamsReductionScratchpadPtr(Ptr); +} + +EXTERN void __kmpc_kernel_init(int ThreadLimit, + int16_t RequiresOMPRuntime) { + PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", + OMPTARGET_NVPTX_VERSION); + + if (!RequiresOMPRuntime) { + // If OMP runtime is not required don't initialize OMP state. + setExecutionParameters(Generic, RuntimeUninitialized); + return; + } + setExecutionParameters(Generic, RuntimeInitialized); + + int threadIdInBlock = GetThreadIdInBlock(); + ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), + "__kmpc_kernel_init() must be called by team master warp only!"); + PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); + + // Get a state object from the queue. + int slot = smid() % MAX_SM; + omptarget_nvptx_threadPrivateContext = omptarget_nvptx_device_State[slot].Dequeue(); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + omptarget_nvptx_threadPrivateContext->SetSourceQueue(slot); +#endif + + // init thread private + int threadId = GetLogicalThreadIdInBlock(); + omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext( + threadId); + + // init team context + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + currTeamDescr.InitTeamDescr(); + // this thread will start execution... has to update its task ICV + // to point to the level zero task ICV. That ICV was init in + // InitTeamDescr() + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTeamDescr.LevelZeroTaskDescr()); + + // set number of threads and thread limit in team to started value + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + threadId); + currTaskDescr->NThreads() = GetNumberOfWorkersInTeam(); + currTaskDescr->ThreadLimit() = ThreadLimit; +} + +EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { + if (IsOMPRuntimeInitialized) { + // Enqueue omp state object for use by another team. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue(); +#else + int slot = smid() % MAX_SM; +#endif + omptarget_nvptx_device_State[slot].Enqueue(omptarget_nvptx_threadPrivateContext); + } + // Done with work. Kill the workers. + omptarget_nvptx_workFn = 0; +} + +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, + int16_t RequiresOMPRuntime, + int16_t RequiresDataSharing) { + PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); + + if (!RequiresOMPRuntime) { + // If OMP runtime is not required don't initialize OMP state. + setExecutionParameters(Spmd, RuntimeUninitialized); + return; + } + setExecutionParameters(Spmd, RuntimeInitialized); + + // + // Team Context Initialization. + // + // In SPMD mode there is no master thread so use any cuda thread for team + // context initialization. 
+ int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + // Get a state object from the queue. + int slot = smid() % MAX_SM; + omptarget_nvptx_threadPrivateContext = omptarget_nvptx_device_State[slot].Dequeue(); + + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + // init team context + currTeamDescr.InitTeamDescr(); + // init counters (copy start to init) + workDescr.CounterGroup().Reset(); + } + __syncthreads(); + + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + + // + // Initialize task descr for each thread. + // + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->InitLevelOneTaskDescr( + ThreadLimit, currTeamDescr.LevelZeroTaskDescr()); + newTaskDescr->ThreadLimit() = ThreadLimit; + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + + // init thread private from init value + workDescr.CounterGroup().Init( + omptarget_nvptx_threadPrivateContext->Priv(threadId)); + PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " + "%d threads\n", + newTaskDescr->ThreadId(), newTaskDescr->NThreads()); + + if (RequiresDataSharing && threadId % WARPSIZE == 0) { + // Warp master innitializes data sharing environment. + unsigned WID = threadId >> DS_Max_Worker_Warp_Size_Bits; + __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(WID); + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void*)&RootS->Data[0]; + } +} + +EXTERN void __kmpc_spmd_kernel_deinit() { + // We're not going to pop the task descr stack of each thread since + // there are no more parallel regions in SPMD mode. + __syncthreads(); + int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + // Enqueue omp state object for use by another team. + int slot = smid() % MAX_SM; + omptarget_nvptx_device_State[slot].Enqueue(omptarget_nvptx_threadPrivateContext); + } +} Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -0,0 +1,196 @@ +//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. 
+//
+//===----------------------------------------------------------------------===//
+
+////////////////////////////////////////////////////////////////////////////////
+// Task Descriptor
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() {
+  // sched starts from 1..4; encode it as 0..3; so add 1 here
+  uint8_t rc = (data.items.flags & TaskDescr_SchedMask) + 1;
+  return (omp_sched_t)rc;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
+  // sched starts from 1..4; encode it as 0..3; so sub 1 here
+  uint8_t val = ((uint8_t)sched) - 1;
+  // clear current sched
+  data.items.flags &= ~TaskDescr_SchedMask;
+  // set new sched
+  data.items.flags |= val;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
+  // slow method
+  // flag:
+  //   default sched is static,
+  //   dyn is off (unused now anyway, but may need to sample from host ?)
+  //   not in parallel
+
+  data.items.flags = 0;
+  data.items.nthreads = GetNumberOfProcsInTeam(); // threads: whatever was alloc by kernel
+  data.items.threadId = 0;         // is master
+  data.items.threadsInTeam = 1;    // sequential
+  data.items.runtimeChunkSize = 1; // preferred chunking static with chunk 1
+}
+
+// This is called when all threads are started together in SPMD mode.
+// OMP directives include target parallel, target distribute parallel for, etc.
+INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
+    uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) {
+  // slow method
+  // flag:
+  //   default sched is static,
+  //   dyn is off (unused now anyway, but may need to sample from host ?)
+  //   in L1 parallel
+
+  data.items.flags =
+      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
+  data.items.nthreads = 0; // # threads for subsequent parallel region
+  data.items.threadId = GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+  data.items.threadsInTeam = tnum;
+  data.items.runtimeChunkSize = 1; // preferred chunking static with chunk 1
+  prev = parentTaskDescr;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyData(
+    omptarget_nvptx_TaskDescr *sourceTaskDescr) {
+  data.vect[0] = sourceTaskDescr->data.vect[0];
+  data.vect[1] = sourceTaskDescr->data.vect[1];
+}
+
+INLINE void
+omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
+  CopyData(sourceTaskDescr);
+  prev = sourceTaskDescr->prev;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyParent(
+    omptarget_nvptx_TaskDescr *parentTaskDescr) {
+  CopyData(parentTaskDescr);
+  prev = parentTaskDescr;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
+    omptarget_nvptx_TaskDescr *parentTaskDescr) {
+  CopyParent(parentTaskDescr);
+  data.items.flags = data.items.flags & ~TaskDescr_IsParConstr;
+  ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
+    omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) {
+  CopyParent(masterTaskDescr);
+  // overwrite specific items;
+  data.items.flags |=
+      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
+  data.items.threadsInTeam = tnum;             // set number of threads
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
+    omptarget_nvptx_TaskDescr *workTaskDescr) {
+  Copy(workTaskDescr);
+  //
+  // overwrite specific items;
+  //
+  // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
+ // This is so that the serial master (first lane in the master warp) + // gets a threadId of 0. + // However, we know that this function is always called in a parallel + // region where only workers are active. The serial master thread + // never enters this region. When a parallel region is executed serially, + // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions + // are called, which never activate this region. + data.items.threadId = + GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) +} + +INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( + omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, + uint16_t tnum) { + CopyParent(parentTaskDescr); + data.items.flags |= TaskDescr_InParL2P; // In L2+ parallelism + data.items.threadsInTeam = tnum; // set number of threads + data.items.threadId = tid; +} + +//////////////////////////////////////////////////////////////////////////////// +// Thread Private Context +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TaskDescr * +omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) { + ASSERT0( + LT_FUSSY, tid < MAX_THREADS_PER_TEAM, + "Getting top level, tid is larger than allocated data structure size"); + return topTaskDescr[tid]; +} + +INLINE void +omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) { + // levelOneTaskDescr is init when starting the parallel region + // top task descr is NULL (team master version will be fixed separately) + topTaskDescr[tid] = NULL; + // no num threads value has been pushed + nextRegion.tnum[tid] = 0; + // priv counter init to zero + priv[tid] = 0; + // the following don't need to be init here; they are init when using dyn + // sched + // current_Event, events_Number, chunk, num_Iterations, schedule +} + +//////////////////////////////////////////////////////////////////////////////// +// Work Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_WorkDescr::InitWorkDescr() { + cg.Clear(); // start and stop to zero too + // threadsInParallelTeam does not need to be init (done in start parallel) + hasCancel = FALSE; +} + +//////////////////////////////////////////////////////////////////////////////// +// Team Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() { + levelZeroTaskDescr.InitLevelZeroTaskDescr(); + workDescrForActiveParallel.InitWorkDescr(); + // omp_init_lock(criticalLock); +} + +//////////////////////////////////////////////////////////////////////////////// +// Get private data structure for thread +//////////////////////////////////////////////////////////////////////////////// + +// Utility routines for CUDA threads +INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() { + return omptarget_nvptx_threadPrivateContext->TeamContext(); +} + +INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() { + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + return currTeamDescr.WorkDescr(); +} + +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) { + return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + threadId); +} + +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() { + return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock()); +} Index: libomptarget/deviceRTLs/nvptx/src/option.h 
=================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -0,0 +1,92 @@ +//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// GPU default options +// +//===----------------------------------------------------------------------===// +#ifndef _OPTION_H_ +#define _OPTION_H_ + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// following two defs must match absolute limit hardwired in the host RTL +#define TEAMS_ABSOLUTE_LIMIT \ + 512 /* omptx limit (must match teamsAbsoluteLimit) */ +#define THREAD_ABSOLUTE_LIMIT \ + 1024 /* omptx limit (must match threadAbsoluteLimit) */ + +// max number of thread per team +#define MAX_THREADS_PER_TEAM THREAD_ABSOLUTE_LIMIT + +// max number of blocks depend on the kernel we are executing - pick default +// here +#define MAX_NUM_TEAMS TEAMS_ABSOLUTE_LIMIT +#define WARPSIZE 32 +#define MAX_NUM_WARPS (MAX_NUM_TEAMS * THREAD_ABSOLUTE_LIMIT / WARPSIZE) +#define MAX_NUM_THREADS MAX_NUM_WARPS *WARPSIZE + +#ifdef OMPTHREAD_IS_WARP +// assume here one OpenMP thread per CUDA warp +#define MAX_NUM_OMP_THREADS MAX_NUM_WARPS +#else +// assume here one OpenMP thread per CUDA thread +#define MAX_NUM_OMP_THREADS MAX_NUM_THREADS +#endif + +// The named barrier for active parallel threads of a team in an L1 parallel region +// to synchronize with each other. +#define L1_BARRIER (1) + +// Maximum number of omp state objects per SM allocated statically in global memory. +#if __CUDA_ARCH__ >= 600 +#define OMP_STATE_COUNT 32 +#define MAX_SM 56 +#else +#define OMP_STATE_COUNT 16 +#define MAX_SM 16 +#endif + +//////////////////////////////////////////////////////////////////////////////// +// algo options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// data options +//////////////////////////////////////////////////////////////////////////////// + +// decide if counters are 32 or 64 bit +#define Counter unsigned long long + +// TODO: KMP defines kmp_int to be 32 or 64 bits depending on the target. 
+// think we don't need it here (meaning we can be always 64 bit compatible) +// +// #ifdef KMP_I8 +// typedef kmp_int64 kmp_int; +// #else +// typedef kmp_int32 kmp_int; +// #endif + +//////////////////////////////////////////////////////////////////////////////// +// misc options (by def everythig here is device) +//////////////////////////////////////////////////////////////////////////////// + +#define EXTERN extern "C" __device__ +#define INLINE __inline__ __device__ +#define NOINLINE __noinline__ __device__ +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +#endif Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -0,0 +1,471 @@ +//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Parallel implemention in the GPU. Here is the pattern: +// +// while (not finished) { +// +// if (master) { +// sequential code, decide which par loop to do, or if finished +// __kmpc_kernel_prepare_parallel() // exec by master only +// } +// syncthreads // A +// __kmpc_kernel_parallel() // exec by all +// if (this thread is included in the parallel) { +// switch () for all parallel loops +// __kmpc_kernel_end_parallel() // exec only by threads in parallel +// } +// +// +// The reason we don't exec end_parallel for the threads not included +// in the parallel loop is that for each barrier in the parallel +// region, these non-included threads will cycle through the +// syncthread A. Thus they must preserve their current threadId that +// is larger than thread in team. +// +// To make a long story short... 
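A minimal worker-side sketch of the pattern described above, to make the call sequence concrete; it is illustrative only: the helper name workerLoop is an assumption, the outlined function signature is simplified, and the real runtime uses named barriers shared with the master warp rather than plain __syncthreads():

    static __device__ void workerLoop() {
      void *WorkFn;
      for (;;) {
        __syncthreads();                     // syncthreads A: wait for the master
        bool IsActive =
            __kmpc_kernel_parallel(&WorkFn, /*IsOMPRuntimeInitialized=*/1);
        if (!WorkFn)
          return;                            // master signalled termination
        if (IsActive) {
          ((void (*)())WorkFn)();            // run the selected outlined region
          __kmpc_kernel_end_parallel();
        }
        __syncthreads();                     // rejoin before the next iteration
      }
    }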
+// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +typedef struct ConvergentSimdJob { + omptarget_nvptx_TaskDescr taskDescr; + omptarget_nvptx_TaskDescr *convHeadTaskDescr; + uint16_t slimForNextSimd; +} ConvergentSimdJob; + +//////////////////////////////////////////////////////////////////////////////// +// support for convergent simd (team of threads in a warp only) +//////////////////////////////////////////////////////////////////////////////// +EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask, bool *IsFinal, + int32_t *LaneSource, int32_t *LaneId, + int32_t *NumLanes) { + PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n"); + uint32_t ConvergentMask = Mask; + int32_t ConvergentSize = __popc(ConvergentMask); + uint32_t WorkRemaining = ConvergentMask >> (*LaneSource+1); + *LaneSource += __ffs(WorkRemaining); + *IsFinal = __popc(WorkRemaining) == 1; + uint32_t lanemask_lt; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); + *LaneId = __popc(ConvergentMask & lanemask_lt); + + int threadId = GetLogicalThreadIdInBlock(); + int sourceThreadId = (threadId & ~(WARPSIZE-1)) + *LaneSource; + + ConvergentSimdJob *job = (ConvergentSimdJob *) buffer; + int32_t SimdLimit = + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId); + job->slimForNextSimd = SimdLimit; + + int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource); + // reset simdlimit to avoid propagating to successive #simd + if (SimdLimitSource > 0 && threadId == sourceThreadId) + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd( + threadId) = 0; + + // We cannot have more than the # of convergent threads. + if (SimdLimitSource > 0) + *NumLanes = min(ConvergentSize, SimdLimitSource); + else + *NumLanes = ConvergentSize; + ASSERT(LT_FUSSY, *NumLanes > 0, + "bad thread request of %d threads", *NumLanes); + + // Set to true for lanes participating in the simd region. + bool isActive = false; + // Initialize state for active threads. + if (*LaneId < *NumLanes) { + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + omptarget_nvptx_TaskDescr *sourceTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(sourceThreadId); + job->convHeadTaskDescr = currTaskDescr; + // install top descriptor from the thread for which the lanes are working. 
+    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+                                                                sourceTaskDescr);
+    isActive = true;
+  }
+
+  // requires a memory fence between threads of a warp
+  return isActive;
+}
+
+EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
+  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_simd\n");
+  // pop stack
+  int threadId = GetLogicalThreadIdInBlock();
+  ConvergentSimdJob *job = (ConvergentSimdJob *) buffer;
+  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
+      job->slimForNextSimd;
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+      threadId, job->convHeadTaskDescr);
+}
+
+typedef struct ConvergentParallelJob {
+  omptarget_nvptx_TaskDescr taskDescr;
+  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
+  uint16_t tnumForNextPar;
+} ConvergentParallelJob;
+
+////////////////////////////////////////////////////////////////////////////////
+// support for convergent parallelism (team of threads in a warp only)
+////////////////////////////////////////////////////////////////////////////////
+EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask, bool *IsFinal, int32_t *LaneSource) {
+  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
+  uint32_t ConvergentMask = Mask;
+  int32_t ConvergentSize = __popc(ConvergentMask);
+  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource+1);
+  *LaneSource += __ffs(WorkRemaining);
+  *IsFinal = __popc(WorkRemaining) == 1;
+  uint32_t lanemask_lt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+  uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
+
+  int threadId = GetLogicalThreadIdInBlock();
+  int sourceThreadId = (threadId & ~(WARPSIZE-1)) + *LaneSource;
+
+  ConvergentParallelJob *job = (ConvergentParallelJob *) buffer;
+  int32_t NumThreadsClause =
+      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
+  job->tnumForNextPar = NumThreadsClause;
+
+  int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
+  // reset numthreads to avoid propagating to successive #parallel
+  if (NumThreadsSource > 0 && threadId == sourceThreadId)
+    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(
+        threadId) = 0;
+
+  // We cannot have more than the # of convergent threads.
+  uint16_t NumThreads;
+  if (NumThreadsSource > 0)
+    NumThreads = min(ConvergentSize, NumThreadsSource);
+  else
+    NumThreads = ConvergentSize;
+  ASSERT(LT_FUSSY, NumThreads > 0,
+         "bad thread request of %d threads", NumThreads);
+
+  // Set to true for workers participating in the parallel region.
+  bool isActive = false;
+  // Initialize state for active threads.
+  if (OmpId < NumThreads) {
+    // init L2 task descriptor and storage for the L1 parallel task descriptor.
+ omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr; + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + omptarget_nvptx_TaskDescr *sourceTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(sourceThreadId); + job->convHeadTaskDescr = currTaskDescr; + newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + isActive = true; + } + + // requires a memory fence between threads of a warp + return isActive; +} + +EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); + // pop stack + int threadId = GetLogicalThreadIdInBlock(); + ConvergentParallelJob *job = (ConvergentParallelJob *) buffer; + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, job->convHeadTaskDescr); + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = + job->tnumForNextPar; +} + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes parallel (1 static level only) +//////////////////////////////////////////////////////////////////////////////// + +// return number of cuda threads that participate to parallel +// calculation has to consider simd implementation in nvptx +// i.e. (num omp threads * num lanes) +// +// cudathreads = +// if(num_threads != 0) { +// if(thread_limit > 0) { +// min (num_threads*numLanes ; thread_limit*numLanes); +// } else { +// min (num_threads*numLanes; blockDim.x) +// } +// } else { +// if (thread_limit != 0) { +// min (thread_limit*numLanes; blockDim.x) +// } else { // no thread_limit, no num_threads, use all cuda threads +// blockDim.x; +// } +// } +// +// This routine is always called by the team master.. +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); + omptarget_nvptx_workFn = WorkFn; + + if (!IsOMPRuntimeInitialized) return; + + // This routine is only called by the team master. The team master is + // the first thread of the last warp. It always has the logical thread + // id of 0 (since it is a shadow for the first worker thread). + int threadId = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + threadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(), "cannot be called in a parallel region."); + if (currTaskDescr->InParallelRegion()) { + PRINT0(LD_PAR, "already in parallel: go seq\n"); + return; + } + + uint16_t CudaThreadsForParallel = 0; + uint16_t NumThreadsClause = + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel( + threadId); + + // we cannot have more than block size + uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam(); + + // currTaskDescr->ThreadLimit(): If non-zero, this is the limit as + // specified by the thread_limit clause on the target directive. + // GetNumberOfWorkersInTeam(): This is the number of workers available + // in this kernel instance. + // + // E.g: If thread_limit is 33, the kernel is launched with 33+32=65 + // threads. The last warp is the master warp so in this case + // GetNumberOfWorkersInTeam() returns 64. 
+ + // this is different from ThreadAvail of OpenMP because we may be + // using some of the CUDA threads as SIMD lanes + int NumLanes = 1; + if (NumThreadsClause != 0) { + // reset request to avoid propagating to successive #parallel + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel( + threadId) = 0; + + // assume that thread_limit*numlanes is already <= CudaThreadsAvail + // because that is already checked on the host side (CUDA offloading rtl) + if (currTaskDescr->ThreadLimit() != 0) + CudaThreadsForParallel = + NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes + ? NumThreadsClause * NumLanes + : currTaskDescr->ThreadLimit() * NumLanes; + else { + CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail) + ? CudaThreadsAvail + : NumThreadsClause * NumLanes; + } + } else { + if (currTaskDescr->ThreadLimit() != 0) { + CudaThreadsForParallel = + (currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail) + ? CudaThreadsAvail + : currTaskDescr->ThreadLimit() * NumLanes; + } else + CudaThreadsForParallel = CudaThreadsAvail; + } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp participate in the parallel region. Round down to a + // multiple of warpSize since it is legal to do so in OpenMP. + // CudaThreadsAvail is the number of workers available in this + // kernel instance and is greater than or equal to + // currTaskDescr->ThreadLimit(). + if (CudaThreadsForParallel < CudaThreadsAvail) { + CudaThreadsForParallel = (CudaThreadsForParallel < warpSize) ? 1 : + CudaThreadsForParallel & ~((uint16_t)warpSize - 1); + } +#endif + + ASSERT(LT_FUSSY, CudaThreadsForParallel > 0, + "bad thread request of %d threads", CudaThreadsForParallel); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "only team master can create parallel"); + + // set number of threads on work descriptor + // this is different from the number of cuda threads required for the parallel + // region + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, + CudaThreadsForParallel / NumLanes); + // init counters (copy start to init) + workDescr.CounterGroup().Reset(); +} + +// All workers call this function. Deactivate those not needed. +// Fn - the outlined work function to execute. +// returns True if this thread is active, else False. +// +// Only the worker threads call this routine. +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); + + // Work function and arguments for L1 parallel region. + *WorkFn = omptarget_nvptx_workFn; + + if (!IsOMPRuntimeInitialized) return true; + + // If this is the termination signal from the master, quit early. + if (!*WorkFn) return false; + + // Only the worker threads call this routine and the master warp + // never arrives here. Therefore, use the nvptx thread id. + int threadId = GetThreadIdInBlock(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + // Set to true for workers participating in the parallel region. + bool isActive = false; + // Initialize state for active threads. 
+ if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) { + // init work descriptor from workdesccr + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr()); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + // init private from int value + workDescr.CounterGroup().Init( + omptarget_nvptx_threadPrivateContext->Priv(threadId)); + PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " + "%d threads\n", + newTaskDescr->ThreadId(), newTaskDescr->NThreads()); + + isActive = true; + } + + return isActive; +} + +EXTERN void __kmpc_kernel_end_parallel() { + // pop stack + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); + // Only the worker threads call this routine and the master warp + // never arrives here. Therefore, use the nvptx thread id. + int threadId = GetThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(threadId); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTaskDescr->GetPrevTaskDescr()); +} + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes sequential +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); + + // assume this is only called for nested parallel + int threadId = GetLogicalThreadIdInBlock(); + + // unlike actual parallel, threads in the same team do not share + // the workTaskDescr in this case and num threads is fixed to 1 + + // get current task + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(threadId); + + // allocate new task descriptor and copy value from current one, set prev to + // it + omptarget_nvptx_TaskDescr *newTaskDescr = + (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), + (char *)"new seq parallel task"); + newTaskDescr->CopyParent(currTaskDescr); + + // tweak values for serialized parallel case: + // - each thread becomes ID 0 in its serialized parallel, and + // - there is only one thread per team + newTaskDescr->ThreadId() = 0; + newTaskDescr->ThreadsInTeam() = 1; + + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); +} + +EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, + uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); + + // pop stack + int threadId = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(threadId); + // set new top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTaskDescr->GetPrevTaskDescr()); + // free + SafeFree(currTaskDescr, (char *)"new seq parallel task"); +} + +EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, + uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); + + int threadId = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + if (currTaskDescr->InL2OrHigherParallelRegion()) + return 2; + else if (currTaskDescr->InParallelRegion()) + return 1; + else + return 0; +} + +// This kmpc 
call returns the thread id across all teams. Its value is
+// cached by the compiler and used when calling the runtime. On nvptx
+// it's cheap to recalculate this value so we never use the result
+// of this call.
+EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) {
+  return GetLogicalThreadIdInBlock();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// push params
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
+                                    int32_t num_threads) {
+  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
+  tid = GetLogicalThreadIdInBlock();
+  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
+      num_threads;
+}
+
+EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
+                                   int32_t simd_limit) {
+  PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
+  tid = GetLogicalThreadIdInBlock();
+  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) =
+      simd_limit;
+}
+
+// Nothing to do here: the host guarantees we started the requested number of
+// teams and we only need to inspect gridDim
+
+EXTERN void __kmpc_push_num_teams(kmp_Indent *loc, int32_t tid,
+                                  int32_t num_teams, int32_t thread_limit) {
+  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams);
+  ASSERT0(LT_FUSSY, FALSE,
+          "should never have anything with new teams on device");
+}
+
+EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t tid,
+                                  int proc_bind) {
+  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind);
+}
Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu
===================================================================
--- /dev/null
+++ libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -0,0 +1,521 @@
+//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA
+//-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "omptarget-nvptx.h" + +//may eventually remove this +EXTERN +int32_t __gpu_block_reduce() { + int tid = GetLogicalThreadIdInBlock(); + int nt = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + if (nt != blockDim.x) + return 0; + unsigned tnum = __ACTIVEMASK(); + if (tnum != (~0x0)) { // assume swapSize is 32 + return 0; + } + return 1; +} + +EXTERN +int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + void *reduce_array_size, kmp_ReductFctPtr *reductFct, + kmp_CriticalName *lck) { + int threadId = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(threadId); + int numthread; + if (currTaskDescr->IsParallelConstruct()) { + numthread = GetNumberOfOmpThreads(threadId, isSPMDMode(), isRuntimeUninitialized()); + } else { + numthread = GetNumberOfOmpTeams(); + } + + if (numthread == 1) + return 1; + else if (!__gpu_block_reduce()) + return 2; + else { + if (threadIdx.x == 0) + return 1; + else + return 0; + } +} + +EXTERN +int32_t __kmpc_reduce_combined(kmp_Indent *loc) { + if (threadIdx.x == 0) { + return 2; + } else { + return 0; + } +} + +EXTERN +int32_t __kmpc_reduce_simd(kmp_Indent *loc) { + if (threadIdx.x % 32 == 0) { + return 1; + } else { + return 0; + } +} + +EXTERN +void __kmpc_nvptx_end_reduce(int32_t global_tid) {} + +EXTERN +void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} + +EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { + return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size); +} + +EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); + hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size); + lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); + return val; +} + +static INLINE void gpu_regular_warp_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { + for (uint32_t mask = WARPSIZE/2; mask > 0; mask /= 2) { + shflFct(reduce_data, /*LaneId - not used= */0, + /*Offset = */mask, /*AlgoVersion=*/0); + } +} + +static INLINE void gpu_irregular_warp_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + uint32_t size, uint32_t tid) { + uint32_t curr_size; + uint32_t mask; + curr_size = size; + mask = curr_size/2; + while (mask>0) { + shflFct(reduce_data, /*LaneId = */tid, /*Offset=*/mask, /*AlgoVersion=*/1); + curr_size = (curr_size+1)/2; + mask = curr_size/2; + } +} + +static INLINE uint32_t gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { + uint32_t lanemask_lt; + uint32_t lanemask_gt; + uint32_t size, remote_id, physical_lane_id; + physical_lane_id = GetThreadIdInBlock() % WARPSIZE; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); + uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); + uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt)); + do { + Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); + remote_id = __ffs(Liveness & lanemask_gt); + size = __popc(Liveness); + logical_lane_id /= 2; + shflFct(reduce_data, /*LaneId =*/logical_lane_id, + /*Offset=*/remote_id-1-physical_lane_id, /*AlgoVersion=*/2); + } while (logical_lane_id % 2 == 0 && size > 1); + return (logical_lane_id == 0); +} + +// +// 
runtime support for array reduction +// + +#define ARRAYATOMIC_GENOP(_name, _dtype, _op) \ + EXTERN void __array_atomic_##_name##_##_op( \ + kmp_Indent *id_ref, int32_t gtid, _dtype *lhs, _dtype *rhs, int64_t n) { \ + PRINT(LD_LOOP, "Reduction: thead %d\n", gtid); \ + for (int i = 0; i < n / sizeof(_dtype); i++) { \ + __kmpc_atomic_##_name##_##_op(id_ref, gtid, lhs + i, rhs[i]); \ + } \ + } \ + EXTERN void __gpu_array_warpBlockRedu_##_name##_##_op(_dtype *ldata, \ + int64_t n) { \ + for (int i = 0; i < n / sizeof(_dtype); i++) { \ + ldata[i] = __gpu_warpBlockRedu_##_name##_##_op(ldata[i]); \ + } \ + } + +EXTERN +int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, + int32_t num_vars, size_t reduce_size, + void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct) { + uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); + if (Liveness == 0xffffffff) { + gpu_regular_warp_reduce(reduce_data, shflFct); + return GetThreadIdInBlock() % WARPSIZE == 0; // Result on lane 0 of the simd warp. + } else { + return gpu_irregular_simd_reduce(reduce_data, shflFct); // Result on the first active lane. + } +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +INLINE +int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + bool isSPMDExecutionMode, + bool isRuntimeUninitialized = false) { + /* + * This reduce function handles reduction within a team. It handles + * parallel regions in both L1 and L2 parallelism levels. It also + * supports Generic, SPMD, and NoOMP modes. + * + * 1. Reduce within a warp. + * 2. Warp master copies value to warp 0 via shared memory. + * 3. Warp 0 reduces to a single value. + * 4. The reduced value is available in the thread that returns 1. + */ + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); + uint32_t NumThreads = GetNumberOfOmpThreads(BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); + uint32_t WarpsNeeded = (NumThreads+WARPSIZE-1)/WARPSIZE; + uint32_t WarpId = BlockThreadId/WARPSIZE; + + // Volta execution model: + // For the Generic execution mode a parallel region either has 1 thread and beyond that, + // always a multiple of 32. + // For the SPMD execution mode we may have any number of threads. + if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1)) + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/NumThreads % WARPSIZE, /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); +// else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2 parallel region may enter here; return early. +// return gpu_irregular_simd_reduce(reduce_data, shflFct); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > WARPSIZE) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, BlockThreadId); + + return BlockThreadId == 0; +// } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) { +// return BlockThreadId == 0; + } + return BlockThreadId == 0; + + // Get the OMP thread Id. 
This is different from BlockThreadId in the case of + // an L2 parallel region. +// return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized) == 0; +} +#else +INLINE +int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + bool isSPMDExecutionMode, + bool isRuntimeUninitialized = false) { + /* + * This reduce function handles reduction within a team. It handles + * parallel regions in both L1 and L2 parallelism levels. It also + * supports Generic, SPMD, and NoOMP modes. + * + * 1. Reduce within a warp. + * 2. Warp master copies value to warp 0 via shared memory. + * 3. Warp 0 reduces to a single value. + * 4. The reduced value is available in the thread that returns 1. + */ + uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); + if (Liveness == 0xffffffff) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/__popc(Liveness), /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2 parallel region may enter here; return early. + return gpu_irregular_simd_reduce(reduce_data, shflFct); + + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); + uint32_t NumThreads = GetNumberOfOmpThreads(BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > WARPSIZE) { + uint32_t WarpsNeeded = (NumThreads+WARPSIZE-1)/WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = BlockThreadId/WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, BlockThreadId); + + return BlockThreadId == 0; + } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) { + return BlockThreadId == 0; + } + + // Get the OMP thread Id. This is different from BlockThreadId in the case of + // an L2 parallel region. 
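+  // In an L2 region each serialized parallel region has a single thread whose
+  // OMP thread id is 0 (see __kmpc_serialized_parallel), while its
+  // BlockThreadId can be any lane, so the OMP thread id is the right test.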
+ return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized) == 0; +} +#endif // __CUDA_ARCH__ >= 700 + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/isSPMDMode()); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/true, + /*isRuntimeUninitialized=*/true); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/false, + /*isRuntimeUninitialized=*/true); +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +INLINE +int32_t nvptx_teams_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct, + bool isSPMDExecutionMode, bool isRuntimeUninitialized = false) { + uint32_t ThreadId = GetLogicalThreadIdInBlock(); + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = isSPMDExecutionMode + ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true, isRuntimeUninitialized) + : /*Master thread only*/ 1; + uint32_t TeamId = GetBlockIdInKernel(); + uint32_t NumTeams = GetNumberOfBlocksInKernel(); + __shared__ volatile bool IsLastTeam; + + // Team masters of all teams write to the scratchpad. + if (ThreadId == 0) { + unsigned int *timestamp = GetTeamsReductionTimestamp(); + char *scratchpad = GetTeamsReductionScratchpad(); + + scratchFct(reduce_data, scratchpad, TeamId, NumTeams); + __threadfence(); + + // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. + // It resets 'timestamp' back to 0 once the last team increments + // this counter. + unsigned val = atomicInc(timestamp, NumTeams-1); + IsLastTeam = val == NumTeams - 1; + } + + // We have to wait on L1 barrier because in GENERIC mode the workers + // are waiting on barrier 0 for work. + // + // If we guard this barrier as follows it leads to deadlock, probably + // because of a compiler bug: if (!IsGenericMode()) __syncthreads(); + uint16_t SyncWarps = (NumThreads+WARPSIZE-1)/WARPSIZE; + named_sync(L1_BARRIER, SyncWarps*WARPSIZE); + + // If this team is not the last, quit. + if (/* Volatile read by all threads */ !IsLastTeam) + return 0; + + // + // Last team processing. + // + + // Threads in excess of #teams do not participate in reduction of the + // scratchpad values. + uint32_t ActiveThreads = NumThreads; + if (NumTeams < NumThreads) { + ActiveThreads = (NumTeams < WARPSIZE) ? 
1 : NumTeams & ~((uint16_t)WARPSIZE - 1); + } + if (ThreadId >= ActiveThreads) + return 0; + + // Load from scratchpad and reduce. + char *scratchpad = GetTeamsReductionScratchpad(); + ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/0); + for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads) + ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/1); + + uint32_t WarpsNeeded = (ActiveThreads+WARPSIZE-1)/WARPSIZE; + uint32_t WarpId = ThreadId/WARPSIZE; + + // Reduce across warps to the warp master. + if ((ActiveThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1)) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (ActiveThreads > 1) // Partial warp but contiguous lanes + // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/ActiveThreads % WARPSIZE, /*LaneId=*/ThreadId % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + if (ActiveThreads > WARPSIZE) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); + } + + return ThreadId == 0; +} +#else +INLINE +int32_t nvptx_teams_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct, + bool isSPMDExecutionMode, bool isRuntimeUninitialized = false) { + uint32_t ThreadId = GetLogicalThreadIdInBlock(); + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = isSPMDExecutionMode + ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true, isRuntimeUninitialized) + : /*Master thread only*/ 1; + uint32_t TeamId = GetBlockIdInKernel(); + uint32_t NumTeams = GetNumberOfBlocksInKernel(); + __shared__ volatile bool IsLastTeam; + + // Team masters of all teams write to the scratchpad. + if (ThreadId == 0) { + unsigned int *timestamp = GetTeamsReductionTimestamp(); + char *scratchpad = GetTeamsReductionScratchpad(); + + scratchFct(reduce_data, scratchpad, TeamId, NumTeams); + __threadfence(); + + // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. + // It resets 'timestamp' back to 0 once the last team increments + // this counter. + unsigned val = atomicInc(timestamp, NumTeams-1); + IsLastTeam = val == NumTeams - 1; + } + + // We have to wait on L1 barrier because in GENERIC mode the workers + // are waiting on barrier 0 for work. + // + // If we guard this barrier as follows it leads to deadlock, probably + // because of a compiler bug: if (!IsGenericMode()) __syncthreads(); + uint16_t SyncWarps = (NumThreads+WARPSIZE-1)/WARPSIZE; + named_sync(L1_BARRIER, SyncWarps*WARPSIZE); + + // If this team is not the last, quit. + if (/* Volatile read by all threads */ !IsLastTeam) + return 0; + + // + // Last team processing. + // + + // Threads in excess of #teams do not participate in reduction of the + // scratchpad values. + if (ThreadId >= NumTeams) + return 0; + + // Load from scratchpad and reduce. 
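+  // Thread i of the last team loads team i's contribution, then strides
+  // through the remaining teams and reduces each loaded value into its own
+  // reduce_data.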
+ char *scratchpad = GetTeamsReductionScratchpad(); + ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/0); + for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads) + ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/1); + + // Reduce across warps to the warp master. + uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); + if (Liveness == 0xffffffff) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/__popc(Liveness), /*LaneId=*/ThreadId % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads; + if (ActiveThreads > WARPSIZE) { + uint32_t WarpsNeeded = (ActiveThreads+WARPSIZE-1)/WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId/WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); + } + + return ThreadId == 0; +} +#endif // __CUDA_ARCH__ >= 700 + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, + kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, /*isSPMDExecutionMode=*/isSPMDMode()); +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, + /*isSPMDExecutionMode=*/true, + /*isRuntimeUninitialized=*/true); +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, + /*isSPMDExecutionMode=*/false, + /*isRuntimeUninitialized=*/true); +} Index: libomptarget/deviceRTLs/nvptx/src/state-queue.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/state-queue.h @@ -0,0 +1,53 @@ +//===--------- statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a queue to hand out OpenMP state objects to teams of +// one or more kernels. +// +// Reference: +// Thomas R.W. Scogland and Wu-chun Feng. 2015. +// Design and Evaluation of Scalable Concurrent Queues for Many-Core +// Architectures. International Conference on Performance Engineering. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __STATE_QUEUE_H
+#define __STATE_QUEUE_H
+
+#include <stdint.h>
+
+#include "option.h" // choices we have
+
+template <typename ElementType, uint32_t SIZE>
+class omptarget_nvptx_Queue {
+private:
+  ElementType elements[SIZE];
+  volatile ElementType *elementQueue[SIZE];
+  volatile uint32_t head;
+  volatile uint32_t ids[SIZE];
+  volatile uint32_t tail;
+
+  static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
+  INLINE uint32_t ENQUEUE_TICKET();
+  INLINE uint32_t DEQUEUE_TICKET();
+  INLINE uint32_t ID(uint32_t ticket);
+  INLINE bool IsServing(uint32_t slot, uint32_t id);
+  INLINE void PushElement(uint32_t slot, ElementType *element);
+  INLINE ElementType *PopElement(uint32_t slot);
+  INLINE void DoneServing(uint32_t slot, uint32_t id);
+
+public:
+  INLINE omptarget_nvptx_Queue() {};
+  INLINE void Enqueue(ElementType *element);
+  INLINE ElementType *Dequeue();
+};
+
+#include "state-queuei.h"
+
+#endif
Index: libomptarget/deviceRTLs/nvptx/src/state-queuei.h
===================================================================
--- /dev/null
+++ libomptarget/deviceRTLs/nvptx/src/state-queuei.h
@@ -0,0 +1,89 @@
+//===------ state-queuei.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of a queue to hand out OpenMP state
+// objects to teams of one or more kernels.
+//
+// Reference:
+// Thomas R.W. Scogland and Wu-chun Feng. 2015.
+// Design and Evaluation of Scalable Concurrent Queues for Many-Core
+// Architectures. International Conference on Performance Engineering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "state-queue.h"
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
+  return atomicAdd((unsigned int *) &tail, 1);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
+  return atomicAdd((unsigned int *) &head, 1);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(
+    uint32_t ticket) {
+  return (ticket / SIZE) * 2;
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(
+    uint32_t slot, uint32_t id) {
+  return atomicAdd((unsigned int *) &ids[slot], 0) == id;
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(
+    uint32_t slot, ElementType *element) {
+  atomicExch((unsigned long long *) &elementQueue[slot], (unsigned long long) element);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(
+    uint32_t slot) {
+  return (ElementType *)
+      atomicAdd((unsigned long long *) &elementQueue[slot], (unsigned long long) 0);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(
+    uint32_t slot, uint32_t id) {
+  atomicExch((unsigned int *) &ids[slot], (id + 1) % MAX_ID);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(
+    ElementType *element) {
+  uint32_t ticket = ENQUEUE_TICKET();
+  uint32_t slot = ticket % SIZE;
+  uint32_t id = ID(ticket) + 1;
+  while (!IsServing(slot, id))
+    ;
+  PushElement(slot, element);
+  DoneServing(slot, id);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
+  uint32_t ticket = DEQUEUE_TICKET();
+  uint32_t slot = ticket % SIZE;
+  uint32_t id = ID(ticket);
+  while (!IsServing(slot, id))
+    ;
+  ElementType *element = PopElement(slot);
+  // This is to populate the queue because of the lack of GPU constructors.
+  if (element == 0)
+    element = &elements[slot];
+  DoneServing(slot, id);
+  return element;
+}
+
Index: libomptarget/deviceRTLs/nvptx/src/support.h
===================================================================
--- /dev/null
+++ libomptarget/deviceRTLs/nvptx/src/support.h
@@ -0,0 +1,91 @@
+//===--------- support.h - NVPTX OpenMP support functions -------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Wrapper to some functions natively supported by the GPU.
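+// Declarations only; the corresponding implementations are in supporti.h.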
+// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// Execution Parameters +//////////////////////////////////////////////////////////////////////////////// +enum ExecutionMode { + Generic = 0x00u, + Spmd = 0x01u, + ModeMask = 0x01u, +}; + +enum RuntimeMode { + RuntimeInitialized = 0x00u, + RuntimeUninitialized = 0x02u, + RuntimeMask = 0x02u, +}; + +INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); +INLINE bool isGenericMode(); +INLINE bool isSPMDMode(); +INLINE bool isRuntimeUninitialized(); +INLINE bool isRuntimeInitialized(); + +//////////////////////////////////////////////////////////////////////////////// +// get info from machine +//////////////////////////////////////////////////////////////////////////////// + +// get low level ids of resources +INLINE int GetThreadIdInBlock(); +INLINE int GetBlockIdInKernel(); +INLINE int GetNumberOfBlocksInKernel(); +INLINE int GetNumberOfThreadsInBlock(); + +// get global ids to locate tread/team info (constant regardless of OMP) +INLINE int GetLogicalThreadIdInBlock(); +INLINE int GetMasterThreadID(); +INLINE int GetNumberOfWorkersInTeam(); + +// get OpenMP thread and team ids +INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode, + bool isRuntimeUninitialized); // omp_thread_num +INLINE int GetOmpTeamId(); // omp_team_num + +// get OpenMP number of threads and team +INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode, + bool isRuntimeUninitialized); // omp_num_threads +INLINE int GetNumberOfOmpTeams(); // omp_num_teams + +// get OpenMP number of procs +INLINE int GetNumberOfProcsInTeam(); +INLINE int GetNumberOfProcsInDevice(); + +// masters +INLINE int IsTeamMaster(int ompThreadId); + +//////////////////////////////////////////////////////////////////////////////// +// Memory +//////////////////////////////////////////////////////////////////////////////// + +// safe alloc and free +INLINE void *SafeMalloc(size_t size, const char *msg); // check if success +INLINE void *SafeFree(void *ptr, const char *msg); +// pad to a alignment (power of 2 only) +INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment); +#define ADD_BYTES(_addr, _bytes) \ + ((void *)((char *)((void *)(_addr)) + (_bytes))) +#define SUB_BYTES(_addr, _bytes) \ + ((void *)((char *)((void *)(_addr)) - (_bytes))) + +//////////////////////////////////////////////////////////////////////////////// +// Named Barrier Routines +//////////////////////////////////////////////////////////////////////////////// +INLINE void named_sync(const int barrier, const int num_threads); + +//////////////////////////////////////////////////////////////////////////////// +// Teams Reduction Scratchpad Helpers +//////////////////////////////////////////////////////////////////////////////// +INLINE unsigned int *GetTeamsReductionTimestamp(); +INLINE char *GetTeamsReductionScratchpad(); +INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr); Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -0,0 +1,222 @@ +//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// Wrapper implementation to some functions natively supported by the GPU. +// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// Execution Parameters +//////////////////////////////////////////////////////////////////////////////// + +INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { + execution_param = EMode; + execution_param |= RMode; +} + +INLINE bool isGenericMode() { + return (execution_param & ModeMask) == Generic; +} + +INLINE bool isSPMDMode() { + return (execution_param & ModeMask) == Spmd; +} + +INLINE bool isRuntimeUninitialized() { + return (execution_param & RuntimeMask) == RuntimeUninitialized; +} + +INLINE bool isRuntimeInitialized() { + return (execution_param & RuntimeMask) == RuntimeInitialized; +} + +//////////////////////////////////////////////////////////////////////////////// +// support: get info from machine +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// +// Calls to the NVPTX layer (assuming 1D layout) +// +//////////////////////////////////////////////////////////////////////////////// + +INLINE int GetThreadIdInBlock() { return threadIdx.x; } + +INLINE int GetBlockIdInKernel() { return blockIdx.x; } + +INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } + +INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } + +//////////////////////////////////////////////////////////////////////////////// +// +// Calls to the Generic Scheme Implementation Layer (assuming 1D layout) +// +//////////////////////////////////////////////////////////////////////////////// + +// The master thread id is the first thread (lane) of the last warp. +// Thread id is 0 indexed. +// E.g: If NumThreads is 33, master id is 32. +// If NumThreads is 64, master id is 32. +// If NumThreads is 97, master id is 96. +// If NumThreads is 1024, master id is 992. +// +// Called in Generic Execution Mode only. +INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(warpSize - 1); } + +// The last warp is reserved for the master; other warps are workers. +// Called in Generic Execution Mode only. +INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } + +//////////////////////////////////////////////////////////////////////////////// +// get thread id in team + +// This function may be called in a parallel region by the workers +// or a serial region by the master. If the master (whose CUDA thread +// id is GetMasterThreadID()) calls this routine, we return 0 because +// it is a shadow for the first worker. +INLINE int GetLogicalThreadIdInBlock() { +// return GetThreadIdInBlock() % GetMasterThreadID(); + + // Implemented using control flow (predication) instead of with a modulo + // operation. 
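+  // In Generic mode the master (first lane of the last warp) is mapped to
+  // logical id 0 because it acts as a shadow for the first worker; all other
+  // threads keep their CUDA thread id.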
+ int tid = GetThreadIdInBlock(); + if (isGenericMode() && tid >= GetMasterThreadID()) + return 0; + else + return tid; +} + +//////////////////////////////////////////////////////////////////////////////// +// +// OpenMP Thread Support Layer +// +//////////////////////////////////////////////////////////////////////////////// + +INLINE int GetOmpThreadId(int threadId, + bool isSPMDExecutionMode, + bool isRuntimeUninitialized) { + // omp_thread_num + int rc; + + if (isRuntimeUninitialized) { + rc = GetThreadIdInBlock(); + if (!isSPMDExecutionMode && rc >= GetMasterThreadID()) + rc = 0; + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + rc = currTaskDescr->ThreadId(); + } + return rc; +} + +INLINE int GetNumberOfOmpThreads(int threadId, + bool isSPMDExecutionMode, + bool isRuntimeUninitialized) { + // omp_num_threads + int rc; + + if (isRuntimeUninitialized) { + rc = isSPMDExecutionMode ? GetNumberOfThreadsInBlock() + : GetNumberOfThreadsInBlock() - warpSize; + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + threadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + rc = currTaskDescr->ThreadsInTeam(); + } + + return rc; +} + +//////////////////////////////////////////////////////////////////////////////// +// Team id linked to OpenMP + +INLINE int GetOmpTeamId() { + // omp_team_num + return GetBlockIdInKernel(); // assume 1 block per team +} + +INLINE int GetNumberOfOmpTeams() { + // omp_num_teams + return GetNumberOfBlocksInKernel(); // assume 1 block per team +} + +//////////////////////////////////////////////////////////////////////////////// +// Masters + +INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } + +//////////////////////////////////////////////////////////////////////////////// +// get OpenMP number of procs + +// Get the number of processors in the device. 
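+// In Generic mode the last warp is reserved for the master thread, so only
+// the workers count; in SPMD mode every thread in the block counts.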
+INLINE int GetNumberOfProcsInDevice() {
+  if (isGenericMode())
+    return GetNumberOfWorkersInTeam();
+  else
+    return GetNumberOfThreadsInBlock();
+}
+
+INLINE int GetNumberOfProcsInTeam() {
+  return GetNumberOfProcsInDevice();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE unsigned long PadBytes(unsigned long size,
+                              unsigned long alignment) // must be a power of 2
+{
+  // compute the necessary padding to satisfy the alignment constraint
+  ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0,
+         "alignment %ld is not a power of 2\n", alignment);
+  return (~(unsigned long)size + 1) & (alignment - 1);
+}
+
+INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
+{
+  void *ptr = malloc(size);
+  PRINT(LD_MEM, "malloc data of size %d for %s: 0x%llx\n", size, msg, P64(ptr));
+  ASSERT(LT_SAFETY, ptr, "failed to allocate %d bytes for %s\n", size, msg);
+  return ptr;
+}
+
+INLINE void *SafeFree(void *ptr, const char *msg) {
+  PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", P64(ptr), msg);
+  free(ptr);
+  return NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Named Barrier Routines
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void named_sync(const int barrier, const int num_threads) {
+  asm volatile("bar.sync %0, %1;" : : "r"(barrier), "r"(num_threads) : "memory" );
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Teams Reduction Scratchpad Helpers
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE unsigned int *GetTeamsReductionTimestamp() {
+  return static_cast<unsigned int *>(ReductionScratchpadPtr);
+}
+
+INLINE char *GetTeamsReductionScratchpad() {
+  return static_cast<char *>(ReductionScratchpadPtr) + 256;
+}
+
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
+  ReductionScratchpadPtr = ScratchpadPtr;
+}
Index: libomptarget/deviceRTLs/nvptx/src/sync.cu
===================================================================
--- /dev/null
+++ libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -0,0 +1,147 @@
+//===------------ sync.cu - NVPTX OpenMP synchronizations ------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Include all synchronization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP Ordered calls
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_ordered\n");
+}
+
+EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_end_ordered\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP Barriers
+////////////////////////////////////////////////////////////////////////////////
+
+// A team is a block: we can use the CUDA-native synchronization mechanism.
+// FIXME: what if not all threads (warps) participate in the barrier?
+// We may need to implement it differently + +EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) { + PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); + __syncthreads(); + PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); + return 0; +} + +EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) { + tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); + if (!currTaskDescr->InL2OrHigherParallelRegion()) { + int numberOfActiveOMPThreads = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < warpSize workers. In this case only 1 worker + // is started, so we don't need a barrier. + if (numberOfActiveOMPThreads > 1) { +#endif + // The #threads parameter must be rounded up to the warpSize. + int threads = warpSize * ((numberOfActiveOMPThreads + warpSize - 1) / warpSize); + + PRINT(LD_SYNC, "call kmpc_barrier with %d omp threads, sync parameter %d\n", + numberOfActiveOMPThreads, threads); + // Barrier #1 is for synchronization among active threads. + named_sync(L1_BARRIER, threads); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + } // numberOfActiveOMPThreads > 1 +#endif + } + PRINT0(LD_SYNC, "completed kmpc_barrier\n"); +} + +// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0 parallel +// region and that all worker threads participate. +EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid) { + PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n"); + __syncthreads(); + PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n"); +} + +// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0 parallel +// region and that all worker threads participate. +EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid) { + int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - warpSize; + // The #threads parameter must be rounded up to the warpSize. + int threads = warpSize * ((numberOfActiveOMPThreads + warpSize - 1) / warpSize); + + PRINT(LD_SYNC, "call kmpc_barrier_simple_generic with %d omp threads, sync parameter %d\n", + numberOfActiveOMPThreads, threads); + // Barrier #1 is for synchronization among active threads. 
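+  // named_sync lowers to a PTX bar.sync on the given named barrier with the
+  // warp-rounded thread count (see supporti.h).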
+ named_sync(L1_BARRIER, threads); + PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP MASTER +//////////////////////////////////////////////////////////////////////////////// + +INLINE int32_t IsMaster() { + // only the team master updates the state + int tid = GetLogicalThreadIdInBlock(); + int ompThreadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); + return IsTeamMaster(ompThreadId); +} + +EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_master\n"); + return IsMaster(); +} + +EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_end_master\n"); + ASSERT0(LT_FUSSY, IsMaster(), "expected only master here"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP SINGLE +//////////////////////////////////////////////////////////////////////////////// + +EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_single\n"); + // decide to implement single with master; master get the single + return IsMaster(); +} + +EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_end_single\n"); + // decide to implement single with master: master get the single + ASSERT0(LT_FUSSY, IsMaster(), "expected only master here"); + // sync barrier is explicitely called... so that is not a problem +} + +//////////////////////////////////////////////////////////////////////////////// +// Flush +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_flush(kmp_Indent *loc) { + PRINT0(LD_IO, "call kmpc_flush\n"); + __threadfence_block(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Vote +//////////////////////////////////////////////////////////////////////////////// + +EXTERN int32_t __kmpc_warp_active_thread_mask() { + PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); + return __ACTIVEMASK(); +} Index: libomptarget/deviceRTLs/nvptx/src/task.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/task.cu @@ -0,0 +1,207 @@ +//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Task implementation support. +// +// explicit task structure uses +// omptarget_nvptx task +// kmp_task +// +// where kmp_task is +// - klegacy_TaskDescr <- task pointer +// shared -> X +// routine +// part_id +// descr +// - private (of size given by task_alloc call). Accessed by +// task+sizeof(klegacy_TaskDescr) +// * private data * +// - shared: X. 
Accessed by shared ptr in klegacy_TaskDescr +// * pointer table to shared variables * +// - end +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( + kmp_Indent *loc, // unused + uint32_t global_tid, // unused + int32_t flag, // unused (because in our impl, all are immediately exec + size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable, + kmp_TaskFctPtr taskSub) { + PRINT(LD_IO, "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, " + "fct 0x%llx)\n", + P64(sizeOfTaskInclPrivate), P64(sizeOfSharedTable), P64(taskSub)); + // want task+priv to be a multiple of 8 bytes + size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *)); + sizeOfTaskInclPrivate += padForTaskInclPriv; + size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; + ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, + "need task descr of size %d to be a multiple of %d\n", + sizeof(omptarget_nvptx_TaskDescr), sizeof(void *)); + size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc( + totSize, "explicit task descriptor"); + kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newKmpTaskDescr == + (uint64_t)ADD_BYTES(newExplicitTaskDescr, + sizeof(omptarget_nvptx_TaskDescr)), + "bad size assumptions"); + // init kmp_TaskDescr + newKmpTaskDescr->sharedPointerTable = + (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate); + newKmpTaskDescr->sub = taskSub; + newKmpTaskDescr->destructors = NULL; + PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", + P64(newKmpTaskDescr), P64(newExplicitTaskDescr)); + + return newKmpTaskDescr; +} + +EXTERN int32_t __kmpc_omp_task(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, + 0, 0); +} + +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList) { + PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, + newTaskDescr); + + // 3. call sub + PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", + P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr)); + newKmpTaskDescr->sub(0, newKmpTaskDescr); + PRINT(LD_TASK, "return from call task sub 0x%llx()\n", + P64(newKmpTaskDescr->sub)); + + // 4. 
pop context + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, + parentTaskDescr); + // 5. free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); + return 0; +} + +EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, + newTaskDescr); + // 3... noting to call... is inline + // 4 & 5 ... done in complete +} + +EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + // 2. get parent + omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); + // 3... noting to call... is inline + // 4. pop context + int tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, + parentTaskDescr); + // 5. 
free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); +} + +EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, uint32_t global_tid, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN int32_t __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid, + int end_part) { + PRINT0(LD_IO, "call to __kmpc_taskyield()\n"); + // do nothing: tasks are executed immediately, no yielding allowed + return 0; +} + +EXTERN int32_t __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_taskwait()\n"); + // nothing to do as all our tasks are executed as final + return 0; +} + +EXTERN void __kmpc_taskloop(kmp_Indent *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, int if_val, uint64_t *lb, uint64_t *ub, + int64_t st, int nogroup, int32_t sched, uint64_t grainsize, + void *task_dup) { + + // skip task entirely if empty iteration space + if(*lb > *ub) return; + + // the compiler has already stored lb and ub in the kmp_TaskDescr structure + // as we are using a single task to execute the entire loop, we can leave + // the initial task_t untouched + + __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); +}
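
Note (not part of the patch): the warp-level step that gpu_regular_warp_reduce drives through the compiler-supplied shflFct is the standard shuffle-down reduction tree. Below is a minimal standalone CUDA sketch of that pattern only; the kernel, the warpReduceSum helper, and the host driver are illustrative names, not RTL entry points, and the sketch assumes a full warp of 32 active lanes, which is the case the regular (AlgoVersion 0) path handles.

#include <cstdio>

// Reduce 'val' across the 32 lanes of a full warp with a shuffle-down tree,
// halving the number of participating lanes each step; lane 0 ends up with
// the warp-wide sum.
__device__ int warpReduceSum(int val) {
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);
  return val;
}

__global__ void sumKernel(const int *in, int *out) {
  int v = warpReduceSum(in[threadIdx.x]);
  if (threadIdx.x % 32 == 0) // each warp master holds its warp's result
    atomicAdd(out, v);
}

int main() {
  // Minimal host driver: 64 ones should reduce to 64.
  int h[64], *dIn, *dOut, result = 0;
  for (int i = 0; i < 64; ++i) h[i] = 1;
  cudaMalloc(&dIn, sizeof(h));
  cudaMalloc(&dOut, sizeof(int));
  cudaMemcpy(dIn, h, sizeof(h), cudaMemcpyHostToDevice);
  cudaMemcpy(dOut, &result, sizeof(int), cudaMemcpyHostToDevice);
  sumKernel<<<1, 64>>>(dIn, dOut);
  cudaMemcpy(&result, dOut, sizeof(int), cudaMemcpyDeviceToHost);
  printf("sum = %d\n", result);
  cudaFree(dIn);
  cudaFree(dOut);
  return 0;
}

The RTL generalizes this pattern by delegating the combine step to the shflFct callback, so any reduction type and operation supplied by the compiler can be used, and by adding the irregular-warp, inter-warp copy, and teams-scratchpad paths shown in reduction.cu above.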