diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -78,6 +78,7 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) add_subdirectory(deviceRTLs) +add_subdirectory(DeviceRTL) # Add tests. add_subdirectory(test) diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -0,0 +1,208 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the Device RTL for all toolchains that are available +# +##===----------------------------------------------------------------------===## + +# TODO: copied from NVPTX, need to be generalized. + +# By default we will not build NVPTX deviceRTL on a CUDA free system +set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL + "Whether build NVPTX deviceRTL on CUDA free system.") + +if (NOT (LIBOMPTARGET_DEP_CUDA_FOUND OR LIBOMPTARGET_BUILD_NVPTX_BCLIB)) + libomptarget_say("Not building NVPTX deviceRTL by default on CUDA free system.") + return() +endif() + +# Check if we can create an LLVM bitcode implementation of the runtime library +# that could be inlined in the user application. For that we need to find +# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and +# an LLVM linker. +set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") +set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) +elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(cuda_compiler ${CMAKE_C_COMPILER}) +else() + libomptarget_say("Not building NVPTX deviceRTL: clang not found") + return() +endif() + +# Get compiler directory to try to locate a suitable linker. +get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY) +set(llvm_link "${compiler_dir}/llvm-link") +set(opt "${compiler_dir}/opt") + +if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER}) +elseif (EXISTS ${llvm_link}) + set(bc_linker ${llvm_link}) +else() + libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found") + return() +endif() + +# TODO: This part needs to be refined when libomptarget is going to support +# Windows! +# TODO: This part can also be removed if we can change the clang driver to make +# it support device only compilation. 
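+# Determine the auxiliary (host) triple. It is passed to clang via -aux-triple
+# below so the device bitcode is compiled against the matching host environment.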
+if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + set(aux_triple x86_64-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(aux_triple powerpc64le-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(aux_triple aarch64-unknown-linux-gnu) +else() + libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + return() +endif() + +set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR}) +set(include_directory ${devicertl_base_directory}/include) +set(source_directory ${devicertl_base_directory}/src) + +set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80) + +set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING + "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") +string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES) + +if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all") + set(nvptx_sm_list ${all_capabilities}) +elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto") + if (NOT LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.") + endif() + set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH}) +else() + string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}") +endif() + +# If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the +# build. +if (NOT nvptx_sm_list) + libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list") + return() +endif() + +# Check all SM values +foreach(sm ${nvptx_sm_list}) + if (NOT ${sm} IN_LIST all_capabilities) + libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. Make sure clang can work with it.") + endif() +endforeach() + +# Override default MAX_SM in src/target_impl.h if requested +if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM) + set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}") +endif() + +# Activate RTL message dumps if requested by the user. +set(LIBOMPTARGET_DEVICE_DEBUG FALSE CACHE BOOL + "Activate NVPTX device RTL debug messages.") + +libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") + +set(src_files + ${source_directory}/Configuration.cpp + ${source_directory}/Debug.cpp + ${source_directory}/Kernel.cpp + ${source_directory}/Mapping.cpp + ${source_directory}/Misc.cpp + ${source_directory}/Parallelism.cpp + ${source_directory}/Reduction.cpp + ${source_directory}/State.cpp + ${source_directory}/Synchronization.cpp + ${source_directory}/Tasking.cpp + ${source_directory}/Utils.cpp + ${source_directory}/Workshare.cpp +) + +set(clang_opt_flags -O1 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=2048) +set(link_opt_flags -O1 -openmp-opt-disable) + +# Set flags for LLVM Bitcode compilation. +set(bc_flags -S -x c++ -std=c++17 + ${clang_opt_flags} + -target nvptx64 + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device + -Xclang -target-feature -Xclang +ptx61 + -I${include_directory} +) + +if(${LIBOMPTARGET_DEVICE_DEBUG}) + list(APPEND bc_flags -DOMPTARGET_DEBUG=-1) +else() + list(APPEND bc_flags -DOMPTARGET_DEBUG=0) +endif() + +# Create target to build all Bitcode libraries. 
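+# omptarget-new-nvptx-bc is an aggregate target; the per-compute-capability
+# bitcode libraries built in the loop below are added as its dependencies.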
+add_custom_target(omptarget-new-nvptx-bc) + +# Generate a Bitcode library for all the compute capabilities the user requested +foreach(sm ${nvptx_sm_list}) + # TODO: replace this with declare variant and isa selector. + set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0") + set(bc_files "") + foreach(src ${src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-sm_${sm}.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${cuda_compiler} ${bc_flags} + ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile} + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) + endforeach() + + set(bclib_name "libomptarget-new-nvptx-sm_${sm}.bc") + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${bc_linker} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" + ) + + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt + COMMAND ${opt} ${link_opt_flags} ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMENT "Optimizing LLVM bitcode ${bclib_name}" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) + + set(bclib_target_name "omptarget-new-nvptx-sm_${sm}-bc") + + add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt) + add_dependencies(omptarget-new-nvptx-bc ${bclib_target_name}) + + # Copy library to destination. + add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") +endforeach() diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -0,0 +1,35 @@ +//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// API to query the global (constant) device environment. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_CONFIGURATION_H +#define OMPTARGET_CONFIGURATION_H + +#include "Types.h" + +namespace _OMP { +namespace config { + +enum DebugLevel : int32_t { Assertion }; + +/// Return the number of devices in the system, same number as returned on the +/// host by omp_get_num_devices. +uint32_t getNumDevices(); + +/// Return the user choosen debug level. 
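+/// Debug features for a given level are active iff the returned value is
+/// greater than that level (see isDebugMode below).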
+int32_t getDebugLevel(); + +bool isDebugMode(DebugLevel Level); + +} // namespace config +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Debug.h @@ -0,0 +1,30 @@ +//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_DEBUG_H +#define OMPTARGET_DEVICERTL_DEBUG_H + +/// Assertion +/// +/// { +extern "C" { +void __assert_assume(bool cond, const char *exp, const char *file, int line); +} + +#define ASSERT(e) __assert_assume(e, #e, __FILE__, __LINE__) + +///} + +// TODO: We need to allow actual printf. +#define PRINTF(fmt, ...) (void)fmt; +#define PRINT(str) PRINTF("%s", str) + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -0,0 +1,345 @@ +//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_INTERFACE_H +#define OMPTARGET_DEVICERTL_INTERFACE_H + +#include "Types.h" + +/// External API +/// +///{ + +extern "C" { + +/// ICV: dyn-var, constant 0 +/// +/// setter: ignored. +/// getter: returns 0. +/// +///{ +void omp_set_dynamic(int); +int omp_get_dynamic(void); +///} + +/// ICV: nthreads-var, integer +/// +/// scope: data environment +/// +/// setter: ignored. +/// getter: returns false. +/// +/// implementation notes: +/// +/// +///{ +void omp_set_num_threads(int); +int omp_get_max_threads(void); +///} + +/// ICV: thread-limit-var, computed +/// +/// getter: returns thread limited defined during launch. +/// +///{ +int omp_get_thread_limit(void); +///} + +/// ICV: max-active-level-var, constant 1 +/// +/// setter: ignored. +/// getter: returns 1. +/// +///{ +void omp_set_max_active_levels(int); +int omp_get_max_active_levels(void); +///} + +/// ICV: places-partition-var +/// +/// +///{ +///} + +/// ICV: active-level-var, 0 or 1 +/// +/// getter: returns 0 or 1. +/// +///{ +int omp_get_active_level(void); +///} + +/// ICV: level-var +/// +/// getter: returns parallel region nesting +/// +///{ +int omp_get_level(void); +///} + +/// ICV: run-sched-var +/// +/// +///{ +void omp_set_schedule(omp_sched_t, int); +void omp_get_schedule(omp_sched_t *, int *); +///} + +/// TODO this is incomplete. 
+int omp_get_num_threads(void); +int omp_get_thread_num(void); +void omp_set_nested(int); + +int omp_get_nested(void); + +void omp_set_max_active_levels(int Level); + +int omp_get_max_active_levels(void); + +omp_proc_bind_t omp_get_proc_bind(void); + +int omp_get_num_places(void); + +int omp_get_place_num_procs(int place_num); + +void omp_get_place_proc_ids(int place_num, int *ids); + +int omp_get_place_num(void); + +int omp_get_partition_num_places(void); + +void omp_get_partition_place_nums(int *place_nums); + +int omp_get_cancellation(void); + +void omp_set_default_device(int deviceId); + +int omp_get_default_device(void); + +int omp_get_num_devices(void); + +int omp_get_num_teams(void); + +int omp_get_team_num(); + +int omp_get_initial_device(void); + +/// Synchronization +/// +///{ +void omp_init_lock(omp_lock_t *Lock); + +void omp_destroy_lock(omp_lock_t *Lock); + +void omp_set_lock(omp_lock_t *Lock); + +void omp_unset_lock(omp_lock_t *Lock); + +int omp_test_lock(omp_lock_t *Lock); +///} + +/// Tasking +/// +///{ +int omp_in_final(void); + +int omp_get_max_task_priority(void); +///} + +/// Misc +/// +///{ +double omp_get_wtick(void); + +double omp_get_wtime(void); +///} +} + +extern "C" { +/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be +/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be +/// called by any thread, allocation happens *per thread*. +void *__kmpc_alloc_shared(uint64_t Bytes); + +/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like +/// a stack (push/pop). Can be called by any thread. \p Ptr has to be the +/// allocated by __kmpc_alloc_shared by the same thread. +void __kmpc_free_shared(void *Ptr, uint64_t Bytes); + +/// Allocate sufficient space for \p NumArgs sequential `void*` and store the +/// allocation address in \p GlobalArgs. +/// +/// Called by the main thread prior to a parallel region. +/// +/// We also remember it in GlobalArgsPtr to ensure the worker threads and +/// deallocation function know the allocation address too. +void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs); + +/// Deallocate the memory allocated by __kmpc_begin_sharing_variables. +/// +/// Called by the main thread after a parallel region. +void __kmpc_end_sharing_variables(void **GlobalArgs, uint64_t NumArgs); + +/// Store the allocation address obtained via __kmpc_begin_sharing_variables in +/// \p GlobalArgs. +/// +/// Called by the worker threads in the parallel region (function). 
+void __kmpc_get_shared_variables(void ***GlobalArgs); + +/// Kernel +/// +///{ +int8_t __kmpc_is_spmd_exec_mode(); + +int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD, + bool UseGenericStateMachine, bool); + +void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool); + +///} + +/// Reduction +/// +///{ +void __kmpc_nvptx_end_reduce(int32_t TId); + +void __kmpc_nvptx_end_reduce_nowait(int32_t TId); + +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct); + +int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, + ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, + ListGlobalFnTy glredFct); +///} + +/// Synchronization +/// +///{ +void __kmpc_ordered(IdentTy *Loc, int32_t TId); + +void __kmpc_end_ordered(IdentTy *Loc, int32_t TId); + +int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId); + +void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId); + +void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId); + +int32_t __kmpc_master(IdentTy *Loc, int32_t TId); + +void __kmpc_end_master(IdentTy *Loc, int32_t TId); + +int32_t __kmpc_single(IdentTy *Loc, int32_t TId); + +void __kmpc_end_single(IdentTy *Loc, int32_t TId); + +void __kmpc_flush(IdentTy *Loc); + +__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); + +void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask); + +void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); + +void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); +///} + +/// Parallelism +/// +///{ +/// TODO +void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn); + +/// TODO +bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn); + +/// TODO +void __kmpc_kernel_end_parallel(); + +/// TODO +void __kmpc_serialized_parallel(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind); + +/// TODO +void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams, + int32_t ThreadLimit); + +/// TODO +uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_push_num_threads(IdentTy *Loc, int32_t, int32_t NumThreads); +///} + +/// Tasking +/// +///{ +TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t, + uint32_t TaskSizeInclPrivateValues, + uint32_t SharedValuesSize, + TaskFnTy TaskFn); + +int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int32_t, + void *, int32_t, void *); + +void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, + void *); + +void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId); + +void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId); + +int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int); + +int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId); + +void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int, + uint64_t *LowerBound, uint64_t 
*UpperBound, int64_t, int, + int32_t, uint64_t, void *); +///} + +/// Misc +/// +///{ +int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal); + +int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal); +///} + +/// Shuffle +/// +///{ +int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); +int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); +///} +} + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h @@ -0,0 +1,86 @@ +//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_MAPPING_H +#define OMPTARGET_MAPPING_H + +#include "Types.h" + +namespace _OMP { + +namespace mapping { + +#pragma omp declare target + +inline constexpr uint32_t MaxThreadsPerTeam = 1024; + +#pragma omp end declare target + +/// Initialize the mapping machinery. +void init(bool IsSPMD); + +/// Return true if the kernel is executed in SPMD mode. +bool isSPMDMode(); + +/// Return true if the kernel is executed in generic mode. +bool isGenericMode(); + +/// Return true if the executing thread is the main thread in generic mode. +bool isMainThreadInGenericMode(); + +/// Return true if the executing thread has the lowest Id of the active threads +/// in the warp. +bool isLeaderInWarp(); + +/// Return a mask describing all active threads in the warp. +LaneMaskTy activemask(); + +/// Return a mask describing all threads with a smaller Id in the warp. +LaneMaskTy lanemaskLT(); + +/// Return a mask describing all threads with a larget Id in the warp. +LaneMaskTy lanemaskGT(); + +/// Return the thread Id in the warp, in [0, getWarpSize()). +uint32_t getThreadIdInWarp(); + +/// Return the thread Id in the block, in [0, getBlockSize()). +uint32_t getThreadIdInBlock(); + +/// Return the warp id in the block. +uint32_t getWarpId(); + +/// Return the warp size, thus number of threads in the warp. +uint32_t getWarpSize(); + +/// Return the number of warps in the block. +uint32_t getNumberOfWarpsInBlock(); + +/// Return the block Id in the kernel, in [0, getKernelSize()). +uint32_t getBlockId(); + +/// Return the block size, thus number of threads in the block. +uint32_t getBlockSize(); + +/// Return the number of blocks in the kernel. +uint32_t getNumberOfBlocks(); + +/// Return the kernel size, thus number of threads in the kernel. +uint32_t getKernelSize(); + +/// Return the number of processing elements on the device. +uint32_t getNumberOfProcessorElements(); + +} // namespace mapping + +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -0,0 +1,200 @@ +//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_STATE_H +#define OMPTARGET_STATE_H + +#include "Debug.h" +#include "Types.h" + +#pragma omp declare target + +namespace _OMP { + +namespace state { + +inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE; + +/// Initialize the state machinery. Must be called by all threads. +void init(bool IsSPMD); + +/// TODO +enum ValueKind { + VK_NThreads, + VK_Level, + VK_ActiveLevel, + VK_MaxActiveLevels, + VK_RunSched, + // --- + VK_RunSchedChunk, + VK_ParallelRegionFn, + VK_ParallelTeamSize, +}; + +/// TODO +void enterDataEnvironment(); + +/// TODO +void exitDataEnvironment(); + +/// TODO +struct DateEnvironmentRAII { + DateEnvironmentRAII() { enterDataEnvironment(); } + ~DateEnvironmentRAII() { exitDataEnvironment(); } +}; + +/// TODO +void resetStateForThread(uint32_t TId); + +uint32_t &lookup32(ValueKind VK, bool IsReadonly); +void *&lookupPtr(ValueKind VK, bool IsReadonly); + +/// A class without actual state used to provide a nice interface to lookup and +/// update ICV values we can declare in global scope. +template struct Value { + __attribute__((flatten, always_inline)) operator Ty() { + return lookup(/* IsReadonly */ true); + } + + __attribute__((flatten, always_inline)) Value &operator=(const Ty &Other) { + set(Other); + return *this; + } + + __attribute__((flatten, always_inline)) Value &operator++() { + inc(1); + return *this; + } + + __attribute__((flatten, always_inline)) Value &operator--() { + inc(-1); + return *this; + } + +private: + Ty &lookup(bool IsReadonly) { + Ty &t = lookup32(Kind, IsReadonly); + return t; + } + + Ty &inc(int UpdateVal) { + return (lookup(/* IsReadonly */ false) += UpdateVal); + } + + Ty &set(Ty UpdateVal) { return (lookup(/* IsReadonly */ false) = UpdateVal); } + + template friend struct ValueRAII; +}; + +/// A mookup class without actual state used to provide +/// a nice interface to lookup and update ICV values +/// we can declare in global scope. +template struct PtrValue { + __attribute__((flatten, always_inline)) operator Ty() { + return lookup(/* IsReadonly */ true); + } + + __attribute__((flatten, always_inline)) PtrValue &operator=(const Ty Other) { + set(Other); + return *this; + } + +private: + Ty &lookup(bool IsReadonly) { return lookupPtr(Kind, IsReadonly); } + + Ty &set(Ty UpdateVal) { return (lookup(/* IsReadonly */ false) = UpdateVal); } + + template friend struct ValueRAII; +}; + +template struct ValueRAII { + ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active) + : Ptr(Active ? V.lookup(/* IsReadonly */ false) : Val), Val(OldValue), + Active(Active) { + if (!Active) + return; + ASSERT(Ptr == OldValue && "ValueRAII initialization with wrong old value!"); + Ptr = NewValue; + } + ~ValueRAII() { + if (Active) + Ptr = Val; + } + +private: + Ty &Ptr; + Ty Val; + bool Active; +}; + +/// TODO +inline state::Value RunSchedChunk; + +/// TODO +inline state::Value ParallelTeamSize; + +/// TODO +inline state::PtrValue + ParallelRegionFn; + +void runAndCheckState(void(Func(void))); + +void assumeInitialState(bool IsSPMD); + +} // namespace state + +namespace icv { + +/// TODO +inline state::Value NThreads; + +/// TODO +inline state::Value Level; + +/// The `active-level` describes which of the parallel level counted with the +/// `level-var` is active. There can only be one. 
+/// +/// active-level-var is 1, if ActiveLevelVar is not 0, otherweise it is 0. +inline state::Value ActiveLevel; + +/// TODO +inline state::Value MaxActiveLevels; + +/// TODO +inline state::Value RunSched; + +} // namespace icv + +namespace memory { + +/// Alloca \p Size bytes in shared memory, if possible, for \p Reason. +/// +/// Note: See the restrictions on __kmpc_alloc_shared for proper usage. +void *allocShared(uint64_t Size, const char *Reason); + +/// Free \p Ptr, alloated via allocShared, for \p Reason. +/// +/// Note: See the restrictions on __kmpc_free_shared for proper usage. +void freeShared(void *Ptr, uint64_t Bytes, const char *Reason); + +/// Alloca \p Size bytes in global memory, if possible, for \p Reason. +void *allocGlobal(uint64_t Size, const char *Reason); + +/// Free \p Ptr, alloated via allocGlobal, for \p Reason. +void freeGlobal(void *Ptr, const char *Reason); + +} // namespace memory + +} // namespace _OMP + +#pragma omp end declare target + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h @@ -0,0 +1,69 @@ +//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H +#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H + +#include "Types.h" + +namespace _OMP { + +namespace synchronize { + +/// Initialize the synchronization machinery. Must be called by all threads. +void init(bool IsSPMD); + +/// Synchronize all threads in a warp identified by \p Mask. +void warp(LaneMaskTy Mask); + +/// Synchronize all threads in a block. +void threads(); + +} // namespace synchronize + +namespace fence { + +/// Memory fence with \p Ordering semantics for the team. +void team(int Ordering); + +/// Memory fence with \p Ordering semantics for the contention group. +void kernel(int Ordering); + +/// Memory fence with \p Ordering semantics for the system. +void system(int Ordering); + +} // namespace fence + +namespace atomic { + +/// Atomically read \p Addr with \p Ordering semantics. +uint32_t read(uint32_t *Addr, int Ordering); + +/// Atomically store \p V to \p Addr with \p Ordering semantics. +uint32_t store(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically store \p V to \p Addr with \p Ordering semantics. +uint64_t store(uint64_t *Addr, uint64_t V, int Ordering); + +/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. +uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically add \p V to \p *Addr with \p Ordering semantics. +uint32_t add(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically add \p V to \p *Addr with \p Ordering semantics. 
+uint64_t add(uint64_t *Addr, uint64_t V, int Ordering); + +} // namespace atomic + +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Types.h @@ -0,0 +1,200 @@ +//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_TYPES_H +#define OMPTARGET_TYPES_H + +/// Base type declarations for freestanding mode +/// +///{ +using int8_t = char; +using uint8_t = unsigned char; +using int16_t = short; +using uint16_t = unsigned short; +using int32_t = int; +using uint32_t = unsigned int; +using int64_t = long; +using uint64_t = unsigned long; + +static_assert(sizeof(int8_t) == 1, "type size mismatch"); +static_assert(sizeof(uint8_t) == 1, "type size mismatch"); +static_assert(sizeof(int16_t) == 2, "type size mismatch"); +static_assert(sizeof(uint16_t) == 2, "type size mismatch"); +static_assert(sizeof(int32_t) == 4, "type size mismatch"); +static_assert(sizeof(uint32_t) == 4, "type size mismatch"); +static_assert(sizeof(int64_t) == 8, "type size mismatch"); +static_assert(sizeof(uint64_t) == 8, "type size mismatch"); +///} + +enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +}; + +enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +}; + +enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_balanced_chunk = 45, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + kmp_sched_dynamic_ordered = 67, + kmp_sched_guided_ordered = 68, + kmp_sched_runtime_ordered = 69, + kmp_sched_auto_ordered = 70, + + kmp_sched_distr_static_chunk = 91, + kmp_sched_distr_static_nochunk = 92, + kmp_sched_distr_static_chunk_sched_static_chunkone = 93, + + kmp_sched_default = kmp_sched_static_nochunk, + kmp_sched_unordered_first = kmp_sched_static_chunk, + kmp_sched_unordered_last = kmp_sched_auto, + kmp_sched_ordered_first = kmp_sched_static_ordered, + kmp_sched_ordered_last = kmp_sched_auto_ordered, + kmp_sched_distribute_first = kmp_sched_distr_static_chunk, + kmp_sched_distribute_last = + kmp_sched_distr_static_chunk_sched_static_chunkone, + + /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. + * Since we need to distinguish the three possible cases (no modifier, + * monotonic modifier, nonmonotonic modifier), we need separate bits for + * each modifier. The absence of monotonic does not imply nonmonotonic, + * especially since 4.5 says that the behaviour of the "no modifier" case + * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. + * + * Since we're passing a full 32 bit value, we can use a couple of high + * bits for these flags; out of paranoia we avoid the sign bit. 
+ * + * These modifiers can be or-ed into non-static schedules by the compiler + * to pass the additional information. They will be stripped early in the + * processing in __kmp_dispatch_init when setting up schedules, so + * most of the code won't ever see schedules with these bits set. + */ + kmp_sched_modifier_monotonic = (1 << 29), + /**< Set if the monotonic schedule modifier was present */ + kmp_sched_modifier_nonmonotonic = (1 << 30), +/**< Set if the nonmonotonic schedule modifier was present */ + +#define SCHEDULE_WITHOUT_MODIFIERS(s) \ + (enum kmp_sched_t)( \ + (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) +#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) +#define SCHEDULE_HAS_NONMONOTONIC(s) \ + (((s)&kmp_sched_modifier_nonmonotonic) != 0) +#define SCHEDULE_HAS_NO_MODIFIERS(s) \ + (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ + 0) + +}; + +struct TaskDescriptorTy; +using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr); +struct TaskDescriptorTy { + void *Payload; + TaskFnTy TaskFn; +}; + +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +using LaneMaskTy = uint64_t; +#pragma omp end declare variant + +#pragma omp begin declare variant match( \ + device = {arch(amdgcn)}, implementation = {extension(match_none)}) +using LaneMaskTy = uint64_t; +#pragma omp end declare variant + +namespace lanes { +enum : LaneMaskTy { All = ~(LaneMaskTy)0 }; +} // namespace lanes + +/// The ident structure that describes a source location. The struct is +/// identical to the one in the kmp.h file. We maintain the same data structure +/// for compatibility. +struct IdentTy { + int32_t reserved_1; /**< might be used in Fortran; see above */ + int32_t flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC + identifies this union member */ + int32_t reserved_2; /**< not really used in Fortran any more; see above */ + int32_t reserved_3; /**< source[4] in Fortran, do not use for C++ */ + char const *psource; /**< String describing the source location. + The string is composed of semi-colon separated fields + which describe the source file, the function and a pair + of line numbers that delimit the construct. */ +}; + +using __kmpc_impl_lanemask_t = LaneMaskTy; + +using ParallelRegionFnTy = void *; + +using CriticalNameTy = int32_t[8]; + +struct omp_lock_t { + void *Lock; +}; + +using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num); +using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id, + int16_t lane_offset, int16_t shortCircuit); +using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data); + +/// Macros for allocating variables in different address spaces. +///{ + +// Follows the pattern in interface.h +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = ~(0U) +} omp_allocator_handle_t; + +#define __PRAGMA(STR) _Pragma(#STR) +#define OMP_PRAGMA(STR) __PRAGMA(omp STR) + +#define SHARED(NAME) \ + NAME [[clang::loader_uninitialized]]; \ + OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) + +// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right +// now that's not the case. 
+#define THREAD_LOCAL(NAME) \ + NAME [[clang::loader_uninitialized, clang::address_space(5)]] + +// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it +// does? +#define CONSTANT(NAME) \ + NAME [[clang::loader_uninitialized, clang::address_space(4)]] + +///} + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Utils.h b/openmp/libomptarget/DeviceRTL/include/Utils.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Utils.h @@ -0,0 +1,72 @@ +//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_UTILS_H +#define OMPTARGET_DEVICERTL_UTILS_H + +#include "Types.h" + +namespace _OMP { +namespace utils { + +/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread +/// is identified by \p Mask. +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); + +/// Return \p LowBits and \p HighBits packed into a single 64 bit value. +uint64_t pack(uint32_t LowBits, uint32_t HighBits); + +/// Unpack \p Val into \p LowBits and \p HighBits. +void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); + +/// Round up \p V to a \p Boundary. +template inline Ty roundUp(Ty V, Ty Boundary) { + return (V + Boundary - 1) / Boundary * Boundary; +} + +/// Advance \p Ptr by \p Bytes bytes. +template inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) { + return reinterpret_cast(reinterpret_cast(Ptr) + Bytes); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint32_t V) { + static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); + return __builtin_ffs(V); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_ffsl(V); +} + +/// Return the number of bits set in \p V. +inline uint32_t popc(uint32_t V) { + static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch"); + return __builtin_popcount(V); +} + +/// Return the number of bits set in \p V. 
+inline uint32_t popc(uint64_t V) { + static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch"); + return __builtin_popcountl(V); +} + +#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) +#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) + +} // namespace utils +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen b/openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen @@ -0,0 +1,405 @@ +case 0: +((void (*)(int32_t *, int32_t * +))fn)(&global_tid, &bound_tid +); +break; +case 1: +((void (*)(int32_t *, int32_t * +, void *))fn)(&global_tid, &bound_tid +, args[0]); +break; +case 2: +((void (*)(int32_t *, int32_t * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1]); +break; +case 3: +((void (*)(int32_t *, int32_t * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2]); +break; +case 4: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +); +break; +case 5: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4]); +break; +case 6: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5]); +break; +case 7: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6]); +break; +case 8: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +); +break; +case 9: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8]); +break; +case 10: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9]); +break; +case 11: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10]); +break; +case 12: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +); +break; +case 13: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12]); +break; +case 14: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void 
*, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13]); +break; +case 15: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14]); +break; +case 16: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +); +break; +case 17: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16]); +break; +case 18: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17]); +break; +case 19: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18]); +break; +case 20: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +); +break; +case 21: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20]); +break; +case 22: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, 
args[16], args[17], args[18], args[19] +, args[20], args[21]); +break; +case 23: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22]); +break; +case 24: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +); +break; +case 25: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24]); +break; +case 26: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25]); +break; +case 27: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26]); +break; +case 28: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +); +break; +case 29: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, 
void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28]); +break; +case 30: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28], args[29]); +break; +case 31: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28], args[29], args[30]); +break; +case 32: +((void (*)(int32_t *, int32_t * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +, void *, void *, void *, void * +))fn)(&global_tid, &bound_tid +, args[0], args[1], args[2], args[3] +, args[4], args[5], args[6], args[7] +, args[8], args[9], args[10], args[11] +, args[12], args[13], args[14], args[15] +, args[16], args[17], args[18], args[19] +, args[20], args[21], args[22], args[23] +, args[24], args[25], args[26], args[27] +, args[28], args[29], args[30], args[31] +); +break; diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -0,0 +1,44 @@ +//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the data object of the constant device environment and the +// query API. +// +//===----------------------------------------------------------------------===// + +#include "Configuration.h" +#include "State.h" +#include "Types.h" + +using namespace _OMP; + +struct DeviceEnvironmentTy { + int32_t DebugLevel; +}; + +#pragma omp declare target + +// TOOD: We want to change the name as soon as the old runtime is gone. 
+DeviceEnvironmentTy CONSTANT(omptarget_device_environment) + __attribute__((used)); + +int32_t config::getDebugLevel() { + // TODO: Implement libomptarget initialization of DeviceEnvironmentTy + return 0; +} + +uint32_t config::getNumDevices() { + // TODO: Implement libomptarget initialization of DeviceEnvironmentTy + return 1; +} + +bool config::isDebugMode(config::DebugLevel Level) { + return config::getDebugLevel() > Level; +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -0,0 +1,31 @@ +//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains debug utilities +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Configuration.h" + +using namespace _OMP; + +#pragma omp declare target + +extern "C" { +void __assert_assume(bool cond, const char *exp, const char *file, int line) { + if (!cond && config::isDebugMode(config::DebugLevel::Assertion)) { + PRINTF("ASSERTION failed: %s at %s, line %d\n", exp, file, line); + __builtin_trap(); + } + + __builtin_assume(cond); +} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -0,0 +1,111 @@ +//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the kernel entry points for the device. +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" + +using namespace _OMP; + +#pragma omp declare target + +static void inititializeRuntime(bool IsSPMD) { + // Order is important here. + synchronize::init(IsSPMD); + mapping::init(IsSPMD); + state::init(IsSPMD); +} + +/// Simple generic state machine for worker threads. +static void genericStateMachine(IdentTy *Ident) { + + uint32_t TId = mapping::getThreadIdInBlock(); + + do { + ParallelRegionFnTy WorkFn = 0; + + // Wait for the signal that we have a new work function. + synchronize::threads(); + + // Retrieve the work function from the runtime. + bool IsActive = __kmpc_kernel_parallel(&WorkFn); + + // If there is nothing more to do, break out of the state machine by + // returning to the caller. + if (!WorkFn) + return; + + if (IsActive) { + ASSERT(!mapping::isSPMDMode()); + ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId); + __kmpc_kernel_end_parallel(); + } + + synchronize::threads(); + + } while (true); +} + +extern "C" { + +/// Initialization +/// +/// \param Ident Source location identification, can be NULL. 
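+/// \return -1 for all threads in SPMD mode and for the main thread in generic
+///         mode; the thread Id in the block for generic-mode worker threads
+///         that left (or skipped) the generic state machine.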
+/// +int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD, + bool UseGenericStateMachine, bool) { + if (IsSPMD) { + inititializeRuntime(/* IsSPMD */ true); + synchronize::threads(); + } else { + inititializeRuntime(/* IsSPMD */ false); + // No need to wait since only the main threads will execute user + // code and workers will run into a barrier right away. + } + + if (IsSPMD) { + state::assumeInitialState(IsSPMD); + return -1; + } + + if (mapping::isMainThreadInGenericMode()) + return -1; + + if (UseGenericStateMachine) + genericStateMachine(Ident); + + return mapping::getThreadIdInBlock(); +} + +/// De-Initialization +/// +/// In non-SPMD, this function releases the workers trapped in a state machine +/// and also any memory dynamically allocated by the runtime. +/// +/// \param Ident Source location identification, can be NULL. +/// +void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool) { + state::assumeInitialState(IsSPMD); + if (IsSPMD) + return; + + // Signal the workers to exit the state machine and exit the kernel. + state::ParallelRegionFn = nullptr; +} + +int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -0,0 +1,221 @@ +//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "Mapping.h" +#include "State.h" +#include "Types.h" +#include "Utils.h" + +#pragma omp declare target + +using namespace _OMP; + +namespace _OMP { +namespace impl { + +/// AMDGCN Implementation +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +uint32_t getGridDim(uint32_t n, uint16_t d) { + uint32_t q = n / d; + return q + (n > q * d); +} + +uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size, + uint16_t group_size) { + uint32_t r = grid_size - group_id * group_size; + return (r < group_size) ? r : group_size; +} + +LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } + +LaneMaskTy lanemaskLT() { + uint32_t Lane = mapping::getThreadIdInWarp(); + int64_t Ballot = mapping::activemask(); + uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; + return Mask & Ballot; +} + +LaneMaskTy lanemaskGT() { + uint32_t Lane = mapping::getThreadIdInWarp(); + if (Lane == (mapping::getWarpSize() - 1)) + return 0; + int64_t Ballot = mapping::activemask(); + uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); + return Mask & Ballot; +} + +uint32_t getThreadIdInWarp() { + return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); +} + +uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } + +uint32_t getBlockSize() { + // TODO: verify this logic for generic mode. 
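+  // The last workgroup in the grid may be partial; getWorkgroupDim (above)
+  // clamps the size to the number of remaining work-items.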
+ return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(), + __builtin_amdgcn_grid_size_x(), + __builtin_amdgcn_workgroup_size_x()); +} + +uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); } + +uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); } + +uint32_t getNumberOfBlocks() { + return getGridDim(__builtin_amdgcn_grid_size_x(), + __builtin_amdgcn_workgroup_size_x()); +} + +uint32_t getNumberOfProcessorElements() { + // TODO + return mapping::getBlockSize(); +} + +uint32_t getWarpId() { + return mapping::getThreadIdInBlock() / mapping::getWarpSize(); +} + +uint32_t getWarpSize() { return 64; } + +uint32_t getNumberOfWarpsInBlock() { + return mapping::getBlockSize() / mapping::getWarpSize(); +} + +#pragma omp end declare variant +///} + +/// NVPTX Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +LaneMaskTy activemask() { + unsigned int Mask; + asm("activemask.b32 %0;" : "=r"(Mask)); + return Mask; +} + +LaneMaskTy lanemaskLT() { + __kmpc_impl_lanemask_t Res; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res)); + return Res; +} + +LaneMaskTy lanemaskGT() { + __kmpc_impl_lanemask_t Res; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res)); + return Res; +} + +uint32_t getThreadIdInWarp() { + return mapping::getThreadIdInBlock() & (mapping::getWarpSize() - 1); +} + +uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } + +uint32_t getBlockSize() { + return __nvvm_read_ptx_sreg_ntid_x() - + (!mapping::isSPMDMode() * mapping::getWarpSize()); +} + +uint32_t getKernelSize() { return __nvvm_read_ptx_sreg_nctaid_x(); } + +uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); } + +uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); } + +uint32_t getNumberOfProcessorElements() { + return __nvvm_read_ptx_sreg_ntid_x(); +} + +uint32_t getWarpId() { + return mapping::getThreadIdInBlock() / mapping::getWarpSize(); +} + +uint32_t getWarpSize() { return 32; } + +uint32_t getNumberOfWarpsInBlock() { + return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); +} + +#pragma omp end declare variant +///} + +} // namespace impl +} // namespace _OMP + +bool mapping::isMainThreadInGenericMode() { + if (mapping::isSPMDMode() || icv::Level) + return false; + + // Check if this is the last warp in the block. 
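As a worked example of the main-thread computation below (the numbers are illustrative, not mandated by the patch): with 1024 processor elements and the NVPTX warp size of 32, the main thread of a generic-mode kernel is lane 0 of the last warp.

// Illustrative only; mirrors the MainTId expression below for one configuration.
constexpr uint32_t WarpSize = 32;      // NVPTX value; AMDGCN uses 64
constexpr uint32_t NumElements = 1024; // assumed block size
constexpr uint32_t MainTId = (NumElements - 1) & ~(WarpSize - 1);
static_assert(MainTId == 992, "main thread is lane 0 of the last warp");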
+ uint32_t MainTId = (mapping::getNumberOfProcessorElements() - 1) & + ~(mapping::getWarpSize() - 1); + return mapping::getThreadIdInBlock() == MainTId; +} + +bool mapping::isLeaderInWarp() { + __kmpc_impl_lanemask_t Active = mapping::activemask(); + __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); + return utils::popc(Active & LaneMaskLT) == 0; +} + +LaneMaskTy mapping::activemask() { return impl::activemask(); } + +LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } + +LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } + +uint32_t mapping::getThreadIdInWarp() { return impl::getThreadIdInWarp(); } + +uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); } + +uint32_t mapping::getBlockSize() { return impl::getBlockSize(); } + +uint32_t mapping::getKernelSize() { return impl::getKernelSize(); } + +uint32_t mapping::getBlockId() { return impl::getBlockId(); } + +uint32_t mapping::getNumberOfBlocks() { return impl::getNumberOfBlocks(); } + +uint32_t mapping::getNumberOfProcessorElements() { + return impl::getNumberOfProcessorElements(); +} + +uint32_t mapping::getWarpId() { return impl::getWarpId(); } + +uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } + +uint32_t mapping::getNumberOfWarpsInBlock() { + return impl::getNumberOfWarpsInBlock(); +} + +/// Execution mode +/// +///{ +static int SHARED(IsSPMDMode); + +void mapping::init(bool IsSPMD) { + if (!mapping::getThreadIdInBlock()) + IsSPMDMode = IsSPMD; +} + +bool mapping::isSPMDMode() { return IsSPMDMode; } + +bool mapping::isGenericMode() { return !isSPMDMode(); } +///} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp @@ -0,0 +1,73 @@ +//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "Types.h" + +#pragma omp declare target + +namespace _OMP { +namespace impl { + +/// AMDGCN Implementation +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +double getWTick() { return ((double)1E-9); } + +double getWTime() { + // The intrinsics for measuring time have undocumented frequency + // This will probably need to be found by measurement on a number of + // architectures. Until then, return 0, which is very inaccurate as a + // timer but resolves the undefined symbol at link time. 
+ return 0; +} + +#pragma omp end declare variant + +/// NVPTX Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +double getWTick() { + // Timer precision is 1ns + return ((double)1E-9); +} + +double getWTime() { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + return (double)nsecs * getWTick(); +} + +#pragma omp end declare variant + +} // namespace impl +} // namespace _OMP + +/// Interfaces +/// +///{ + +extern "C" { +int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; } + +int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } + +double omp_get_wtick(void) { return _OMP::impl::getWTick(); } + +double omp_get_wtime(void) { return _OMP::impl::getWTime(); } +} + +///} +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -0,0 +1,198 @@ +//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Parallel implementation in the GPU. Here is the pattern: +// +// while (not finished) { +// +// if (master) { +// sequential code, decide which par loop to do, or if finished +// __kmpc_kernel_prepare_parallel() // exec by master only +// } +// syncthreads // A +// __kmpc_kernel_parallel() // exec by all +// if (this thread is included in the parallel) { +// switch () for all parallel loops +// __kmpc_kernel_end_parallel() // exec only by threads in parallel +// } +// +// +// The reason we don't exec end_parallel for the threads not included +// in the parallel loop is that for each barrier in the parallel +// region, these non-included threads will cycle through the +// syncthread A. Thus they must preserve their current threadId that +// is larger than thread in team. +// +// To make a long story short... +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +#pragma omp declare target + +namespace { + +uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { + uint32_t NThreadsICV = + NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads; + uint32_t NumThreads = mapping::getBlockSize(); + + if (NThreadsICV != 0 && NThreadsICV < NumThreads) + NumThreads = NThreadsICV; + + // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP. + if (NumThreads < mapping::getWarpSize()) + NumThreads = 1; + else + NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); + + return NumThreads; +} + +// Invoke an outlined parallel function unwrapping arguments (up to 32). 
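The argument-unwrapping switch in invokeMicrotask below includes a file generated at build time; the generator itself is not shown in this excerpt. Each case is expected to look roughly like the following sketch, which casts the outlined function to the matching arity and forwards the shared-variable pointers.

// Hypothetical excerpt of generated_microtask_cases.gen for nargs == 2.
case 2:
  ((void (*)(int32_t *, int32_t *, void *, void *))fn)(&global_tid, &bound_tid,
                                                       args[0], args[1]);
  break;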
+void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn, + void **args, int64_t nargs) { + switch (nargs) { +#include "generated_microtask_cases.gen" + default: + PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n"); + __builtin_trap(); + } +} + +} // namespace + +extern "C" { + +void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, + int32_t num_threads, int proc_bind, void *fn, + void *wrapper_fn, void **args, int64_t nargs) { + + uint32_t TId = mapping::getThreadIdInBlock(); + // Handle the serialized case first, same for SPMD/non-SPMD. + if (OMP_UNLIKELY(!if_expr || icv::Level)) { + __kmpc_serialized_parallel(ident, TId); + invokeMicrotask(TId, 0, fn, args, nargs); + __kmpc_end_serialized_parallel(ident, TId); + return; + } + + uint32_t NumThreads = determineNumberOfThreads(num_threads); + if (mapping::isSPMDMode()) { + { + state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads, + 1u, TId == 0); + state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0); + state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0); + synchronize::threads(); + + if (TId < NumThreads) + invokeMicrotask(TId, 0, fn, args, nargs); + } + synchronize::threads(); + return; + } + + // We do *not* create a new data environment because all threads in the team + // that are active are now running this parallel region. They share the + // TeamState, which has an increase level-var and potentially active-level + // set, but they do not have individual ThreadStates yet. If they ever + // modify the ICVs beyond this point a ThreadStates will be allocated. + + bool IsActiveParallelRegion = NumThreads > 1; + if (!IsActiveParallelRegion) { + state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true); + invokeMicrotask(TId, 0, fn, args, nargs); + return; + } + + void **GlobalArgs = nullptr; + if (nargs) { + __kmpc_begin_sharing_variables(&GlobalArgs, nargs); +#pragma unroll + for (int I = 0; I < nargs; I++) + GlobalArgs[I] = args[I]; + } + + { + state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads, + 1u, true); + state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, + (void *)nullptr, true); + state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true); + state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true); + + // Master signals work to activate workers. + synchronize::threads(); + // Master waits for workers to signal. + synchronize::threads(); + } + + if (nargs) + memory::freeShared(GlobalArgs, nargs * sizeof(void *), + "global args free shared"); +} + +__attribute__((noinline)) bool +__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { + // Work function and arguments for L1 parallel region. + *WorkFn = state::ParallelRegionFn; + + // If this is the termination signal from the master, quit early. + if (!*WorkFn) + return false; + + // Set to true for workers participating in the parallel region. + uint32_t TId = mapping::getThreadIdInBlock(); + bool ThreadIsActive = TId < state::ParallelTeamSize; + return ThreadIsActive; +} + +__attribute__((noinline)) void __kmpc_kernel_end_parallel() { + // In case we have modified an ICV for this thread before a ThreadState was + // created. We drop it now to not contaminate the next parallel region. 
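The ICV adjustments in __kmpc_parallel_51 above are scoped through state::ValueRAII guards. That helper is declared in State.h, which is not part of this excerpt; judging from its uses, a minimal sketch would swap in a new value on the active thread and restore the old one at scope exit.

// Sketch only; the real state::ValueRAII lives in State.h and may differ.
template <typename Ty> struct ValueRAIISketch {
  ValueRAIISketch(Ty &V, Ty NewVal, Ty OldVal, bool Active)
      : Ref(V), Old(OldVal), Active(Active) {
    if (Active)
      Ref = NewVal;
  }
  ~ValueRAIISketch() {
    if (Active)
      Ref = Old;
  }

private:
  Ty &Ref;
  Ty Old;
  bool Active;
};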
+ ASSERT(!mapping::isSPMDMode()); + uint32_t TId = mapping::getThreadIdInBlock(); + state::resetStateForThread(TId); + ASSERT(!mapping::isSPMDMode()); +} + +void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) { + state::enterDataEnvironment(); + ++icv::Level; +} + +void __kmpc_end_serialized_parallel(IdentTy *, uint32_t TId) { + state::exitDataEnvironment(); + --icv::Level; +} + +uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } + +int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } + +void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) { + icv::NThreads = NumThreads; +} + +void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, + int32_t thread_limit) {} + +void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp @@ -0,0 +1,318 @@ +//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of reduction with KMPC interface. +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +namespace { + +#pragma omp declare target + +void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { + for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { + shflFct(reduce_data, /*LaneId - not used= */ 0, + /*Offset = */ mask, /*AlgoVersion=*/0); + } +} + +void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, + uint32_t size, uint32_t tid) { + uint32_t curr_size; + uint32_t mask; + curr_size = size; + mask = curr_size / 2; + while (mask > 0) { + shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); + curr_size = (curr_size + 1) / 2; + mask = curr_size / 2; + } +} + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +static uint32_t gpu_irregular_simd_reduce(void *reduce_data, + ShuffleReductFnTy shflFct) { + uint32_t size, remote_id, physical_lane_id; + physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize(); + __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT(); + __kmpc_impl_lanemask_t Liveness = mapping::activemask(); + uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2; + __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT(); + do { + Liveness = mapping::activemask(); + remote_id = utils::ffs(Liveness & lanemask_gt); + size = utils::popc(Liveness); + logical_lane_id /= 2; + shflFct(reduce_data, /*LaneId =*/logical_lane_id, + /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); + } while (logical_lane_id % 2 == 0 && size > 1); + return (logical_lane_id == 0); +} +#endif + +static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars, + uint64_t reduce_size, + void *reduce_data, + ShuffleReductFnTy shflFct, + InterWarpCopyFnTy cpyFct, + bool 
isSPMDExecutionMode, bool) { + uint32_t BlockThreadId = mapping::getThreadIdInBlock(); + if (mapping::isMainThreadInGenericMode()) + BlockThreadId = 0; + uint32_t NumThreads = omp_get_num_threads(); + if (NumThreads == 1) + return 1; + /* + * This reduce function handles reduction within a team. It handles + * parallel regions in both L1 and L2 parallelism levels. It also + * supports Generic, SPMD, and NoOMP modes. + * + * 1. Reduce within a warp. + * 2. Warp master copies value to warp 0 via shared memory. + * 3. Warp 0 reduces to a single value. + * 4. The reduced value is available in the thread that returns 1. + */ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t WarpsNeeded = + (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); + uint32_t WarpId = mapping::getWarpId(); + + // Volta execution model: + // For the Generic execution mode a parallel region either has 1 thread and + // beyond that, always a multiple of 32. For the SPMD execution mode we may + // have any number of threads. + if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1)) + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/NumThreads % mapping::getWarpSize(), + /*LaneId=*/mapping::getThreadIdInBlock() % + mapping::getWarpSize()); + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > mapping::getWarpSize()) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + } + return BlockThreadId == 0; +#else + __kmpc_impl_lanemask_t Liveness = mapping::activemask(); + if (Liveness == lanes::All) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/utils::popc(Liveness), + /*LaneId=*/mapping::getThreadIdInBlock() % + mapping::getWarpSize()); + else { // Dispersed lanes. Only threads in L2 + // parallel region may enter here; return + // early. + return gpu_irregular_simd_reduce(reduce_data, shflFct); + } + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > mapping::getWarpSize()) { + uint32_t WarpsNeeded = + (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = BlockThreadId / mapping::getWarpSize(); + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + + return BlockThreadId == 0; + } + + // Get the OMP thread Id. This is different from BlockThreadId in the case of + // an L2 parallel region. + return TId == 0; +#endif // __CUDA_ARCH__ >= 700 +} + +uint32_t roundToWarpsize(uint32_t s) { + if (s < mapping::getWarpSize()) + return 1; + return (s & ~(unsigned)(mapping::getWarpSize() - 1)); +} + +uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? 
x : y; } + +static volatile uint32_t IterCnt = 0; +static volatile uint32_t Cnt = 0; + +} // namespace + +extern "C" { +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) { + return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data, + shflFct, cpyFct, mapping::isSPMDMode(), + false); +} + +int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, + ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, + ListGlobalFnTy glredFct) { + + // Terminate all threads in non-SPMD mode except for the master thread. + uint32_t ThreadId = mapping::getThreadIdInBlock(); + if (mapping::isGenericMode()) { + if (!mapping::isMainThreadInGenericMode()) + return 0; + ThreadId = 0; + } + + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = omp_get_num_threads(); + uint32_t TeamId = omp_get_team_num(); + uint32_t NumTeams = omp_get_num_teams(); + static unsigned SHARED(Bound); + static unsigned SHARED(ChunkTeamCount); + + // Block progress for teams greater than the current upper + // limit. We always only allow a number of teams less or equal + // to the number of slots in the buffer. + bool IsMaster = (ThreadId == 0); + while (IsMaster) { + Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST); + if (TeamId < Bound + num_of_records) + break; + } + + if (IsMaster) { + int ModBockId = TeamId % num_of_records; + if (TeamId < num_of_records) { + lgcpyFct(GlobalBuffer, ModBockId, reduce_data); + } else + lgredFct(GlobalBuffer, ModBockId, reduce_data); + + fence::system(__ATOMIC_SEQ_CST); + + // Increment team counter. + // This counter is incremented by all teams in the current + // BUFFER_SIZE chunk. + ChunkTeamCount = + atomic::inc((uint32_t *)&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST); + } + // Synchronize + if (mapping::isSPMDMode()) + __kmpc_barrier(Loc, TId); + + // reduce_data is global or shared so before being reduced within the + // warp we need to bring it in local memory: + // local_reduce_data = reduce_data[i] + // + // Example for 3 reduction variables a, b, c (of potentially different + // types): + // + // buffer layout (struct of arrays): + // a, a, ..., a, b, b, ... b, c, c, ... c + // |__________| + // num_of_records + // + // local_data_reduce layout (struct): + // a, b, c + // + // Each thread will have a local struct containing the values to be + // reduced: + // 1. do reduction within each warp. + // 2. do reduction across warps. + // 3. write the final result to the main reduction variable + // by returning 1 in the thread holding the reduction result. + + // Check if this is the very last team. + unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); + if (ChunkTeamCount == NumTeams - Bound - 1) { + // + // Last team processing. + // + if (ThreadId >= NumRecs) + return 0; + NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); + if (ThreadId >= NumThreads) + return 0; + + // Load from buffer and reduce. 
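As a concrete walk-through of the buffered protocol above (illustrative numbers, not from the patch): with num_of_records == 4 and four teams, Bound stays 0, every team master copies its partial result into its own slot through lgcpyFct, and the team whose atomic::inc returns ChunkTeamCount == 3 is the last to arrive. Only that team runs the final-reduction path below and returns 1; the other teams return 0 and leave their contribution in GlobalBuffer.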
+ glcpyFct(GlobalBuffer, ThreadId, reduce_data); + for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) + glredFct(GlobalBuffer, i, reduce_data); + + // Reduce across warps to the warp master. + if (NumThreads > 1) { + gpu_regular_warp_reduce(reduce_data, shflFct); + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); + if (ActiveThreads > mapping::getWarpSize()) { + uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId / mapping::getWarpSize(); + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + ThreadId); + } + } + + if (IsMaster) { + Cnt = 0; + IterCnt = 0; + return 1; + } + return 0; + } + if (IsMaster && ChunkTeamCount == num_of_records - 1) { + // Allow SIZE number of teams to proceed writing their + // intermediate results to the global buffer. + atomic::add((uint32_t *)&IterCnt, uint32_t(num_of_records), + __ATOMIC_SEQ_CST); + } + + return 0; +} + +void __kmpc_nvptx_end_reduce(int32_t TId) {} + +void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -0,0 +1,519 @@ +//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "State.h" +#include "Configuration.h" +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +#pragma omp declare target + +/// Memory implementation +/// +///{ + +namespace { + +/// Fallback implementations are missing to trigger a link time error. +/// Implementations for new devices, including the host, should go into a +/// dedicated begin/end declare variant. +/// +///{ + +extern "C" { +void *malloc(uint64_t Size); +void free(void *Ptr); +} + +///} + +/// AMDGCN implementations of the shuffle sync idiom. +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +extern "C" { +void *malloc(uint64_t Size) { + // TODO: Use some preallocated space for dynamic malloc. + return nullptr; +} + +void free(void *Ptr) {} +} + +#pragma omp end declare variant +///} + +/// Add worst-case padding so that future allocations are properly aligned. +constexpr const uint32_t Alignment = 8; + +/// A "smart" stack in shared memory. +/// +/// The stack exposes a malloc/free interface but works like a stack internally. +/// In fact, it is a separate stack *per warp*. That means, each warp must push +/// and pop symmetrically or this breaks, badly. The implementation will (aim +/// to) detect non-lock-step warps and fallback to malloc/free. The same will +/// happen if a warp runs out of memory. The master warp in generic memory is +/// special and is given more memory than the rest. 
+/// +struct SharedMemorySmartStackTy { + /// Initialize the stack. Must be called by all threads. + void init(bool IsSPMD); + + /// Allocate \p Bytes on the stack for the encountering thread. Each thread + /// can call this function. + void *push(uint64_t Bytes); + + /// Deallocate the last allocation made by the encountering thread and pointed + /// to by \p Ptr from the stack. Each thread can call this function. + void pop(void *Ptr, uint32_t Bytes); + +private: + /// Compute the size of the storage space reserved for a thread. + uint32_t computeThreadStorageTotal() { + uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements(); + return (state::SharedScratchpadSize - NumLanesInBlock + 1) / + NumLanesInBlock; + } + + /// Return the top address of the warp data stack, that is the first address + /// this warp will allocate memory at next. + void *getThreadDataTop(uint32_t TId) { + return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; + } + + /// The actual storage, shared among all warps. + unsigned char Data[state::SharedScratchpadSize] + __attribute__((aligned(Alignment))); + unsigned char Usage[mapping::MaxThreadsPerTeam] + __attribute__((aligned(Alignment))); +}; + +static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, + "Shared scratchpad of this size not supported yet."); + +/// The allocation of a single shared memory scratchpad. +static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack); + +void SharedMemorySmartStackTy::init(bool IsSPMD) { + Usage[mapping::getThreadIdInBlock()] = 0; +} + +void *SharedMemorySmartStackTy::push(uint64_t Bytes) { + // First align the number of requested bytes. + uint64_t AlignedBytes = (Bytes + (Alignment - 1)) / Alignment * Alignment; + + uint32_t StorageTotal = computeThreadStorageTotal(); + + // The main thread in generic mode gets the space of its entire warp as the + // other threads do not participate in any computation at all. 
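To make the stack arithmetic above concrete (illustrative request size, not from the patch): a 13-byte __kmpc_alloc_shared request is first rounded up to 16 bytes, and in generic mode the main thread's per-thread slice is additionally multiplied by the warp size, which is what the check below implements.

// Illustrative only; mirrors the rounding used by push() and pop().
constexpr uint64_t Align = 8;      // matches the Alignment constant above
constexpr uint64_t Requested = 13; // assumed request size
constexpr uint64_t Rounded = (Requested + (Align - 1)) / Align * Align;
static_assert(Rounded == 16, "requests are rounded up to the alignment");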
+ if (mapping::isMainThreadInGenericMode()) + StorageTotal *= mapping::getWarpSize(); + + int TId = mapping::getThreadIdInBlock(); + if (Usage[TId] + AlignedBytes <= StorageTotal) { + void *Ptr = getThreadDataTop(TId); + Usage[TId] += AlignedBytes; + return Ptr; + } + + return memory::allocGlobal(AlignedBytes, + "Slow path shared memory allocation, insufficient " + "shared memory stack memory!"); +} + +void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) { + uint64_t AlignedBytes = (Bytes + (Alignment - 1)) / Alignment * Alignment; + if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) { + int TId = mapping::getThreadIdInBlock(); + Usage[TId] -= AlignedBytes; + return; + } + memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); +} + +} // namespace + +void *memory::allocShared(uint64_t Bytes, const char *Reason) { + return SharedMemorySmartStack.push(Bytes); +} + +void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { + SharedMemorySmartStack.pop(Ptr, Bytes); +} + +void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { + return malloc(Bytes); +} + +void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } + +///} + +namespace { + +struct ICVStateTy { + uint32_t NThreadsVar; + uint32_t LevelVar; + uint32_t ActiveLevelVar; + uint32_t MaxActiveLevelsVar; + uint32_t RunSchedVar; + uint32_t RunSchedChunkVar; + + bool operator==(const ICVStateTy &Other) const; + + void assertEqual(const ICVStateTy &Other) const; +}; + +bool ICVStateTy::operator==(const ICVStateTy &Other) const { + return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) & + (ActiveLevelVar == Other.ActiveLevelVar) & + (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) & + (RunSchedVar == Other.RunSchedVar) & + (RunSchedChunkVar == Other.RunSchedChunkVar); +} + +void ICVStateTy::assertEqual(const ICVStateTy &Other) const { + ASSERT(NThreadsVar == Other.NThreadsVar); + ASSERT(LevelVar == Other.LevelVar); + ASSERT(ActiveLevelVar == Other.ActiveLevelVar); + ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar); + ASSERT(RunSchedVar == Other.RunSchedVar); + ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar); +} + +struct TeamStateTy { + /// TODO: provide a proper init function. + void init(bool IsSPMD); + + bool operator==(const TeamStateTy &) const; + + void assertEqual(TeamStateTy &Other) const; + + /// ICVs + /// + /// Preallocated storage for ICV values that are used if the threads have not + /// set a custom default. The latter is supported but unlikely and slow(er). + /// + ///{ + ICVStateTy ICVState; + ///} + + uint32_t ParallelTeamSize; + ParallelRegionFnTy ParallelRegionFnVar; +}; + +TeamStateTy SHARED(TeamState); + +void TeamStateTy::init(bool IsSPMD) { + ICVState.NThreadsVar = mapping::getBlockSize(); + ICVState.LevelVar = 0; + ICVState.ActiveLevelVar = 0; + ICVState.MaxActiveLevelsVar = 1; + ICVState.RunSchedVar = omp_sched_static; + ICVState.RunSchedChunkVar = 1; + ParallelTeamSize = 1; + ParallelRegionFnVar = nullptr; +} + +bool TeamStateTy::operator==(const TeamStateTy &Other) const { + return (ICVState == Other.ICVState) & + (ParallelTeamSize == Other.ParallelTeamSize); +} + +void TeamStateTy::assertEqual(TeamStateTy &Other) const { + ICVState.assertEqual(Other.ICVState); + ASSERT(ParallelTeamSize == Other.ParallelTeamSize); +} + +struct ThreadStateTy { + + /// ICVs have preallocated storage in the TeamStateTy which is used if a + /// thread has not set a custom value. The latter is supported but unlikely. 
+ /// When it happens we will allocate dynamic memory to hold the values of all + /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an + /// ICV struct to hold them all. This is slower than alternatives but allows + /// users to pay only for what they use. + /// + ICVStateTy ICVState; + + ThreadStateTy *PreviousThreadState; + + void init() { + ICVState = TeamState.ICVState; + PreviousThreadState = nullptr; + } + + void init(ThreadStateTy &PreviousTS) { + ICVState = PreviousTS.ICVState; + PreviousThreadState = &PreviousTS; + } +}; + +__attribute__((loader_uninitialized)) +ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; +#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) + +uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) { + if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0)) + return TeamState.ICVState.*Var; + uint32_t TId = mapping::getThreadIdInBlock(); + if (!ThreadStates[TId]) { + ThreadStates[TId] = reinterpret_cast(memory::allocGlobal( + sizeof(ThreadStateTy), "ICV modification outside data environment")); + ThreadStates[TId]->init(); + } + return ThreadStates[TId]->ICVState.*Var; +} + +uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) { + uint32_t TId = mapping::getThreadIdInBlock(); + if (OMP_UNLIKELY(ThreadStates[TId])) + return ThreadStates[TId]->ICVState.*Var; + return TeamState.ICVState.*Var; +} +uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) { + uint64_t TId = mapping::getThreadIdInBlock(); + if (OMP_UNLIKELY(ThreadStates[TId])) + return ThreadStates[TId]->ICVState.*Var; + return TeamState.ICVState.*Var; +} + +int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, + int OutOfBoundsVal = -1) { + if (Level == 0) + return DefaultVal; + int LevelVar = omp_get_level(); + if (OMP_UNLIKELY(Level < 0 || Level > LevelVar)) + return OutOfBoundsVal; + int ActiveLevel = icv::ActiveLevel; + if (OMP_UNLIKELY(Level != ActiveLevel)) + return DefaultVal; + return Val; +} + +} // namespace + +uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) { + switch (Kind) { + case state::VK_NThreads: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::NThreadsVar); + return lookupForModify32Impl(&ICVStateTy::NThreadsVar); + case state::VK_Level: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::LevelVar); + return lookupForModify32Impl(&ICVStateTy::LevelVar); + case state::VK_ActiveLevel: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::ActiveLevelVar); + return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar); + case state::VK_MaxActiveLevels: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar); + return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar); + case state::VK_RunSched: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::RunSchedVar); + return lookupForModify32Impl(&ICVStateTy::RunSchedVar); + case state::VK_RunSchedChunk: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::RunSchedChunkVar); + return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar); + case state::VK_ParallelTeamSize: + return TeamState.ParallelTeamSize; + default: + break; + } + __builtin_unreachable(); +} + +void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) { + switch (Kind) { + case state::VK_ParallelRegionFn: + return TeamState.ParallelRegionFnVar; + default: + break; + } + __builtin_unreachable(); +} + +void state::init(bool IsSPMD) { + SharedMemorySmartStack.init(IsSPMD); + if (!mapping::getThreadIdInBlock()) + TeamState.init(IsSPMD); + + ThreadStates[mapping::getThreadIdInBlock()] = nullptr; +} + 
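The icv::NThreads, icv::Level, and similar handles used throughout these files are declared in State.h, which is not part of this excerpt. Their behaviour in this patch (plain reads go through the read-only path, assignments and increments go through the modifying path that may lazily allocate a ThreadStateTy) suggests a thin proxy over state::lookup32, sketched here under that assumption.

// Sketch only; the real definition lives in State.h and may differ.
struct ValueProxySketch {
  state::ValueKind Kind;
  operator uint32_t() const { return state::lookup32(Kind, /*IsReadonly=*/true); }
  ValueProxySketch &operator=(uint32_t V) {
    state::lookup32(Kind, /*IsReadonly=*/false) = V;
    return *this;
  }
  ValueProxySketch &operator++() { return *this = uint32_t(*this) + 1; }
  ValueProxySketch &operator--() { return *this = uint32_t(*this) - 1; }
};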
+void state::enterDataEnvironment() { + unsigned TId = mapping::getThreadIdInBlock(); + ThreadStateTy *NewThreadState = + static_cast(__kmpc_alloc_shared(sizeof(ThreadStateTy))); + NewThreadState->init(*ThreadStates[TId]); + ThreadStates[TId] = NewThreadState; +} + +void state::exitDataEnvironment() { + unsigned TId = mapping::getThreadIdInBlock(); + resetStateForThread(TId); +} + +void state::resetStateForThread(uint32_t TId) { + if (OMP_LIKELY(!ThreadStates[TId])) + return; + + ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState; + __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy)); + ThreadStates[TId] = PreviousThreadState; +} + +void state::runAndCheckState(void(Func(void))) { + TeamStateTy OldTeamState = TeamState; + OldTeamState.assertEqual(TeamState); + + Func(); + + OldTeamState.assertEqual(TeamState); +} + +void state::assumeInitialState(bool IsSPMD) { + TeamStateTy InitialTeamState; + InitialTeamState.init(IsSPMD); + InitialTeamState.assertEqual(TeamState); + ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]); + ASSERT(mapping::isSPMDMode() == IsSPMD); +} + +extern "C" { +void omp_set_dynamic(int V) {} + +int omp_get_dynamic(void) { return 0; } + +void omp_set_num_threads(int V) { icv::NThreads = V; } + +int omp_get_max_threads(void) { return icv::NThreads; } + +int omp_get_level(void) { + int LevelVar = icv::Level; + ASSERT(LevelVar >= 0); + return LevelVar; +} + +int omp_get_active_level(void) { return !!icv::ActiveLevel; } + +int omp_in_parallel(void) { return !!icv::ActiveLevel; } + +void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { + *ScheduleKind = static_cast((int)icv::RunSched); + *ChunkSize = state::RunSchedChunk; +} + +void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { + icv::RunSched = (int)ScheduleKind; + state::RunSchedChunk = ChunkSize; +} + +int omp_get_ancestor_thread_num(int Level) { + return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); +} + +int omp_get_thread_num(void) { + return omp_get_ancestor_thread_num(omp_get_level()); +} + +int omp_get_team_size(int Level) { + return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1); +} + +int omp_get_num_threads(void) { return state::ParallelTeamSize; } + +int omp_get_thread_limit(void) { return mapping::getKernelSize(); } + +int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } + +void omp_set_nested(int) {} + +int omp_get_nested(void) { return false; } + +void omp_set_max_active_levels(int Levels) { + icv::MaxActiveLevels = Levels > 0 ? 
1 : 0; +} + +int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } + +omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } + +int omp_get_num_places(void) { return 0; } + +int omp_get_place_num_procs(int) { return omp_get_num_procs(); } + +void omp_get_place_proc_ids(int, int *) { + // TODO +} + +int omp_get_place_num(void) { return 0; } + +int omp_get_partition_num_places(void) { return 0; } + +void omp_get_partition_place_nums(int *) { + // TODO +} + +int omp_get_cancellation(void) { return 0; } + +void omp_set_default_device(int) {} + +int omp_get_default_device(void) { return -1; } + +int omp_get_num_devices(void) { return config::getNumDevices(); } + +int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); } + +int omp_get_team_num() { return mapping::getBlockId(); } + +int omp_get_initial_device(void) { return -1; } +} + +extern "C" { +__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) { + return memory::allocShared(Bytes, "Frontend alloc shared"); +} + +__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { + memory::freeShared(Ptr, Bytes, "Frontend free shared"); +} + +[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr; +#pragma omp allocate(SharedMemVariableSharingSpacePtr) \ + allocator(omp_pteam_mem_alloc) + +void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs) { + SharedMemVariableSharingSpacePtr = + (void **)__kmpc_alloc_shared(sizeof(void *) * NumArgs); + *GlobalArgs = SharedMemVariableSharingSpacePtr; +} + +void __kmpc_end_sharing_variables(void **GlobalArgsPtr, uint64_t NumArgs) { + __kmpc_free_shared(SharedMemVariableSharingSpacePtr, + sizeof(void *) * NumArgs); +} + +void __kmpc_get_shared_variables(void ***GlobalArgs) { + *GlobalArgs = SharedMemVariableSharingSpacePtr; +} +} +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -0,0 +1,314 @@ +//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Include all synchronization. +// +//===----------------------------------------------------------------------===// + +#include "Synchronization.h" + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Types.h" +#include "Utils.h" + +#pragma omp declare target + +using namespace _OMP; + +namespace impl { + +/// Atomics +/// +///{ +/// NOTE: This function needs to be implemented by every target. 
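The reason atomicInc is singled out is that it has CUDA-style wrap-around semantics rather than plain fetch-add behaviour. For intuition only, a non-atomic reference version would read as follows; the target-specific implementations below map this onto hardware builtins.

// Reference semantics of atomicInc, for illustration only (not atomic and not
// part of the runtime): return the old value, increment it, and wrap to zero
// once the old value has reached Val.
static uint32_t atomicIncReference(uint32_t *Address, uint32_t Val) {
  uint32_t Old = *Address;
  *Address = (Old >= Val) ? 0 : Old + 1;
  return Old;
}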
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+
+uint32_t atomicRead(uint32_t *Address, int Ordering) {
+  return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
+}
+
+uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+uint32_t atomicMax(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __atomic_fetch_max(Address, Val, Ordering);
+}
+
+uint32_t atomicExchange(uint32_t *Address, uint32_t Val, int Ordering) {
+  uint32_t R;
+  __atomic_exchange(Address, &Val, &R, Ordering);
+  return R;
+}
+uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val,
+                   int Ordering) {
+  (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering,
+                                  Ordering);
+  return Compare;
+}
+
+uint64_t atomicAdd(uint64_t *Address, uint64_t Val, int Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+///}
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
+}
+
+uint32_t SHARED(namedBarrierTracker);
+
+void namedBarrierInit() {
+  // Don't have global ctors, and shared memory is not zero init
+  atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE);
+}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  // assert(NumThreads % 32 == 0);
+
+  uint32_t WarpSize = mapping::getWarpSize();
+  uint32_t NumWaves = NumThreads / WarpSize;
+
+  fence::team(__ATOMIC_ACQUIRE);
+
+  // named barrier implementation for amdgcn.
+  // Uses two 16 bit unsigned counters. One for the number of waves to have
+  // reached the barrier, and one to count how many times the barrier has been
+  // passed. These are packed in a single atomically accessed 32 bit integer.
+  // Low bits for the number of waves, assumed zero before this call.
+  // High bits to count the number of times the barrier has been passed.
+
+  // precondition: NumWaves != 0;
+  // invariant: NumWaves * WarpSize == NumThreads;
+  // precondition: NumWaves < 0xffffu;
+
+  // Increment the low 16 bits once, using the lowest active thread.
+  if (mapping::isLeaderInWarp()) {
+    uint32_t load = atomic::add(&namedBarrierTracker, 1,
+                                __ATOMIC_RELAXED); // commutative
+
+    // Record the number of times the barrier has been passed
+    uint32_t generation = load & 0xffff0000u;
+
+    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
+      // Reached NumWaves in low bits so this is the last wave.
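Worked example for the packed counter (illustrative, eight waves assumed): generation 0 starts the tracker at 0x00000000; the first seven leader lanes raise it to 0x00000007; the eighth leader's fetch-add returns 7, which equals NumWaves - 1, so it stores 0x00010000, clearing the wave count and advancing the generation bits that the remaining waves spin on below.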
+      // Set low bits to zero and increment high bits
+      load += 0x00010000u; // wrap is safe
+      load &= 0xffff0000u; // because bits zeroed second
+
+      // Reset the wave counter and release the waiting waves
+      atomic::store(&namedBarrierTracker, load, __ATOMIC_RELAXED);
+    } else {
+      // more waves still to go, spin until generation counter changes
+      do {
+        __builtin_amdgcn_s_sleep(0);
+        load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+      } while ((load & 0xffff0000u) == generation);
+    }
+  }
+  fence::team(__ATOMIC_RELEASE);
+}
+
+void syncWarp(__kmpc_impl_lanemask_t) {
+  // AMDGCN doesn't need to sync threads in a warp
+}
+
+void syncThreads() { __builtin_amdgcn_s_barrier(); }
+
+void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
+
+void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
+
+void fenceSystem(int Ordering) { __builtin_amdgcn_fence(Ordering, ""); }
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __nvvm_atom_inc_gen_ui(Address, Val);
+}
+
+void namedBarrierInit() {}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  ASSERT(NumThreads % 32 == 0);
+
+  // The named barrier for active parallel threads of a team in an L1 parallel
+  // region to synchronize with each other.
+  constexpr int BarrierNo = 7;
+  asm volatile("barrier.sync %0, %1;"
+               :
+               : "r"(BarrierNo), "r"(NumThreads)
+               : "memory");
+}
+
+void fenceTeam(int) { __nvvm_membar_cta(); }
+
+void fenceKernel(int) { __nvvm_membar_gl(); }
+
+void fenceSystem(int) { __nvvm_membar_sys(); }
+
+void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
+
+void syncThreads() {
+  constexpr int BarrierNo = 8;
+  asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
+}
+
+constexpr uint32_t OMP_SPIN = 1000;
+constexpr uint32_t UNSET = 0;
+constexpr uint32_t SET = 1;
+
+// TODO: This seems to hide a bug in the declare variant handling. If it is
+// called before it is defined here the overload won't happen. Investigate
+// later!
+void unsetLock(omp_lock_t *Lock) {
+  (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+}
+
+int testLock(omp_lock_t *Lock) {
+  return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+}
+
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void setLock(omp_lock_t *Lock) {
+  // TODO: not sure spinning is a good idea here..
+  while (atomicCAS((uint32_t *)Lock, UNSET, SET, __ATOMIC_SEQ_CST) != UNSET) {
+    int32_t start = __nvvm_read_ptx_sreg_clock();
+    int32_t now;
+    for (;;) {
+      now = __nvvm_read_ptx_sreg_clock();
+      int32_t cycles = now > start ?
now - start : now + (0xffffffff - start); + if (cycles >= OMP_SPIN * mapping::getBlockId()) { + break; + } + } + } // wait for 0 to be the read value +} + +#pragma omp end declare variant +///} + +} // namespace impl + +void synchronize::init(bool IsSPMD) { + if (!IsSPMD) + impl::namedBarrierInit(); +} + +void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); } + +void synchronize::threads() { impl::syncThreads(); } + +void fence::team(int Ordering) { impl::fenceTeam(Ordering); } + +void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); } + +void fence::system(int Ordering) { impl::fenceSystem(Ordering); } + +uint32_t atomic::read(uint32_t *Addr, int Ordering) { + return impl::atomicRead(Addr, Ordering); +} + +uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) { + return impl::atomicInc(Addr, V, Ordering); +} + +uint32_t atomic::add(uint32_t *Addr, uint32_t V, int Ordering) { + return impl::atomicAdd(Addr, V, Ordering); +} + +uint64_t atomic::add(uint64_t *Addr, uint64_t V, int Ordering) { + return impl::atomicAdd(Addr, V, Ordering); +} + +extern "C" { +void __kmpc_ordered(IdentTy *Loc, int32_t TId) {} + +void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {} + +int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) { + __kmpc_barrier(Loc, TId); + return 0; +} + +void __kmpc_barrier(IdentTy *Loc, int32_t TId) { + if (mapping::isMainThreadInGenericMode()) + return __kmpc_flush(Loc); + + if (mapping::isSPMDMode()) + return __kmpc_barrier_simple_spmd(Loc, TId); + + impl::namedBarrier(); +} + +__attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc, + int32_t TId) { + synchronize::threads(); +} + +int32_t __kmpc_master(IdentTy *Loc, int32_t TId) { + return omp_get_team_num() == 0; +} + +void __kmpc_end_master(IdentTy *Loc, int32_t TId) {} + +int32_t __kmpc_single(IdentTy *Loc, int32_t TId) { + return __kmpc_master(Loc, TId); +} + +void __kmpc_end_single(IdentTy *Loc, int32_t TId) { + // The barrier is explicitly called. +} + +void __kmpc_flush(IdentTy *Loc) { fence::kernel(__ATOMIC_SEQ_CST); } + +__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() { + return mapping::activemask(); +} + +void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { synchronize::warp(Mask); } + +void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { + omp_set_lock(reinterpret_cast(Name)); +} + +void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { + omp_unset_lock(reinterpret_cast(Name)); +} + +void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); } + +void omp_destroy_lock(omp_lock_t *Lock) { impl::destoryLock(Lock); } + +void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); } + +void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); } + +int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); } +} // extern "C" + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp @@ -0,0 +1,104 @@ +//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Task implementation support. 
+// +// TODO: We should not allocate and execute the task in two steps. A new API is +// needed for that though. +// +//===----------------------------------------------------------------------===// + +#include "Interface.h" +#include "State.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +#pragma omp declare target + +TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t, + uint64_t TaskSizeInclPrivateValues, + uint64_t SharedValuesSize, + TaskFnTy TaskFn) { + auto TaskSizeInclPrivateValuesPadded = + utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *))); + auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize; + TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( + TaskSizeTotal, "explicit task descriptor"); + TaskDescriptor->Payload = + utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded); + TaskDescriptor->TaskFn = TaskFn; + + return TaskDescriptor; +} + +int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); +} + +int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int32_t, + void *, int32_t, void *) { + state::DateEnvironmentRAII DERAII; + + TaskDescriptor->TaskFn(0, TaskDescriptor); + + memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); + return 0; +} + +void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + state::enterDataEnvironment(); +} + +void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + state::exitDataEnvironment(); + + memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); +} + +void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, + void *) {} + +void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {} + +void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {} + +int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; } + +int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; } + +void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int, + uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, + int32_t, uint64_t, void *) { + // Skip task entirely if empty iteration space. + if (*LowerBound > *UpperBound) + return; + + // The compiler has already stored lb and ub in the TaskDescriptorTy structure + // as we are using a single task to execute the entire loop, we can leave + // the initial task_t untouched + __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); +} + +int omp_in_final(void) { + // treat all tasks as final... Specs may expect runtime to keep + // track more precisely if a task was actively set by users... This + // is not explicitly specified; will treat as if runtime can + // actively decide to put a non-final task into a final one. + return 1; +} + +int omp_get_max_task_priority(void) { return 0; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -0,0 +1,141 @@ +//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils.h"
+
+#include "Interface.h"
+#include "Mapping.h"
+
+#pragma omp declare target
+
+using namespace _OMP;
+
+namespace _OMP {
+/// Helper to keep code alive without introducing a performance penalty.
+__attribute__((used, weak, optnone)) void keepAlive() {
+  __kmpc_barrier_simple_spmd(nullptr, 0);
+}
+} // namespace _OMP
+
+namespace impl {
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+  *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF));
+  *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
+}
+
+#pragma omp end declare variant
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+  uint32_t LowBitsLocal, HighBitsLocal;
+  asm("mov.b64 {%0,%1}, %2;"
+      : "=r"(LowBitsLocal), "=r"(HighBitsLocal)
+      : "l"(Val));
+  *LowBits = LowBitsLocal;
+  *HighBits = HighBitsLocal;
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+  uint64_t Val;
+  asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits));
+  return Val;
+}
+
+#pragma omp end declare variant
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  int Width = mapping::getWarpSize();
+  int Self = mapping::getThreadIdInWarp();
+  int Index = SrcLane + (Self & ~(Width - 1));
+  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
+                    int32_t Width) {
+  int Self = mapping::getThreadIdInWarp();
+  int Index = Self + LaneDelta;
+  Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ?
Self : Index; + return __builtin_amdgcn_ds_bpermute(Index << 2, Var); +} + +#pragma omp end declare variant +///} + +/// NVPTX Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); +} + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { + int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f; + return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T); +} + +#pragma omp end declare variant +} // namespace impl + +uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { + return impl::Pack(LowBits, HighBits); +} + +void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { + impl::Unpack(Val, &LowBits, &HighBits); +} + +int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { + return impl::shuffle(Mask, Var, SrcLane); +} + +int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, + int32_t Width) { + return impl::shuffleDown(Mask, Var, Delta, Width); +} + +extern "C" { +int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { + return impl::shuffleDown(lanes::All, Val, Delta, SrcLane); +} + +int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) { + uint32_t lo, hi; + utils::unpack(Val, lo, hi); + hi = impl::shuffleDown(lanes::All, hi, Delta, Width); + lo = impl::shuffleDown(lanes::All, lo, Delta, Width); + return utils::pack(lo, hi); +} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp @@ -0,0 +1,598 @@ +//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the KMPC interface +// for the loop construct plus other worksharing constructs that use the same +// interface as loops. +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +// TODO: +struct DynamicScheduleTracker { + int64_t Chunk; + int64_t LoopUpperBound; + int64_t NextLowerBound; + int64_t Stride; + kmp_sched_t ScheduleType; + DynamicScheduleTracker *NextDST; +}; + +#define ASSERT0(...) + +// used by the library for the interface with the app +#define DISPATCH_FINISHED 0 +#define DISPATCH_NOTFINISHED 1 + +// used by dynamic scheduling +#define FINISHED 0 +#define NOT_FINISHED 1 +#define LAST_CHUNK 2 + +#pragma omp declare target + +// TODO: This variable is a hack inherited from the old runtime. +uint64_t SHARED(Cnt); + +template struct omptarget_nvptx_LoopSupport { + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling with chunk + + // Generic implementation of OMP loop scheduling with static policy + /*! 
+#define ASSERT0(...)
+
+// used by the library for the interface with the app
+#define DISPATCH_FINISHED 0
+#define DISPATCH_NOTFINISHED 1
+
+// used by dynamic scheduling
+#define FINISHED 0
+#define NOT_FINISHED 1
+#define LAST_CHUNK 2
+
+#pragma omp declare target
+
+// TODO: This variable is a hack inherited from the old runtime.
+uint64_t SHARED(Cnt);
+
+template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
+  //////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling with chunk
+
+  // Generic implementation of OMP loop scheduling with static policy
+  /*! \brief Calculate the initial bounds and stride for a statically
+   *  scheduled loop.
+   *  @param[in] loc location in code of the call (not used here)
+   *  @param[in] global_tid global thread id
+   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
+   *  @param[in] plastiter pointer to last iteration
+   *  @param[in,out] plower pointer to the loop lower bound; it will contain
+   *  the lower bound of the first chunk
+   *  @param[in,out] pupper pointer to the loop upper bound; it will contain
+   *  the upper bound of the first chunk
+   *  @param[in,out] pstride pointer to the loop stride; it will contain the
+   *  stride between two successive chunks executed by the same thread
+   *  @param[in] incr loop increment bump
+   *  @param[in] chunk chunk size
+   */
+
+  // helper function for static chunk
+  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
+                             T entityId, T numberOfEntities) {
+    // each thread executes multiple chunks all of the same size, except
+    // the last one
+    // distance between two successive chunks
+    stride = numberOfEntities * chunk;
+    lb = lb + entityId * chunk;
+    T inputUb = ub;
+    ub = lb + chunk - 1; // Clang uses i <= ub
+    // Say ub' is the beginning of the last chunk. Then whoever has a
+    // lower bound plus a multiple of the increment equal to ub' is
+    // the last one.
+    T beginningLastChunk = inputUb - (inputUb % chunk);
+    last = ((beginningLastChunk - lb) % stride) == 0;
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling without chunk
+
+  // helper function for static no chunk
+  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
+                               T entityId, T numberOfEntities) {
+    // No chunk size specified. Each thread or warp gets at most one
+    // chunk; chunks are all almost of equal size
+    T loopSize = ub - lb + 1;
+
+    chunk = loopSize / numberOfEntities;
+    T leftOver = loopSize - chunk * numberOfEntities;
+
+    if (entityId < leftOver) {
+      chunk++;
+      lb = lb + entityId * chunk;
+    } else {
+      lb = lb + entityId * chunk + leftOver;
+    }
+
+    T inputUb = ub;
+    ub = lb + chunk - 1; // Clang uses i <= ub
+    last = lb <= inputUb && inputUb <= ub;
+    stride = loopSize; // make sure we only do 1 chunk per warp
+  }
+
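+  // Worked example: 100 iterations (lb=0, ub=99) over 4 threads. With
+  // chunk=10, ForStaticChunk gives thread 2 lb=20, ub=29, stride=40, i.e. it
+  // executes [20,29] and [60,69]; thread 1 owns the last chunk [90,99]. With
+  // no chunk, ForStaticNoChunk hands out one balanced chunk of 25 iterations
+  // per thread, so thread 2 gets [50,74].
+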
+  //////////////////////////////////////////////////////////////////////////////
+  // Support for Static Init
+
+  static void for_static_init(int32_t gtid, int32_t schedtype,
+                              int32_t *plastiter, T *plower, T *pupper,
+                              ST *pstride, ST chunk, bool IsSPMDExecutionMode) {
+    // When IsRuntimeUninitialized is true, we assume that the caller is
+    // in an L0 parallel region and that all worker threads participate.
+
+    // Assume we are in teams region or that we use a single block
+    // per target region
+    int numberOfActiveOMPThreads = omp_get_num_threads();
+
+    // All warps that are in excess of the maximum requested, do
+    // not execute the loop
+    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
+            "current thread is not needed here; error");
+
+    // copy
+    int lastiter = 0;
+    T lb = *plower;
+    T ub = *pupper;
+    ST stride = *pstride;
+
+    // init
+    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
+    case kmp_sched_static_chunk: {
+      if (chunk > 0) {
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+        break;
+      }
+    } // note: if chunk <=0, use nochunk
+    case kmp_sched_static_balanced_chunk: {
+      if (chunk > 0) {
+        // round up to make sure the chunk is enough to cover all iterations
+        T tripCount = ub - lb + 1; // +1 because ub is inclusive
+        T span = (tripCount + numberOfActiveOMPThreads - 1) /
+                 numberOfActiveOMPThreads;
+        // perform chunk adjustment
+        chunk = (span + chunk - 1) & ~(chunk - 1);
+
+        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
+        T oldUb = ub;
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+        if (ub > oldUb)
+          ub = oldUb;
+        break;
+      }
+    } // note: if chunk <=0, use nochunk
+    case kmp_sched_static_nochunk: {
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+      break;
+    }
+    case kmp_sched_distr_static_chunk: {
+      if (chunk > 0) {
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
+                       omp_get_num_teams());
+        break;
+      } // note: if chunk <=0, use nochunk
+    }
+    case kmp_sched_distr_static_nochunk: {
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
+                       omp_get_num_teams());
+      break;
+    }
+    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
+      ForStaticChunk(lastiter, lb, ub, stride, chunk,
+                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
+                     omp_get_num_teams() * numberOfActiveOMPThreads);
+      break;
+    }
+    default: {
+      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                     numberOfActiveOMPThreads);
+      break;
+    }
+    }
+    // copy back
+    *plastiter = lastiter;
+    *plower = lb;
+    *pupper = ub;
+    *pstride = stride;
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Support for dispatch Init
+
+  static int OrderedSchedule(kmp_sched_t schedule) {
+    return schedule >= kmp_sched_ordered_first &&
+           schedule <= kmp_sched_ordered_last;
+  }
+
+  static void dispatch_init(IdentTy *loc, int32_t threadId,
+                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
+                            DynamicScheduleTracker *DST) {
+    int tid = mapping::getThreadIdInBlock();
+    T tnum = omp_get_num_threads();
+    T tripCount = ub - lb + 1; // +1 because ub is inclusive
+    ASSERT0(LT_FUSSY, threadId < tnum,
+            "current thread is not needed here; error");
+
+    /* Currently just ignore the monotonic and non-monotonic modifiers
+     * (the compiler isn't producing them yet anyway).
+     * When it is we'll want to look at them somewhere here and use that
+     * information to add to our schedule choice. We shouldn't need to pass
+     * them on, they merely affect which schedule we can legally choose for
+     * various dynamic cases. (In particular, whether or not a stealing scheme
+     * is legal).
+     */
+    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+
+    // Process schedule.
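+    // In short: ordered schedules and trivially small loops are demoted to a
+    // single static chunk covering the whole iteration space,
+    // schedule(runtime) is resolved through omp_get_schedule(), and
+    // schedule(auto) becomes static with chunk 1.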
+    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
+      if (OrderedSchedule(schedule))
+        __kmpc_barrier(loc, threadId);
+      schedule = kmp_sched_static_chunk;
+      chunk = tripCount; // one thread gets the whole loop
+    } else if (schedule == kmp_sched_runtime) {
+      // process runtime
+      omp_sched_t rtSched;
+      int ChunkInt;
+      omp_get_schedule(&rtSched, &ChunkInt);
+      chunk = ChunkInt;
+      switch (rtSched) {
+      case omp_sched_static: {
+        if (chunk > 0)
+          schedule = kmp_sched_static_chunk;
+        else
+          schedule = kmp_sched_static_nochunk;
+        break;
+      }
+      case omp_sched_auto: {
+        schedule = kmp_sched_static_chunk;
+        chunk = 1;
+        break;
+      }
+      case omp_sched_dynamic:
+      case omp_sched_guided: {
+        schedule = kmp_sched_dynamic;
+        break;
+      }
+      }
+    } else if (schedule == kmp_sched_auto) {
+      schedule = kmp_sched_static_chunk;
+      chunk = 1;
+    } else {
+      // ASSERT(LT_FUSSY,
+      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
+      //        (long long)chunk);
+    }
+
+    // init schedules
+    if (schedule == kmp_sched_static_chunk) {
+      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+    } else if (schedule == kmp_sched_static_balanced_chunk) {
+      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      // round up to make sure the chunk is enough to cover all iterations
+      T span = (tripCount + tnum - 1) / tnum;
+      // perform chunk adjustment
+      chunk = (span + chunk - 1) & ~(chunk - 1);
+
+      T oldUb = ub;
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
+      if (ub > oldUb)
+        ub = oldUb;
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+    } else if (schedule == kmp_sched_static_nochunk) {
+      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
+      // save data
+      DST->ScheduleType = schedule;
+      if (chunk < 1)
+        chunk = 1;
+      DST->Chunk = chunk;
+      DST->LoopUpperBound = ub;
+      DST->NextLowerBound = lb;
+      __kmpc_barrier(loc, threadId);
+      if (tid == 0) {
+        Cnt = 0;
+        fence::team(__ATOMIC_SEQ_CST);
+      }
+      __kmpc_barrier(loc, threadId);
+    }
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Support for dispatch next
+
+  static uint64_t NextIter() {
+    __kmpc_impl_lanemask_t active = mapping::activemask();
+    uint32_t leader = utils::ffs(active) - 1;
+    uint32_t change = utils::popc(active);
+    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
+    unsigned int rank = utils::popc(active & lane_mask_lt);
+    uint64_t warp_res;
+    if (rank == 0) {
+      warp_res = atomic::add(&Cnt, change, __ATOMIC_SEQ_CST);
+    }
+    warp_res = utils::shuffle(active, warp_res, leader);
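+    // Only the warp leader (rank 0) executed the atomic add; its old value is
+    // broadcast to every active lane via the shuffle, and adding each lane's
+    // rank hands out consecutive iteration numbers at the cost of a single
+    // atomic per warp.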
+    return warp_res + rank;
+  }
+
+  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
+                              T loopUpperBound) {
+    T N = NextIter();
+    lb = loopLowerBound + N * chunkSize;
+    ub = lb + chunkSize - 1; // Clang uses i <= ub
+
+    // 3 result cases:
+    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
+    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
+    //     LAST_CHUNK (still treated as not finished)
+    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
+    // a.
+    if (lb <= loopUpperBound && ub < loopUpperBound) {
+      return NOT_FINISHED;
+    }
+    // b.
+    if (lb <= loopUpperBound) {
+      ub = loopUpperBound;
+      return LAST_CHUNK;
+    }
+    // c. if we are here, we are in case 'c'
+    lb = loopUpperBound + 2;
+    ub = loopUpperBound + 1;
+    return FINISHED;
+  }
+
+  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
+                           T *plower, T *pupper, ST *pstride,
+                           DynamicScheduleTracker *DST) {
+    // ID of a thread in its own warp
+
+    // automatically selects thread or warp ID based on selected implementation
+    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
+            "current thread is not needed here; error");
+    // retrieve schedule
+    kmp_sched_t schedule = DST->ScheduleType;
+
+    // xxx reduce to one
+    if (schedule == kmp_sched_static_chunk ||
+        schedule == kmp_sched_static_nochunk) {
+      T myLb = DST->NextLowerBound;
+      T ub = DST->LoopUpperBound;
+      // finished?
+      if (myLb > ub) {
+        return DISPATCH_FINISHED;
+      }
+      // not finished, save current bounds
+      ST chunk = DST->Chunk;
+      *plower = myLb;
+      T myUb = myLb + chunk - 1; // Clang uses i <= ub
+      if (myUb > ub)
+        myUb = ub;
+      *pupper = myUb;
+      *plast = (int32_t)(myUb == ub);
+
+      // increment next lower bound by the stride
+      ST stride = DST->Stride;
+      DST->NextLowerBound = myLb + stride;
+      return DISPATCH_NOTFINISHED;
+    }
+    ASSERT0(LT_FUSSY,
+            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+            "bad sched");
+    T myLb, myUb;
+    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
+                                    DST->LoopUpperBound);
+
+    if (finished == FINISHED)
+      return DISPATCH_FINISHED;
+
+    // not finished (either not finished or last chunk)
+    *plast = (int32_t)(finished == LAST_CHUNK);
+    *plower = myLb;
+    *pupper = myUb;
+    *pstride = 1;
+
+    return DISPATCH_NOTFINISHED;
+  }
+
+  static void dispatch_fini() {
+    // nothing
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // end of template class that encapsulates all the helper functions
+  //////////////////////////////////////////////////////////////////////////////
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (dyn loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// TODO: This is a stopgap. We probably want to expand the dispatch API to take
+// a DST pointer which can then be allocated properly without malloc.
+DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);
+
+// Create a new DST, link the current one, and define the new as current.
+static DynamicScheduleTracker *pushDST() {
+  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
+      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
+  *NewDST = DynamicScheduleTracker({0});
+  NewDST->NextDST = ThreadDSTPtr;
+  ThreadDSTPtr = NewDST;
+  return ThreadDSTPtr;
+}
+
+// Return the current DST.
+static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }
+
+// Pop the current DST and restore the last one.
+static void popDST() {
+  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
+  memory::freeGlobal(ThreadDSTPtr, "remove DST");
+  ThreadDSTPtr = OldDST;
+}
+
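+// The __kmpc_dispatch_* entry points below instantiate the template above for
+// each induction-variable type (signed/unsigned, 32/64 bit) and manage the
+// per-thread DST stack: *_init pushes a tracker, *_next peeks at it, and
+// *_fini pops it.
+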
+extern "C" {
+
+// init
+void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
+                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
+                             uint32_t lb, uint32_t ub, int32_t st,
+                             int32_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
+                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
+                             uint64_t lb, uint64_t ub, int64_t st,
+                             int64_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+// next
+int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
+                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
+                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
+                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
+                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+// fini
+void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+  popDST();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
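+// Each entry point below forwards to for_static_init with the matching
+// template instantiation; the incr argument is accepted for interface
+// compatibility but is not used by this implementation.
+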
+void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
+                              int32_t schedtype, int32_t *plastiter,
+                              int32_t *plower, int32_t *pupper,
+                              int32_t *pstride, int32_t incr, int32_t chunk) {
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
+                               int32_t schedtype, int32_t *plastiter,
+                               uint32_t *plower, uint32_t *pupper,
+                               int32_t *pstride, int32_t incr, int32_t chunk) {
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
+                              int32_t schedtype, int32_t *plastiter,
+                              int64_t *plower, int64_t *pupper,
+                              int64_t *pstride, int64_t incr, int64_t chunk) {
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
+                               int32_t schedtype, int32_t *plastiter,
+                               uint64_t *plower, uint64_t *pupper,
+                               int64_t *pstride, int64_t incr, int64_t chunk) {
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
+}
+
+#pragma omp end declare target