Index: CMakeLists.txt =================================================================== --- CMakeLists.txt +++ CMakeLists.txt @@ -1,2 +1,3 @@ cmake_minimum_required(VERSION 2.8 FATAL_ERROR) add_subdirectory(runtime) +add_subdirectory(libomptarget) Index: libomptarget/Build_With_CMake.txt =================================================================== --- /dev/null +++ libomptarget/Build_With_CMake.txt @@ -0,0 +1,127 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +===================================================================== +How to Build the LLVM* OpenMP* Offloading Runtime Library using CMake +===================================================================== + +==== Version of CMake required: v2.8.0 or above ==== + +============================================ +How to call cmake initially, then repeatedly +============================================ +- When calling cmake for the first time, all needed compiler options + must be specified on the command line. After this initial call to + cmake, the compiler definitions must not be included for further calls + to cmake. Other options can be specified on the command line multiple + times, including all definitions in the Build options section below. +- Example of configuring, building, reconfiguring, rebuilding: + $ mkdir build + $ cd build + $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. # Initial configuration + $ make + ... + $ make clean + $ cmake -DCMAKE_BUILD_TYPE=Debug .. # Second configuration + $ make + ... + $ rm -rf * + $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. # Third configuration + $ make +- Notice in the example how the compiler definitions are only specified + for an empty build directory, but other Build options can be used at any time. +- The file CMakeCache.txt, which is created after the first call to cmake, + is a configuration file that holds all the values for the Build options. + These configuration values can be changed using a text editor to modify + CMakeCache.txt as opposed to using definitions on the command line. +- To have cmake create a particular type of build generator file, simply + include the -G option: + $ cmake -G "Unix Makefiles" ... + You can see a list of the generators cmake supports by executing cmake with + no arguments. + +===================== +Instructions to Build +===================== + $ cd libomptarget_top_level/ [ directory with plugins/ , deviceRTLs/ , etc. ] + $ mkdir build + $ cd build + + [ Unix* Libraries ] + $ cmake -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= .. + + $ make + $ make install + +=========== +Tests +=========== +After the library has been built, there are optional tests that can be +performed. Some will be skipped based upon the platform. +To run the tests: +$ make check-libomptarget + +============= +CMake options +============= +-DCMAKE_C_COMPILER= +Specify the C compiler + +-DCMAKE_CXX_COMPILER= +Specify the C++ compiler + +==== First values listed are the default value ==== +-DCMAKE_BUILD_TYPE=Release|Debug|RelWithDebInfo +Build type can be Release, Debug, or RelWithDebInfo. + +-DLIBOMPTARGET_ENABLE_WERROR=true|false +Whether to treat warnings as errors.
+ +-DLIBOMPTARGET_LLVM_LIT_EXECUTABLE="" +Full path to the llvm-lit tool. Required for testing in out-of-tree builds. + +-DLIBOMPTARGET_FILECHECK_EXECUTABLE="" +Full path to the FileCheck tool. Required for testing in out-of-tree builds. + +-DLIBOMPTARGET_OPENMP_HEADER_FOLDER="" +Path of the folder that contains omp.h. This is required for testing +out-of-tree builds. + +-DLIBOMPTARGET_OPENMP_HOST_RTL_FOLDER="" +Path of the folder that contains libomp.so. This is required for testing +out-of-tree builds. + +==== NVPTX device RTL specific ==== +-DLIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER="" +Host compiler to use with NVCC. This compiler is not going to be used to produce +any binary. Instead, this is used to overcome the input compiler checks done by +NVCC. E.g. if using a default host compiler that is not compatible with NVCC, +this option can be used to pass a valid compiler to NVCC and avoid the error. + +-DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY="35" +Comma-separated list of CUDA compute capabilities that should be supported by +the NVPTX device RTL. E.g. for compute capabilities 3.0 and 3.5, the option +"30,35" should be used. + +======================= +Example usages of CMake +======================= +---- Typical usage ---- +cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. +cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. + +---- Request an NVPTX runtime library that supports compute capability 5.0 ---- +cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY="50" + +========= +Footnotes +========= +[*] Other names and brands may be claimed as the property of others. Index: libomptarget/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/CMakeLists.txt @@ -0,0 +1,124 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build offloading library libomptarget.so. +# +##===----------------------------------------------------------------------===## + +# CMAKE libomptarget +cmake_minimum_required(VERSION 2.8 FATAL_ERROR) + +# Add cmake directory to search for custom cmake functions. +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH}) + +# Standalone build or part of LLVM? +set(LIBOMPTARGET_STANDALONE_BUILD FALSE) +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}" OR + "${CMAKE_SOURCE_DIR}/libomptarget" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + project(libomptarget C CXX) + set(LIBOMPTARGET_STANDALONE_BUILD TRUE) +endif() + + +if(${LIBOMPTARGET_STANDALONE_BUILD}) + set(LIBOMPTARGET_ENABLE_WERROR FALSE CACHE BOOL + "Enable -Werror flags to turn warnings into errors for supporting compilers.") + # CMAKE_BUILD_TYPE was not defined, set default to Release + if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) + endif() +else() + set(LIBOMPTARGET_ENABLE_WERROR ${LLVM_ENABLE_WERROR}) +endif() + +# Compiler flag checks. +include(config-ix) + +# Message utilities. +include(LibomptargetUtils) + +# Get dependencies for the different components of the project. +include(LibomptargetGetDependencies) + +# This is a list of all the targets that are supported/tested right now.
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda") + +# Once the plugins for the different targets are validated, they will be added to +# the list of supported targets in the current system. +set (LIBOMPTARGET_SYSTEM_TARGETS "") + +# Set base directories - required for lit to locate the tests. +set(LIBOMPTARGET_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(LIBOMPTARGET_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + +# We need C++11 support. +if(LIBOMPTARGET_HAVE_STD_CPP11_FLAG) + + # We can only build the offloading library if libelf is available. + if(LIBOMPTARGET_DEP_LIBELF_FOUND) + + libomptarget_say("Building offloading runtime library libomptarget.") + + # Add libelf include directories. + include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) + + # Enable support for C++11. + add_definitions(-std=c++11) + + if(LIBOMPTARGET_ENABLE_WERROR AND LIBOMPTARGET_HAVE_WERROR_FLAG) + add_definitions(-Werror) + endif() + + # If building this library in debug mode, we define a macro to enable + # dumping progress messages at runtime. + string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE) + if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) + add_definitions(-DOMPTARGET_DEBUG) + add_definitions(-g) + add_definitions(-O0) + endif() + + set(src_files + src/omptarget.cpp + src/targets_info.cpp + ) + + include_directories(src/) + + # Build libomptarget library with libelf and libdl dependencies. + add_library(omptarget SHARED ${src_files}) + target_link_libraries(omptarget + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + dl + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") + + # Retrieve the path to the resulting library so that it can be used for + # testing. + get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY) + if(NOT LIBOMPTARGET_LIBRARY_DIR) + set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + endif() + + # Build offloading plugins and the device RTL libraries if they are available. + add_subdirectory(plugins) + add_subdirectory(deviceRTLs) + + # Add tests. 
+ add_subdirectory(test) + + else(LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building offloading runtime library libomptarget: libelf dependency wasn't found.") + endif(LIBOMPTARGET_DEP_LIBELF_FOUND) + +else(LIBOMPTARGET_HAVE_STD_CPP11_FLAG) + libomptarget_say("Not building offloading runtime library libomptarget: host compiler must have c++11 support.") +endif(LIBOMPTARGET_HAVE_STD_CPP11_FLAG) Index: libomptarget/README.txt =================================================================== --- /dev/null +++ libomptarget/README.txt @@ -0,0 +1,72 @@ + + README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget) + ====================================================================== + +How to Build the LLVM* OpenMP* Offloading Runtime Library (libomptarget) +======================================================================== +In-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) into llvm/projects +$ cd where-you-want-to-build +$ mkdir build && cd build +$ cmake path/to/llvm -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= +$ make omptarget + +Out-of-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) +$ cd where-you-want-to-live/openmp/libomptarget +$ mkdir build && cd build +$ cmake path/to/openmp -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= +$ make + +For details about building, please look at Build_With_CMake.txt. + +Architectures Supported +======================= +The current library has only been tested on the Linux operating system, with the +following host architectures: +* Intel(R) 64 architecture +* IBM(R) Power architecture (big endian) +* IBM(R) Power architecture (little endian) + +The currently supported offloading device architectures are: +* Intel(R) 64 architecture (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (big endian) (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) +* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures + +Supported RTL Build Configurations +================================== +Supported Architectures: Intel(R) 64, IBM(R) Power 7 and Power 8 + + --------------------------- + | gcc | clang | +--------------|------------|------------| +| Linux* OS | Yes(1) | Yes(2) | +----------------------------------------- + +(1) gcc version 4.8.2 or later is supported. +(2) clang version 3.7 or later is supported. + + +Front-end Compilers that work with this RTL +=========================================== + +The following compilers are known to generate code compatible with +this RTL: + - clang (from the OpenMP development branch at http://clang-omp.github.io/ ) + - clang (development branch at http://clang.llvm.org - several features still + under development) + +----------------------------------------------------------------------- + +Notices +======= +This library and the related compiler support are still under development, so the +employed interface is likely to change in the future. + +*Other names and brands may be claimed as the property of others.
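For reference, the kind of program this runtime serves is a host application that contains OpenMP target regions. The snippet below is an illustrative sketch only (standard OpenMP 4.0 C, not taken from this patch): the front ends listed above outline the target region and emit the libomptarget calls that map the data and launch the region on the default device, falling back to the host when no device plugin is available.

  #include <stdio.h>

  int main(void) {
    int sum = 0;
    /* The front end outlines this region and generates calls into
       libomptarget that map 'sum' to the default device and run the
       region there (or on the host if no device is available). */
    #pragma omp target map(tofrom: sum)
    for (int i = 0; i < 100; ++i)
      sum += i;
    printf("sum = %d\n", sum);
    return 0;
  }

The exact compiler invocation and offloading flags depend on the clang branch being used and are not prescribed here.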
+ Index: libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake =================================================================== --- /dev/null +++ libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake @@ -0,0 +1,114 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# Try to detect in the system several dependencies required by the different +# components of libomptarget. These are the dependencies we have: +# +# libelf : required by some targets to handle the ELF files at runtime. +# libffi : required to launch target kernels given function and argument +# pointers. +# CUDA : required to control offloading to NVIDIA GPUs. + +include (FindPackageHandleStandardArgs) + +################################################################################ +# Looking for libelf... +################################################################################ + +find_path ( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR + NAMES + libelf.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH + PATH_SUFFIXES + libelf) + +find_library ( + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + NAMES + elf + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBELF + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBELF_LIBRARIES) + +################################################################################ +# Looking for libffi... +################################################################################ + +find_path ( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR + NAMES + ffi.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH + PATH_SUFFIXES + libffi) + +find_library ( + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + NAMES + ffi + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBFFI + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES) + +################################################################################ +# Looking for CUDA... 
+################################################################################ +find_package(CUDA QUIET) + +set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND}) +set(LIBOMPTARGET_DEP_CUDA_LIBRARIES ${CUDA_LIBRARIES}) +set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) + +mark_as_advanced( + LIBOMPTARGET_DEP_CUDA_FOUND + LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS + LIBOMPTARGET_DEP_CUDA_LIBRARIES) Index: libomptarget/cmake/Modules/LibomptargetUtils.cmake =================================================================== --- /dev/null +++ libomptarget/cmake/Modules/LibomptargetUtils.cmake @@ -0,0 +1,28 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# void libomptarget_say(string message_to_user); +# - prints out message_to_user +macro(libomptarget_say message_to_user) + message(STATUS "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_warning_say(string message_to_user); +# - prints out message_to_user with a warning +macro(libomptarget_warning_say message_to_user) + message(WARNING "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_error_say(string message_to_user); +# - prints out message_to_user with an error and exits cmake +macro(libomptarget_error_say message_to_user) + message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}") +endmacro() Index: libomptarget/cmake/Modules/config-ix.cmake =================================================================== --- /dev/null +++ libomptarget/cmake/Modules/config-ix.cmake @@ -0,0 +1,17 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +include(CheckCCompilerFlag) +include(CheckCXXCompilerFlag) + +# Checking C, CXX +check_cxx_compiler_flag(-std=c++11 LIBOMPTARGET_HAVE_STD_CPP11_FLAG) +check_c_compiler_flag(-Werror LIBOMPTARGET_HAVE_WERROR_FLAG) Index: libomptarget/deviceRTLs/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/CMakeLists.txt @@ -0,0 +1,14 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +# ##===----------------------------------------------------------------------===## +# +# Build a device RTL for each available machine available. +# +##===----------------------------------------------------------------------===## + +add_subdirectory(nvptx) Index: libomptarget/deviceRTLs/nvptx/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -0,0 +1,90 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. 
+# +##===----------------------------------------------------------------------===## +# +# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available +# +##===----------------------------------------------------------------------===## + +set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING + "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.") + +if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER) + find_program(CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER}) + if(NOT CUDA_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.") + endif() +endif() + +# We can't use clang as the NVCC host compiler, so we attempt to replace it with +# gcc. +if(CUDA_HOST_COMPILER MATCHES "(clang)|(.*/clang)$") + + find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc) + + if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.") + libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of a valid compiler.") + return() + endif() + set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}") +endif() + +if(LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_say("Building CUDA offloading device RTL.") + + # We really don't have any host code, so we don't need to care about + # propagating host flags. + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + + set(cuda_src_files + src/cancel.cu + src/critical.cu + src/debug.cu + src/libcall.cu + src/loop.cu + src/omptarget-nvptx.cu + src/parallel.cu + src/reduction.cu + src/stdio.cu + src/sync.cu + src/task.cu + ) + + include_directories( + src/ + ) + + # Get all the compute capabilities the user requested or use SM_35 by default. + if(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY) + string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) + foreach(sm ${nvptx_sm_list}) + set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm}) + endforeach() + else() + set(CUDA_ARCH -arch sm_35) + endif() + + # Activate RTL message dumps if requested by the user. + if(LIBOMPTARGET_NVPTX_DEBUG) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v) + endif() + + # NVPTX runtime library has to be statically linked. Dynamic linking is not + # yet supported by the CUDA toolchain on the device. + set(BUILD_SHARED_LIBS OFF) + set(CUDA_SEPARABLE_COMPILATION ON) + + cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} + OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) + + target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) + +else() + libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.") +endif() Index: libomptarget/deviceRTLs/nvptx/src/cancel.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/cancel.cu @@ -0,0 +1,34 @@ +//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to be used in the implementation of OpenMP cancel.
+// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +EXTERN int32_t __kmpc_cancellationpoint( + kmp_Indent* loc, + int32_t global_tid, + int32_t cancelVal) +{ + PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal); + // disabled + return FALSE; +} + +EXTERN int32_t __kmpc_cancel( + kmp_Indent* loc, + int32_t global_tid, + int32_t cancelVal) +{ + PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal); + // disabled + return FALSE; +} Index: libomptarget/deviceRTLs/nvptx/src/counter_group.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/counter_group.h @@ -0,0 +1,56 @@ +//===------ counter_group.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to implement OpenMP loop scheduling +// +//===----------------------------------------------------------------------===// + +#ifndef SRC_COUNTER_GROUP_H_ +#define SRC_COUNTER_GROUP_H_ + +#include +#include + +#include + +// counter group type for synchronizations +class omptarget_nvptx_CounterGroup +{ +public: + // getters and setters + INLINE Counter & Event () { return v_event; } + INLINE volatile Counter & Start () { return v_start; } + INLINE Counter & Init () { return v_init; } + + // Synchronization Interface + + INLINE void Clear(); // first time start=event + INLINE void Reset(); // init = first + INLINE void Init(Counter & priv); // priv = init + INLINE Counter Next(); // just counts number of events + + // set priv to n, to be used in later waitOrRelease + INLINE void Complete(Counter & priv, Counter n); + + // check priv and decide if we have to wait or can free the other warps + INLINE void Release(Counter priv, Counter current_event_value); + INLINE void WaitOrRelease(Counter priv, Counter current_event_value); + +private: + Counter v_event; // counter of events (atomic) + + // volatile is needed to force loads to read from global + // memory or L2 cache and see the write by the last master + volatile Counter v_start; // signal when events registered are finished + + Counter v_init; // used to initialize local thread variables +}; + + +#endif /* SRC_COUNTER_GROUP_H_ */ Index: libomptarget/deviceRTLs/nvptx/src/counter_groupi.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/counter_groupi.h @@ -0,0 +1,89 @@ +//===----- counter_groupi.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// Interface implementation for OpenMP loop scheduling +// +//===----------------------------------------------------------------------===// + +#include +#include "option.h" + +INLINE void omptarget_nvptx_CounterGroup::Clear() +{ + PRINT0(LD_SYNCD, "clear counters\n") + v_event = 0; + v_start = 0; + // v_init does not need to be reset (its value is dead) +} + +INLINE void omptarget_nvptx_CounterGroup::Reset() +{ + // done by master before entering parallel + ASSERT(LT_FUSSY, v_event==v_start, + "error, entry %lld !=start %lld at reset\n", P64(v_event), P64(v_start)); + v_init = v_start; +} + +INLINE void omptarget_nvptx_CounterGroup::Init(Counter & priv) +{ + PRINT(LD_SYNCD, "init priv counter 0x%llx with val %lld\n", + P64(&priv), P64(v_start)); + priv = v_start; +} + +// just counts number of events +INLINE Counter omptarget_nvptx_CounterGroup::Next() +{ + Counter oldVal = atomicAdd(&v_event, (Counter) 1); + PRINT(LD_SYNCD, "next event counter 0x%llx with val %lld->%lld\n", + P64(&v_event), P64(oldVal), P64(oldVal+1)); + + return oldVal; +} + +//set priv to n, to be used in later waitOrRelease +INLINE void omptarget_nvptx_CounterGroup::Complete(Counter & priv, Counter n) +{ + PRINT(LD_SYNCD, "complete priv counter 0x%llx with val %lld->%lld (+%d)\n", + P64(&priv), P64(priv), P64(priv+n), n); + priv += n; +} + +INLINE void omptarget_nvptx_CounterGroup::Release ( + Counter priv, + Counter current_event_value) +{ + if (priv - 1 == current_event_value) { + PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n", + P64(&v_start), P64(v_start), P64(priv)); + v_start = priv; + } +} + +// check priv and decide if we have to wait or can free the other warps +INLINE void omptarget_nvptx_CounterGroup::WaitOrRelease ( + Counter priv, + Counter current_event_value) +{ + if (priv - 1 == current_event_value) { + PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n", + P64(&v_start), P64(v_start), P64(priv)); + v_start = priv; + } else { + PRINT(LD_SYNCD, "Start waiting while start counter 0x%llx with val %lld < %lld\n", + P64(&v_start), P64(v_start), P64(priv)); + while (priv > v_start) { + // IDLE LOOP + // start is volatile: it will be re-loaded at each while loop + } + PRINT(LD_SYNCD, "Done waiting as start counter 0x%llx with val %lld >= %lld\n", + P64(&v_start), P64(v_start), P64(priv)); + } +} + Index: libomptarget/deviceRTLs/nvptx/src/critical.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/critical.cu @@ -0,0 +1,40 @@ +//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of critical with KMPC interface +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +EXTERN +void __kmpc_critical(kmp_Indent *loc, int32_t global_tid, kmp_CriticalName *lck) +{ + PRINT0(LD_IO, "call to kmpc_critical()\n"); + omptarget_nvptx_TeamDescr & teamDescr = getMyTeamDescriptor(); + omp_set_lock(teamDescr.CriticalLock()); +} + +EXTERN +void __kmpc_end_critical( kmp_Indent *loc, int32_t global_tid, kmp_CriticalName *lck ) +{ + PRINT0(LD_IO, "call to kmpc_end_critical()\n"); + omptarget_nvptx_TeamDescr & teamDescr = getMyTeamDescriptor(); + omp_unset_lock(teamDescr.CriticalLock()); +} + + + + + + + Index: libomptarget/deviceRTLs/nvptx/src/debug.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/debug.h @@ -0,0 +1,240 @@ +//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// Usage guide +// +// PRINT0(flag, str) : if debug flag is on, print (no arguments) +// PRINT(flag, str, args) : if debug flag is on, print (arguments) +// DON(flag) : return true if debug flag is on +// +// ASSERT(flag, cond, str, args): if test flag is on, test the condition +// if the condition is false, print str+args +// and assert. 
+// CAUTION: cond may be evaluate twice +// AON(flag) : return true if test flag is on +// +// WARNING(flag, str, args) : if warning flag is on, print the warning +// WON(flag) : return true if warning flag is on +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_NVPTX_DEBUG_H_ +#define _OMPTARGET_NVPTX_DEBUG_H_ + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of debugging +//////////////////////////////////////////////////////////////////////////////// + +#define LD_SET_NONE 0ULL /* none */ +#define LD_SET_ALL -1ULL /* all */ + +// pos 1 +#define LD_SET_LOOP 0x1ULL /* basic loop */ +#define LD_SET_LOOPD 0x2ULL /* basic loop */ +#define LD_SET_PAR 0x4ULL /* basic parallel */ +#define LD_SET_PARD 0x8ULL /* basic parallel */ + +// pos 2 +#define LD_SET_SYNC 0x10ULL /* sync info */ +#define LD_SET_SYNCD 0x20ULL /* sync info */ +#define LD_SET_WAIT 0x40ULL /* state when waiting */ +#define LD_SET_TASK 0x80ULL /* print task info (high level) */ + +// pos 3 +#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */ +#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */ +#define LD_SET_ENV 0x400ULL /* env info */ +#define LD_SET_CANCEL 0x800ULL /* print cancel info */ + +// pos 4 +#define LD_SET_MEM 0x1000ULL /* malloc / free */ + + + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags to print selected output + +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save on cuda buffer +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR) + +#ifndef OMPTARGET_NVPTX_DEBUG + #define OMPTARGET_NVPTX_DEBUG LD_SET_NONE +#elif OMPTARGET_NVPTX_DEBUG + #warning debug is used, not good for measurements +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of asserts +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LT_SET_NONE 0x0 /* unsafe */ +#define LT_SET_SAFETY 0x1 /* check malloc type of stuff, input at creation, cheap */ +#define LT_SET_INPUT 0x2 /* check also all runtime inputs */ +#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */ + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#ifndef OMPTARGET_NVPTX_TEST + #if OMPTARGET_NVPTX_DEBUG + #define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY) + #else + #define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY) + #endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of warnings +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LW_SET_ALL -1 +#define LW_SET_NONE 0x0 +#define LW_SET_ENV 0x1 +#define LW_SET_INPUT 0x2 +#define LW_SET_FUSSY 0x4 + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#if OMPTARGET_NVPTX_DEBUG + #define OMPTARGET_NVPTX_WARNING (LW_SET_NONE) +#else + #define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY) +#endif + + 
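As a usage sketch (illustrative only, not part of this patch): the RTL sources combine the flag sets above with the macros described in the usage guide, e.g. cancel.cu calls PRINT(LD_IO, ...) and counter_groupi.h calls ASSERT(LT_FUSSY, ...). A hypothetical device-side helper could look as follows; 'exampleDispatch' and its arguments are invented for illustration, and INLINE is the same helper macro already used by counter_group.h.

  // Illustrative only: typical use of the debug, assert and warning macros.
  INLINE void exampleDispatch(int32_t tid, int64_t chunk) {
    PRINT0(LD_IO, "enter example dispatch\n");
    PRINT(LD_LOOP, "tid %d requested chunk %lld\n", tid, P64(chunk));
    ASSERT(LT_INPUT, chunk > 0, "invalid chunk %lld\n", P64(chunk));
    if (chunk > 1024) {
      WARNING(LW_INPUT, "large chunk %lld may limit parallelism\n", P64(chunk));
    }
  }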
+//////////////////////////////////////////////////////////////////////////////// +// implemtation for debug +//////////////////////////////////////////////////////////////////////////////// + +#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING + #include +#endif +#if OMPTARGET_NVPTX_TEST + #include +#endif + +// set flags that are tested (inclusion properties) + +#define LD_ALL (LD_SET_ALL) + +#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD) +#define LD_LOOPD (LD_SET_LOOPD) +#define LD_PAR (LD_SET_PAR | LD_SET_PARD) +#define LD_PARD (LD_SET_PARD) + +// pos 2 +#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD) +#define LD_SYNCD (LD_SET_SYNCD) +#define LD_WAIT (LD_SET_WAIT) +#define LD_TASK (LD_SET_TASK) + +// pos 3 +#define LD_IO (LD_SET_IO | LD_SET_IOD) +#define LD_IOD (LD_SET_IOD) +#define LD_ENV (LD_SET_ENV) +#define LD_CANCEL (LD_SET_CANCEL) + +// pos 3 +#define LD_MEM (LD_SET_MEM) + +// implement +#if OMPTARGET_NVPTX_DEBUG + + #define DON(_flag) ((OMPTARGET_NVPTX_DEBUG) & (_flag)) + + #define PRINT0(_flag, _str) { if (DON(_flag)) { \ + printf(": " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F); }} + + #define PRINT(_flag, _str, _args...) { if (DON(_flag)) { \ + printf(": " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F, _args); }} +#else + + #define DON(_flag) (FALSE) + #define PRINT0(flag, str) + #define PRINT(flag, str, _args...) + +#endif + +// for printing without worring about precision, pointers... +#define P64(_x) ((unsigned long long)(_x)) + +//////////////////////////////////////////////////////////////////////////////// +// early defs for test +//////////////////////////////////////////////////////////////////////////////// + +#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY) +#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) +#define LT_FUSSY (LT_SET_FUSSY) + +#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY + + #define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) + #define ASSERT0(_flag, _cond, _str) { if (TON(_flag)) {assert(_cond);}} + #define ASSERT(_flag, _cond, _str, _args...) { if (TON(_flag)) {assert(_cond);}} + +#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT + + #define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) + #define ASSERT0(_flag, _cond, _str) { if (TON(_flag) && !(_cond)) { \ + printf(" ASSERT: " _str "\n", blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F); assert(_cond);}} + #define ASSERT(_flag, _cond, _str, _args...) { if (TON(_flag) && !(_cond)) { \ + printf(" ASSERT: " _str "\n", blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F, _args); assert(_cond);}} + +#else + + #define TON(_flag) (FALSE) + #define ASSERT0(_flag, _cond, _str) + #define ASSERT(_flag, _cond, _str, _args...) + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// early defs for warning + +#define LW_ALL (LW_SET_ALL) +#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV) +#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) +#define LW_FUSSY (LW_SET_FUSSY) + +#if OMPTARGET_NVPTX_WARNING + + #define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) + #define WARNING0(_flag, _str) { if (WON(_flag)) { \ + printf(" WARNING: " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F); }} + #define WARNING(_flag, _str, _args...) 
{ if (WON(_flag)) { \ + printf(" WARNING: " _str, blockIdx.x, threadIdx.x, \ + threadIdx.x / warpSize, threadIdx.x & 0x1F, _args); }} + +#else + + #define WON(_flag) (FALSE) + #define WARNING0(_flag, _str) + #define WARNING(_flag, _str, _args...) + +#endif + + +#endif Index: libomptarget/deviceRTLs/nvptx/src/debug.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/debug.cu @@ -0,0 +1,78 @@ +//===------------ debug.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of debug utilities to be +// used in the application. +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// print current state +//////////////////////////////////////////////////////////////////////////////// + +NOINLINE void PrintTaskDescr(omptarget_nvptx_TaskDescr *taskDescr, char *title, int level) +{ + omp_sched_t sched = taskDescr->GetRuntimeSched(); + PRINT(LD_ALL, "task descr %s %d: %s, in par %d, dyn %d, rt sched %d, chunk %lld;" + " tid %d, tnum %d, nthreads %d\n", + title, level, (taskDescr->IsParallelConstruct()?"par":"task"), + taskDescr->InParallelRegion(), taskDescr->IsDynamic(), + sched, taskDescr->RuntimeChunkSize(), + taskDescr->ThreadId(), taskDescr->ThreadsInTeam(), taskDescr->NThreads()); +} + +//////////////////////////////////////////////////////////////////////////////// +// debug for compiler (should eventually all vanish) +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_print_str(char *title) +{ + PRINT(LD_ALL, " %s\n", title); +} + +EXTERN void __kmpc_print_title_int(char *title, int data) +{ + PRINT(LD_ALL, "%s val=%d\n", title, data); +} + +EXTERN void __kmpc_print_index(char *title, int i) +{ + PRINT(LD_ALL, "i = %d\n", i); +} + +EXTERN void __kmpc_print_int(int data) +{ + PRINT(LD_ALL, "val=%d\n", data); +} + +EXTERN void __kmpc_print_double(double data) +{ + PRINT(LD_ALL, "val=%lf\n", data); +} + +EXTERN void __kmpc_print_address_int64(int64_t data) +{ + PRINT(LD_ALL, "val=%016llx\n", data); +} + +//////////////////////////////////////////////////////////////////////////////// +// substitute for printf in kernel (should vanish) +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_kernel_print(char *title) +{ + PRINT(LD_ALL, " %s\n", title); +} + +EXTERN void __kmpc_kernel_print_int8(char *title, int64_t data) +{ + PRINT(LD_ALL, " %s val=%lld\n", title, data); +} Index: libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -0,0 +1,658 @@ +//===------- interface.h - NVPTX OpenMP interface definitions ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// This file contains all the definitions that are relevant to +// the interface. The first section contains the interface as +// declared by OpenMP. A second section includes library private calls +// (mostly debug, temporary?) The third section includes the compiler +// specific interfaces. +// +//===----------------------------------------------------------------------===// + +#ifndef _INTERFACES_H_ +#define _INTERFACES_H_ + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP interface +//////////////////////////////////////////////////////////////////////////////// + +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ +typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ + +typedef enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +} omp_sched_t; + +typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + + +EXTERN double omp_get_wtick(void); +EXTERN double omp_get_wtime(void); + +EXTERN void omp_set_num_threads(int num); +EXTERN int omp_get_num_threads(void); +EXTERN int omp_get_max_threads(void); +EXTERN int omp_get_thread_limit(void); +EXTERN int omp_get_thread_num(void); +EXTERN int omp_get_num_procs(void); +EXTERN int omp_in_parallel(void); +EXTERN int omp_in_final(void); +EXTERN void omp_set_dynamic(int flag); +EXTERN int omp_get_dynamic(void); +EXTERN void omp_set_nested(int flag); +EXTERN int omp_get_nested(void); +EXTERN void omp_set_max_active_levels(int level); +EXTERN int omp_get_max_active_levels(void); +EXTERN int omp_get_level(void); +EXTERN int omp_get_active_level(void); +EXTERN int omp_get_ancestor_thread_num(int level); +EXTERN int omp_get_team_size(int level); + +EXTERN void omp_init_lock(omp_lock_t *lock); +EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_destroy_lock(omp_lock_t *lock); +EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_set_lock(omp_lock_t *lock); +EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_unset_lock(omp_lock_t *lock); +EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); +EXTERN int omp_test_lock(omp_lock_t *lock); +EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); + +EXTERN void omp_get_schedule(omp_sched_t * kind, int * modifier); +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier); +EXTERN omp_proc_bind_t omp_get_proc_bind(void); +EXTERN int omp_get_cancellation(void); +EXTERN void omp_set_default_device(int deviceId); +EXTERN int omp_get_default_device(void); +EXTERN int omp_get_num_devices(void); +EXTERN int omp_get_num_teams(void); +EXTERN int omp_get_team_num(void); +EXTERN int omp_is_initial_device(void); + + +//////////////////////////////////////////////////////////////////////////////// +// OMPTARGET_NVPTX private (debug / temportary?) 
interface +//////////////////////////////////////////////////////////////////////////////// + +// for debug +EXTERN void __kmpc_print_str(char *title); +EXTERN void __kmpc_print_title_int(char *title, int data); +EXTERN void __kmpc_print_index(char *title, int i); +EXTERN void __kmpc_print_int(int data); +EXTERN void __kmpc_print_double(double data); +EXTERN void __kmpc_print_address_int64(int64_t data); + +//////////////////////////////////////////////////////////////////////////////// +// file below is swiped from kmpc host interface +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// kmp specifc types +//////////////////////////////////////////////////////////////////////////////// + +typedef enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + kmp_sched_dynamic_ordered = 67, + kmp_sched_guided_ordered = 68, + kmp_sched_runtime_ordered = 69, + kmp_sched_auto_ordered = 70, + + kmp_sched_distr_static_chunk = 91, + kmp_sched_distr_static_nochunk = 92, + + kmp_sched_default = kmp_sched_static_nochunk, + kmp_sched_unordered_first = kmp_sched_static_chunk, + kmp_sched_unordered_last = kmp_sched_auto, + kmp_sched_ordered_first = kmp_sched_static_ordered, + kmp_sched_ordered_last = kmp_sched_auto_ordered, + kmp_sched_distribute_first = kmp_sched_distr_static_chunk, + kmp_sched_distribute_last = kmp_sched_distr_static_nochunk +} kmp_sched_t; + + +// parallel defs +typedef void kmp_Indent; +typedef void (* kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); +typedef void (* kmp_ReductFctPtr)(void *lhsData, void *rhsData); + +// task defs +typedef struct kmp_TaskDescr kmp_TaskDescr; +typedef int32_t (* kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr); +typedef struct kmp_TaskDescr { + void *sharedPointerTable; // ptr to a table of shared var ptrs + kmp_TaskFctPtr sub; // task subroutine + int32_t partId; // unused + kmp_TaskFctPtr destructors; // destructor of c++ first private +} kmp_TaskDescr; +// task dep defs +#define KMP_TASKDEP_IN 0x1u +#define KMP_TASKDEP_OUT 0x2u +typedef struct kmp_TaskDep_Public { + void *addr; + size_t len; + uint8_t flags; // bit 0: in, bit 1: out +} kmp_TaskDep_Public; + +// flags that interpret the interface part of tasking flags +#define KMP_TASK_IS_TIED 0x1 +#define KMP_TASK_FINAL 0x2 +#define KMP_TASK_MERGED_IF0 0x4 /* unused */ +#define KMP_TASK_DESTRUCTOR_THUNK 0x8 + +// flags for task setup return +#define KMP_CURRENT_TASK_NOT_SUSPENDED 0 +#define KMP_CURRENT_TASK_SUSPENDED 1 + +// sync defs +typedef int32_t kmp_CriticalName[8]; + + +//////////////////////////////////////////////////////////////////////////////// +// flags for kstate (all bits initially off) +//////////////////////////////////////////////////////////////////////////////// + +// first 2 bits used by kmp_Reduction (defined in kmp_reduction.cpp) +#define KMP_REDUCTION_MASK 0x3 +#define KMP_SKIP_NEXT_CALL 0x4 +#define KMP_SKIP_NEXT_CANCEL_BARRIER 0x8 + + +//////////////////////////////////////////////////////////////////////////////// +// data +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// external interface 
+//////////////////////////////////////////////////////////////////////////////// + +// query +EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_global_num_threads(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_bound_thread_num(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_bound_num_threads(kmp_Indent *loc); // missing +EXTERN int32_t __kmpc_in_parallel(kmp_Indent *loc); // missing + +// parallel +EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t global_tid, int32_t num_threads); +//aee ... not supported +//EXTERN void __kmpc_fork_call(kmp_Indent *loc, int32_t argc, kmp_ParFctPtr microtask, ...); +EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, uint32_t global_tid); + +// for static (no chunk or chunk) +EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, int32_t chunk); +EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, int32_t chunk); +EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, int64_t chunk); +EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, int64_t chunk); + +EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid); + +// for dynamic +EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int32_t lower, int32_t upper, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, uint32_t lower, uint32_t upper, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t global_tid, + int32_t sched, int64_t lower, int64_t upper, int64_t incr, + int64_t chunk); +EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t global_tid, + int32_t sched, uint64_t lower, uint64_t upper, int64_t incr, + int64_t chunk); + +EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, int32_t *plower, int32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, uint32_t *plower, uint32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, int64_t *plower, int64_t *pupper, int64_t *pstride); +EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t global_tid, + int32_t *plastiter, uint64_t *plower, uint64_t *pupper,int64_t *pstride); + +EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t global_tid); + +// reduction +EXTERN int32_t __kmpc_reduce41(kmp_Indent *loc, int32_t global_tid, + int32_t varNum, size_t reduceSize, void *reduceData, void *reduceArraySize, + kmp_ReductFctPtr *reductFct, kmp_CriticalName *lock); +EXTERN void __kmpc_end_reduce(kmp_Indent *loc, int32_t global_tid, + 
kmp_CriticalName *lock); +EXTERN int32_t __kmpc_reduce_nowait41(kmp_Indent *loc, + int32_t global_tid,int32_t varNum, size_t reduceSize, void *reduceData, + void *reduceArraySize, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lock); +EXTERN void __kmpc_end_reduce_nowait(kmp_Indent *loc, + int32_t global_tid, kmp_CriticalName *lock); + +// sync barrier +EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc, int32_t global_tid); + +// single +EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid); + +// sync +EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t global_tid); +EXTERN void __kmpc_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_flush(kmp_Indent *loc); + +// tasks +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Indent *loc, + uint32_t global_tid, int32_t flag, size_t sizeOfTaskInclPrivate, + size_t sizeOfSharedTable, + kmp_TaskFctPtr sub); +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr, + int32_t depNum, void * depList, int32_t noAliasDepNum, + void * noAliasDepList); +EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, + uint32_t global_tid, int32_t depNum, void * depList, + int32_t noAliasDepNum, void * noAliasDepList); +EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid); +EXTERN void __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid); + +// cancel +EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent* loc, int32_t global_tid, + int32_t cancelVal); +EXTERN int32_t __kmpc_cancel(kmp_Indent* loc, int32_t global_tid, + int32_t cancelVal); + +// target (no target call here) + +// atomic +//EXTERN void __array_atomicfixed4_add(kmp_Indent *loc, int gtid, int32_t *addr1, int32_t *val, int64_t n); +EXTERN void __kmpc_atomic_fixed1_wr (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_add (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_sub (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_sub_rev (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_mul (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_div (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_div_rev (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1u_div (kmp_Indent *loc, int gtid, uint8_t *addr, uint8_t val) ; +EXTERN void __kmpc_atomic_fixed1u_div_rev (kmp_Indent *loc, int gtid, uint8_t *addr, uint8_t val) ; +EXTERN void __kmpc_atomic_fixed1_min (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_max (kmp_Indent *loc, int gtid, int8_t 
*addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_andb (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_orb (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_xor (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_andl (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_orl (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_eqv (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_neqv (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_shl (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_shl_rev (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_shr (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1_shr_rev (kmp_Indent *loc, int gtid, int8_t *addr, int8_t val) ; +EXTERN void __kmpc_atomic_fixed1u_shr (kmp_Indent *loc, int gtid, uint8_t *addr, uint8_t val) ; +EXTERN void __kmpc_atomic_fixed1u_shr_rev (kmp_Indent *loc, int gtid, uint8_t *addr, uint8_t val) ; +EXTERN void __kmpc_atomic_fixed2_wr (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_add (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_sub (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_sub_rev (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_mul (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_div (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_div_rev (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2u_div (kmp_Indent *loc, int gtid, uint16_t *addr, uint16_t val) ; +EXTERN void __kmpc_atomic_fixed2u_div_rev (kmp_Indent *loc, int gtid, uint16_t *addr, uint16_t val) ; +EXTERN void __kmpc_atomic_fixed2_min (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_max (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_andb (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_orb (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_xor (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_andl (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_orl (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_eqv (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_neqv (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_shl (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_shl_rev (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_shr (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2_shr_rev (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed2u_shr (kmp_Indent *loc, int gtid, uint16_t *addr, uint16_t val) ; +EXTERN void __kmpc_atomic_fixed2u_shr_rev 
(kmp_Indent *loc, int gtid, uint16_t *addr, uint16_t val) ; +EXTERN void __kmpc_atomic_fixed2_swp (kmp_Indent *loc, int gtid, int16_t *addr, int16_t val) ; +EXTERN void __kmpc_atomic_fixed4_wr (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_add (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_sub (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_sub_rev (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_mul (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_div (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_div_rev (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4u_div (kmp_Indent *loc, int gtid, uint32_t *addr, uint32_t val) ; +EXTERN void __kmpc_atomic_fixed4u_div_rev (kmp_Indent *loc, int gtid, uint32_t *addr, uint32_t val) ; +EXTERN void __kmpc_atomic_fixed4_min (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_max (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_andb (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_orb (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_xor (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_andl (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_orl (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_eqv (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_neqv (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_shl (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_shl_rev (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_shr (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4_shr_rev (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed4u_shr (kmp_Indent *loc, int gtid, uint32_t *addr, uint32_t val) ; +EXTERN void __kmpc_atomic_fixed4u_shr_rev (kmp_Indent *loc, int gtid, uint32_t *addr, uint32_t val) ; +EXTERN void __kmpc_atomic_fixed4_swp (kmp_Indent *loc, int gtid, int32_t *addr, int32_t val) ; +EXTERN void __kmpc_atomic_fixed8_wr (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_add (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_sub (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_sub_rev (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_mul (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_div (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_div_rev (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8u_div (kmp_Indent *loc, int gtid, uint64_t *addr, uint64_t val) ; +EXTERN void __kmpc_atomic_fixed8u_div_rev (kmp_Indent *loc, int gtid, uint64_t *addr, uint64_t val) ; +EXTERN void __kmpc_atomic_fixed8_min (kmp_Indent *loc, int gtid, int64_t 
*addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_max (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_andb (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_orb (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_xor (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_andl (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_orl (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_eqv (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_neqv (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_shl (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_shl_rev (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_shr (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8_shr_rev (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_fixed8u_shr (kmp_Indent *loc, int gtid, uint64_t *addr, uint64_t val) ; +EXTERN void __kmpc_atomic_fixed8u_shr_rev (kmp_Indent *loc, int gtid, uint64_t *addr, uint64_t val) ; +EXTERN void __kmpc_atomic_fixed8_swp (kmp_Indent *loc, int gtid, int64_t *addr, int64_t val) ; +EXTERN void __kmpc_atomic_float4_add (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_sub (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_sub_rev (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_mul (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_div (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_div_rev (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_min (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float4_max (kmp_Indent *loc, int gtid, float *addr, float val) ; +EXTERN void __kmpc_atomic_float8_add (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_sub (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_sub_rev (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_mul (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_div (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_div_rev (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_min (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_float8_max (kmp_Indent *loc, int gtid, double *addr, double val) ; +EXTERN void __kmpc_atomic_cmplx4_add (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx4_sub (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx4_sub_rev (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx4_mul (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx4_div (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void 
__kmpc_atomic_cmplx4_div_rev (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx4_swp (kmp_Indent *loc, int gtid, float _Complex *addr, float _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_add (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_sub (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_sub_rev (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_mul (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_div (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_div_rev (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN void __kmpc_atomic_cmplx8_swp (kmp_Indent *loc, int gtid, double _Complex *addr, double _Complex val) ; +EXTERN int8_t __kmpc_atomic_fixed1_wr_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_add_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_sub_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_sub_cpt_rev (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_mul_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_div_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_div_cpt_rev (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN uint8_t __kmpc_atomic_fixed1u_div_cpt (kmp_Indent *loc, int gtid, uint8_t *lhs, uint8_t rhs, int atomicFlag) ; +EXTERN uint8_t __kmpc_atomic_fixed1u_div_cpt_rev (kmp_Indent *loc, int gtid, uint8_t *lhs, uint8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_min_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_max_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_andb_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_orb_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_xor_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_andl_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_orl_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_eqv_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_neqv_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_shl_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_shl_cpt_rev (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_shr_cpt (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int atomicFlag) ; +EXTERN int8_t __kmpc_atomic_fixed1_shr_cpt_rev (kmp_Indent *loc, int gtid, int8_t *lhs, int8_t rhs, int 
atomicFlag) ; +EXTERN uint8_t __kmpc_atomic_fixed1u_shr_cpt (kmp_Indent *loc, int gtid, uint8_t *lhs, uint8_t rhs, int atomicFlag) ; +EXTERN uint8_t __kmpc_atomic_fixed1u_shr_cpt_rev (kmp_Indent *loc, int gtid, uint8_t *lhs, uint8_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_wr_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_add_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_sub_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_sub_cpt_rev (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_mul_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_div_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_div_cpt_rev (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN uint16_t __kmpc_atomic_fixed2u_div_cpt (kmp_Indent *loc, int gtid, uint16_t *lhs, uint16_t rhs, int atomicFlag) ; +EXTERN uint16_t __kmpc_atomic_fixed2u_div_cpt_rev (kmp_Indent *loc, int gtid, uint16_t *lhs, uint16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_min_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_max_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_andb_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_orb_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_xor_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_andl_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_orl_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_eqv_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_neqv_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_shl_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_shl_cpt_rev (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_shr_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_shr_cpt_rev (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN uint16_t __kmpc_atomic_fixed2u_shr_cpt (kmp_Indent *loc, int gtid, uint16_t *lhs, uint16_t rhs, int atomicFlag) ; +EXTERN uint16_t __kmpc_atomic_fixed2u_shr_cpt_rev (kmp_Indent *loc, int gtid, uint16_t *lhs, uint16_t rhs, int atomicFlag) ; +EXTERN int16_t __kmpc_atomic_fixed2_swp_cpt (kmp_Indent *loc, int gtid, int16_t *lhs, int16_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_wr_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_add_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_sub_cpt (kmp_Indent *loc, int 
gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_sub_cpt_rev (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_mul_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_div_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_div_cpt_rev (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN uint32_t __kmpc_atomic_fixed4u_div_cpt (kmp_Indent *loc, int gtid, uint32_t *lhs, uint32_t rhs, int atomicFlag) ; +EXTERN uint32_t __kmpc_atomic_fixed4u_div_cpt_rev (kmp_Indent *loc, int gtid, uint32_t *lhs, uint32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_min_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_max_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_andb_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_orb_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_xor_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_andl_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_orl_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_eqv_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_neqv_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_shl_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_shl_cpt_rev (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_shr_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_shr_cpt_rev (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN uint32_t __kmpc_atomic_fixed4u_shr_cpt (kmp_Indent *loc, int gtid, uint32_t *lhs, uint32_t rhs, int atomicFlag) ; +EXTERN uint32_t __kmpc_atomic_fixed4u_shr_cpt_rev (kmp_Indent *loc, int gtid, uint32_t *lhs, uint32_t rhs, int atomicFlag) ; +EXTERN int32_t __kmpc_atomic_fixed4_swp_cpt (kmp_Indent *loc, int gtid, int32_t *lhs, int32_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_wr_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_add_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_sub_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_sub_cpt_rev (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_mul_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_div_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_div_cpt_rev (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN uint64_t 
__kmpc_atomic_fixed8u_div_cpt (kmp_Indent *loc, int gtid, uint64_t *lhs, uint64_t rhs, int atomicFlag) ; +EXTERN uint64_t __kmpc_atomic_fixed8u_div_cpt_rev (kmp_Indent *loc, int gtid, uint64_t *lhs, uint64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_min_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_max_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_andb_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_orb_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_xor_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_andl_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_orl_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_eqv_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_neqv_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_shl_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_shl_cpt_rev (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_shr_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_shr_cpt_rev (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN uint64_t __kmpc_atomic_fixed8u_shr_cpt (kmp_Indent *loc, int gtid, uint64_t *lhs, uint64_t rhs, int atomicFlag) ; +EXTERN uint64_t __kmpc_atomic_fixed8u_shr_cpt_rev (kmp_Indent *loc, int gtid, uint64_t *lhs, uint64_t rhs, int atomicFlag) ; +EXTERN int64_t __kmpc_atomic_fixed8_swp_cpt (kmp_Indent *loc, int gtid, int64_t *lhs, int64_t rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_add_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_sub_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_sub_cpt_rev (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_mul_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_div_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_div_cpt_rev (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_min_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN float __kmpc_atomic_float4_max_cpt (kmp_Indent *loc, int gtid, float *lhs, float rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_add_cpt (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_sub_cpt (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_sub_cpt_rev (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_mul_cpt (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_div_cpt 
(kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_div_cpt_rev (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_min_cpt (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; +EXTERN double __kmpc_atomic_float8_max_cpt (kmp_Indent *loc, int gtid, double *lhs, double rhs, int atomicFlag) ; + +//special case according to iomp reference +EXTERN void __kmpc_atomic_cmplx4_add_cpt (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_sub_cpt (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_sub_cpt_rev (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_mul_cpt (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_div_cpt (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_div_cpt_rev (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; +EXTERN void __kmpc_atomic_cmplx4_swp_cpt (kmp_Indent *loc, int gtid, float _Complex *lhs, float _Complex rhs, float _Complex *out, int atomicFlag) ; + +EXTERN double _Complex __kmpc_atomic_cmplx8_add_cpt (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_sub_cpt (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_sub_cpt_rev (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_mul_cpt (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_div_cpt (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_div_cpt_rev (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; +EXTERN double _Complex __kmpc_atomic_cmplx8_swp_cpt (kmp_Indent *loc, int gtid, double _Complex *lhs, double _Complex rhs, int atomicFlag) ; + + +//atomic for array reduction + +EXTERN void __array_atomic_fixed1_add (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_sub (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_mul (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_div (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_min (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_max (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_andb (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_orb (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_xor (kmp_Indent *id_ref, int32_t gtid, 
int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_shl (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_shr (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_andl (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_orl (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_eqv (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed1_neqv (kmp_Indent *id_ref, int32_t gtid, int8_t * lhs, int8_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_add (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_sub (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_mul (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_div (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_min (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_max (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_andb (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_orb (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_xor (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_shl (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_shr (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_andl (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_orl (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_eqv (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed2_neqv (kmp_Indent *id_ref, int32_t gtid, int16_t * lhs, int16_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_add (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_sub (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_mul (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_div (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_min (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_max (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_andb (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_orb (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_xor (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_shl 
(kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_shr (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_andl (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_orl (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_eqv (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed4_neqv (kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_add (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_sub (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_mul (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_div (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_min (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_max (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_andb (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_orb (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_xor (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_shl (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_shr (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_andl (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_orl (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_eqv (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_fixed8_neqv (kmp_Indent *id_ref, int32_t gtid, int64_t * lhs, int64_t *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_add (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_sub (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_mul (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_div (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_min (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float4_max (kmp_Indent *id_ref, int32_t gtid, float * lhs, float *rhs, int64_t n) ; +EXTERN void __array_atomic_float8_add (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ; +EXTERN void __array_atomic_float8_sub (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ; +EXTERN void __array_atomic_float8_mul (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ; +EXTERN void __array_atomic_float8_div (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ; +EXTERN void 
__array_atomic_float8_min (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ;
+EXTERN void __array_atomic_float8_max (kmp_Indent *id_ref, int32_t gtid, double * lhs, double *rhs, int64_t n) ;
+
+// non standard
+EXTERN void __kmpc_kernel_init(int OmpHandle, int ThreadLimit);
+EXTERN int __kmpc_kernel_prepare_parallel(int numThreads, int numLanes);
+EXTERN void __kmpc_kernel_parallel(int numLanes);
+EXTERN void __kmpc_kernel_end_parallel();
+
+#endif
+
Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu
===================================================================
--- /dev/null
+++ libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -0,0 +1,387 @@
+//===------------ libcall.cu - NVPTX OpenMP user calls ----------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenMP runtime functions that can be
+// invoked by the user in an OpenMP region
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h"
+NOINLINE void PrintTaskDescr(omptarget_nvptx_TaskDescr *taskDescr, char *title, int level);
+
+EXTERN double omp_get_wtick(void)
+{
+  double rc = omptarget_nvptx_threadPrivateContext->GlobalICV()->gpuCycleTime;
+  PRINT(LD_IO, "call omp_get_wtick() returns %g\n", rc);
+  return rc;
+}
+
+EXTERN double omp_get_wtime(void)
+{
+  double rc = omptarget_nvptx_threadPrivateContext->GlobalICV()->gpuCycleTime * clock64();
+  PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_num_threads(int num)
+{
+  PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
+  if (num <= 0) {
+    WARNING0(LW_INPUT, "expected positive num; ignore\n");
+  } else {
+    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    currTaskDescr->NThreads() = num;
+  }
+}
+
+EXTERN int omp_get_num_threads(void)
+{
+  int gtid = GetGlobalThreadId();
+  int rc = GetNumberOfOmpThreads(gtid);
+  PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_max_threads(void)
+{
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  int rc = 1; // default is 1 thread avail
+  if (! currTaskDescr->InParallelRegion()) {
+    // not currently in a parallel region... all are available
+    rc = GetNumberOfProcsInTeam();
+    ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
+  }
+  PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_thread_limit(void)
+{
+  // per contention group.. meaning threads in current team
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  int rc = currTaskDescr->ThreadLimit();
+  PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_thread_num()
+{
+  int gtid = GetGlobalThreadId();
+  int rc = GetOmpThreadId(gtid);
+  PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_num_procs(void)
+{
+  int rc = GetNumberOfThreadsInBlock();
+  PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_in_parallel(void)
+{
+  int rc = 0;
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  if (currTaskDescr->InParallelRegion()) {
+    rc = 1;
+  }
+  PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_in_final(void)
+{
+  // treat all tasks as final... Specs may expect runtime to keep
+  // track more precisely if a task was actively set by users... This
+  // is not explicitly specified; will treat as if runtime can
+  // actively decide to put a non-final task into a final one.
+  int rc = 1;
+  PRINT(LD_IO, "call omp_in_final() returns %d\n", rc);
+  return rc;
+}
+
+
+EXTERN void omp_set_dynamic(int flag)
+{
+  PRINT(LD_IO, "call omp_set_dynamic(%d)\n", flag);
+
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  if (flag) {
+    currTaskDescr->SetDynamic();
+  } else {
+    currTaskDescr->ClearDynamic();
+  }
+}
+
+EXTERN int omp_get_dynamic(void)
+{
+  int rc = 0;
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  if (currTaskDescr->IsDynamic()) {
+    rc = 1;
+  }
+  PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_nested(int flag)
+{
+  PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", flag);
+}
+
+EXTERN int omp_get_nested(void)
+{
+  int rc = 0;
+  PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_max_active_levels(int level)
+{
+  PRINT(LD_IO, "call omp_set_max_active_levels(%d) is ignored (no nested support)\n", level);
+}
+
+EXTERN int omp_get_max_active_levels(void)
+{
+  int rc = 1;
+  PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_level(void)
+{
+  int level = 0;
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread");
+  do {
+    if (currTaskDescr->IsParallelConstruct()) {
+      level++;
+    }
+    currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+  } while (currTaskDescr);
+  PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
+  return level;
+}
+
+EXTERN int omp_get_active_level(void)
+{
+  int level = 0; // no active level parallelism
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread");
+  do {
+    if (currTaskDescr->ThreadsInTeam() > 1) {
+      // has a parallel with more than one thread in team
+      level = 1;
+      break;
+    }
+    currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+  } while (currTaskDescr);
+  PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level);
+  return level;
+}
+
+EXTERN int omp_get_ancestor_thread_num(int level)
+{
+  int rc = 0; // default at level 0
+  if (level>=0) {
+    int totLevel = omp_get_level();
+    if (level<=totLevel) {
+      omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+      int steps = totLevel - level;
+      PRINT(LD_IO, "backtrack %d steps\n", steps);
+      ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread");
+      do {
+        if (DON(LD_IOD)) PrintTaskDescr(currTaskDescr, (char *)"ancestor", steps);
+        if (currTaskDescr->IsParallelConstruct()) {
+          // found the level
+          if (! steps) {
+            rc = currTaskDescr->ThreadId();
+            break;
+          }
+          steps--;
+        }
+        currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+      } while (currTaskDescr);
+      ASSERT0(LT_FUSSY, ! steps, "expected to find all steps");
+    }
+  }
+  PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, rc);
+  return rc;
+}
+
+
+EXTERN int omp_get_team_size(int level)
+{
+  int rc = 1; // default at level 0
+  if (level>=0) {
+    int totLevel = omp_get_level();
+    if (level<=totLevel) {
+      omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+      int steps = totLevel - level;
+      ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread");
+      do {
+        if (currTaskDescr->IsParallelConstruct()) {
+          if (! steps) {
+            // found the level
+            rc = currTaskDescr->ThreadsInTeam();
+            break;
+          }
+          steps--;
+        }
+        currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+      } while (currTaskDescr);
+      ASSERT0(LT_FUSSY, ! steps, "expected to find all steps");
+    }
+  }
+  PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc);
+  return rc;
+}
+
+
+EXTERN void omp_get_schedule(omp_sched_t * kind, int * modifier)
+{
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  *kind = currTaskDescr->GetRuntimeSched();
+  *modifier = currTaskDescr->RuntimeChunkSize();
+  PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
+    (int) *kind, *modifier);
+}
+
+EXTERN void omp_set_schedule(omp_sched_t kind, int modifier)
+{
+  PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n",
+    (int) kind, modifier);
+  if (kind>=omp_sched_static && kind<=omp_sched_auto) {
+    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    currTaskDescr->SetRuntimeSched(kind);
+    currTaskDescr->RuntimeChunkSize() = modifier;
+    PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %d\n",
+      (int) currTaskDescr->GetRuntimeSched(), currTaskDescr->RuntimeChunkSize());
+  }
+}
+
+EXTERN omp_proc_bind_t omp_get_proc_bind(void)
+{
+  PRINT0(LD_IO, "call omp_get_proc_bind() is true, regardless of state\n");
+  return omp_proc_bind_true;
+}
+
+EXTERN int omp_get_cancellation(void)
+{
+  int rc = omptarget_nvptx_threadPrivateContext->GlobalICV()->cancelPolicy;
+  PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_default_device(int deviceId)
+{
+  PRINT0(LD_IO, "call omp_set_default_device() is undef on device\n");
+}
+
+EXTERN int omp_get_default_device(void)
+{
+  PRINT0(LD_IO, "call omp_get_default_device() is undef on device, returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_num_devices(void)
+{
+  PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_num_teams(void)
+{
+  int rc = GetNumberOfOmpTeams();
+  PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_team_num()
+{
+  int rc = GetOmpTeamId();
+  PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_is_initial_device(void)
+{
+  PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n");
+  return 0; // 0 by def on device
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// locks
+////////////////////////////////////////////////////////////////////////////////
+
+#define __OMP_SPIN 1000
+#define UNSET 0
+#define SET 1
+
+EXTERN void omp_init_lock(omp_lock_t * lock) +{ + *lock = UNSET; + PRINT0(LD_IO, "call omp_init_lock()\n"); +} + +EXTERN void omp_destroy_lock(omp_lock_t * lock) +{ + PRINT0(LD_IO, "call omp_destroy_lock()\n"); +} + +EXTERN void omp_set_lock(omp_lock_t * lock) +{ + // int atomicCAS(int* address, int compare, int val); + // (old == compare ? val : old) + int compare = UNSET; + int val = SET; + + // TODO: not sure spinning is a good idea here.. + while (atomicCAS(lock, compare, val) != UNSET) { + + clock_t start = clock(); + clock_t now; + for (;;) + { + now = clock(); + clock_t cycles = now > start ? now - start : now + (0xffffffff - start); + if (cycles >= __OMP_SPIN*blockIdx.x) { + break; + } + } + } // wait for 0 to be the read value + + PRINT0(LD_IO, "call omp_set_lock()\n"); +} + +EXTERN void omp_unset_lock(omp_lock_t * lock) +{ + int compare = SET; + int val = UNSET; + int old = atomicCAS (lock, compare, val); + + PRINT0(LD_IO, "call omp_unset_lock()\n"); +} + +EXTERN int omp_test_lock(omp_lock_t * lock) +{ + // int atomicCAS(int* address, int compare, int val); + // (old == compare ? val : old) + int compare = UNSET; + int val = SET; + + // TODO: should check for the lock to be SET? + int ret = atomicCAS(lock, compare, val); + + PRINT(LD_IO, "call omp_test_lock() return %d\n", ret); + + return ret; +} Index: libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -0,0 +1,562 @@ +//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the KMPC interface +// for the loop construct plus other worksharing constructs that use the same +// interface as loops. +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +EXTERN void CompleteCG(omptarget_nvptx_CounterGroup & cg, Counter * priv, + Counter n); + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +// template class that encapsulate all the helper functions +// +// T is loop iteration type (32 | 64) (unsigned | signed) +// ST is the signed version of T +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + + +template +class omptarget_nvptx_LoopSupport { +public: + +//////////////////////////////////////////////////////////////////////////////// +// Loop with static scheduling with chunk + +// Generic implementation of OMP loop scheduling with static policy +/*! \brief Calculate initial bounds for static loop and stride + * @param[in] loc location in code of the call (not used here) + * @param[in] global_tid global thread id + * @param[in] schetype type of scheduling (see omptarget-nvptx.h) + * @param[in] plastiter pointer to last iteration + * @param[in,out] pointer to loop lower bound. it will contain value of + * lower bound of first chunk + * @param[in,out] pointer to loop upper bound. 
It will contain value of + * upper bound of first chunk + * @param[in,out] pointer to loop stride. It will contain value of stride + * between two successive chunks executed by the same thread + * @param[in] loop increment bump + * @param[in] chunk size + */ + +// helper function for static chunk +INLINE static void ForStaticChunk(T &lb, T &ub, ST &stride, ST chunk, + T entityId, T numberOfEntities) +{ + // each thread executes multiple chunks all of the same size, except + // the last one + + // distance between two successive chunks + stride = numberOfEntities * chunk; + lb = lb + entityId * chunk; + ub = lb + chunk -1; // Clang uses i <= ub +} + +//////////////////////////////////////////////////////////////////////////////// +// Loop with static scheduling without chunk + +// helper function for static no chunk +INLINE static void ForStaticNoChunk(T &lb, T &ub, ST &stride, ST &chunk, + T entityId, T numberOfEntities) +{ + // No chunk size specified. Each thread or warp gets at most one + // chunk; chunks are all almost of equal size + T loopSize = ub - lb + 1; + + chunk = loopSize / numberOfEntities; + T leftOver = loopSize - chunk * numberOfEntities; + + if (entityId < leftOver) { + chunk++; + lb = lb + entityId * chunk; + } else { + lb = lb + entityId * chunk + leftOver; + } + + ub = lb + chunk -1; // Clang uses i <= ub + stride = loopSize; // make sure we only do 1 chunk per warp +} + +//////////////////////////////////////////////////////////////////////////////// +// Support for Static Init + +INLINE static void for_static_init(int32_t schedtype, + T *plower, T *pupper, ST *pstride, ST chunk) +{ + int gtid = GetGlobalThreadId(); + + // Assume we are in teams region or that we use a single block + // per target region + ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(gtid); + + // All warps that are in excess of the maximum requested, do + // not execute the loop + PRINT(LD_LOOP, + "OMP Thread %d: schedule type %d, chunk size = %lld, mytid %d, num tids %d\n", + GetOmpThreadId(gtid), schedtype, P64(chunk), GetOmpThreadId(gtid), + GetNumberOfOmpThreads(gtid)); + ASSERT0(LT_FUSSY, GetOmpThreadId(gtid) 0) { + entityId = GetOmpThreadId(gtid); + numberOfEntities = GetNumberOfOmpThreads(gtid); + ForStaticChunk(lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_nochunk : + { + entityId = GetOmpThreadId(gtid); + numberOfEntities = GetNumberOfOmpThreads(gtid); + ForStaticNoChunk(lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } + case kmp_sched_distr_static_chunk : + { + if (chunk > 0) { + entityId = GetOmpTeamId(); + numberOfEntities = GetNumberOfOmpTeams(); + ForStaticChunk(lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } // note: if chunk <=0, use nochunk + } + case kmp_sched_distr_static_nochunk : + { + entityId = GetOmpTeamId(); + numberOfEntities = GetNumberOfOmpTeams(); + + ForStaticNoChunk(lb, ub, stride, chunk, entityId, numberOfEntities); + break; + } + default: + { + ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype); + PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", + schedtype); + entityId = GetOmpThreadId(gtid); + numberOfEntities = GetNumberOfOmpThreads(gtid); + ForStaticChunk(lb, ub, stride, chunk, entityId, numberOfEntities); + } + } + // copy back + *plower = lb; + *pupper = ub; + *pstride = stride; + PRINT(LD_LOOP,"Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n", + GetNumberOfOmpThreads(gtid), 
GetNumberOfThreadsInBlock(), + P64(*plower), P64(*pupper), P64(*pstride)); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Support for dispatch Init + +INLINE static int OrderedSchedule(kmp_sched_t schedule) +{ + return schedule >= kmp_sched_ordered_first && + schedule <= kmp_sched_ordered_last; +} + +INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, + ST st, ST chunk) +{ + ASSERT0(LT_FUSSY, lb==0, "exected normalized loop"); + lb = 0; + + int gtid = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr * currTaskDescr = getMyTopTaskDescriptor(gtid); + T tnum = currTaskDescr->ThreadsInTeam(); + T tripCount = ub - lb + 1; // +1 because ub is inclusive + ASSERT0(LT_FUSSY, GetOmpThreadId(gtid)GetRuntimeSched(); + chunk = currTaskDescr->RuntimeChunkSize(); + switch (rtSched) { + case omp_sched_static : + { + if (chunk>0) schedule = kmp_sched_static_chunk; + else schedule = kmp_sched_static_nochunk; + break; + } + case omp_sched_auto : + { + schedule = kmp_sched_static_chunk; + chunk = 1; + break; + } + case omp_sched_dynamic : + case omp_sched_guided : + { + schedule = kmp_sched_dynamic; + break; + } + } + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule, P64(chunk)); + } else if (schedule == kmp_sched_auto) { + schedule = kmp_sched_static_chunk; + chunk = 1; + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule, P64(chunk)); + } else { + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk)); + ASSERT(LT_FUSSY, schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "unknown schedule %d & chunk %lld\n", + schedule, P64(chunk)); + } + + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(gtid) = schedule; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid) = ub; + + // init schedules + if (schedule == kmp_sched_static_chunk) { + ASSERT0(LT_FUSSY, chunk>0, "bad chunk value"); + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid) = ub; + // compute static chunk + ST stride; + T threadId = GetOmpThreadId(gtid); + ForStaticChunk(lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(gtid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(gtid) = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %lld," + "next lower bound = %lld, stride = %lld\n", + GetNumberOfOmpThreads(gtid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid), + omptarget_nvptx_threadPrivateContext->Stride(gtid)); + + } else if (schedule == kmp_sched_static_nochunk) { + ASSERT0(LT_FUSSY, chunk==0, "bad chunk value"); + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid) = ub; + // compute static chunk + ST stride; + T threadId = GetOmpThreadId(gtid); + ForStaticNoChunk(lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(gtid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(gtid) = stride; + PRINT(LD_LOOP, + "dispatch init (static nochunk) : num threads = %d, ub = %lld," + "next lower bound = %lld, stride = %lld\n", + GetNumberOfOmpThreads(gtid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid), + 
omptarget_nvptx_threadPrivateContext->Stride(gtid)); + + } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { + if (chunk<1) chunk = 1; + Counter eventNum = ((tripCount -1) / chunk) + 1; // number of chunks + // but each thread (but one) must discover that it is last + eventNum += tnum; + omptarget_nvptx_threadPrivateContext->Chunk(gtid) = chunk; + omptarget_nvptx_threadPrivateContext->EventsNumber(gtid) = eventNum; + PRINT(LD_LOOP, + "dispatch init (dyn) : num threads = %d, ub = %lld, chunk %lld, " + "events number = %lld\n", + GetNumberOfOmpThreads(gtid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid), + omptarget_nvptx_threadPrivateContext->Chunk(gtid), + omptarget_nvptx_threadPrivateContext->EventsNumber(gtid)); + } + +} + + +//////////////////////////////////////////////////////////////////////////////// +// Support for dispatch next + +INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup & cg, Counter priv, + T & lb, T & ub, Counter & chunkId, Counter & currentEvent, + T chunkSize, T loopUpperBound) +{ + // get next event atomically + Counter nextEvent = cg.Next(); + // calculate chunk Id (priv was initialized upon entering the loop to 'start' == 'event') + chunkId = nextEvent - priv; + // calculate lower bound for all lanes in the warp + lb = chunkId * chunkSize; // this code assume normalization of LB + ub = lb + chunkSize -1; // Clang uses i <= ub + + // 3 result cases: + // a. lb and ub < loopUpperBound --> NOT_FINISHED + // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> NOT_FINISHED + // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED + currentEvent = nextEvent; + // a. + if (ub <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", + P64(lb), P64(ub), P64(loopUpperBound)); + return NOT_FINISHED; + } + // b. + if (lb <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", + P64(lb), P64(ub), P64(loopUpperBound)); + ub = loopUpperBound; + return LAST_CHUNK; + } + // c. if we are here, we are in case 'c' + lb = loopUpperBound +1; + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", + P64(lb), P64(ub), P64(loopUpperBound)); + return FINISHED; +} + + +INLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper, ST *pstride) +{ + // ID of a thread in its own warp + + // automatically selects thread or warp ID based on selected implementation + int gtid = GetGlobalThreadId(); + ASSERT0(LT_FUSSY, GetOmpThreadId(gtid)ScheduleType(gtid); + + // xxx reduce to one + if (schedule == kmp_sched_static_chunk || schedule == kmp_sched_static_nochunk) { + T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid); + T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid); + // finished? 
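+  // Worked example with assumed numbers: for a static chunk of 4 and 32 OpenMP
+  // threads, thread 0 is handed [0,3] on its first call; NextLowerBound then
+  // advances by Stride = 32 * 4 = 128, so the next call yields [128,131], and
+  // so on until myLb steps past LoopUpperBound and DISPATCH_FINISHED is
+  // returned.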
+ if (myLb > ub) { + PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", P64(myLb), P64(ub)); + return DISPATCH_FINISHED; + } + // not finished, save current bounds + ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(gtid); + *plower = myLb; + T myUb = myLb + chunk -1; // Clang uses i <= ub + if (myUb > ub) myUb = ub; + *pupper = myUb; + + // increment next lower bound by the stride + ST stride = omptarget_nvptx_threadPrivateContext->Stride(gtid); + omptarget_nvptx_threadPrivateContext->NextLowerBound(gtid) = myLb + stride; + PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", P64(*plower), P64(*pupper)); + return DISPATCH_NOTFINISHED; + } + ASSERT0(LT_FUSSY, schedule==kmp_sched_dynamic || schedule==kmp_sched_guided, "bad sched"); + omptarget_nvptx_TeamDescr & teamDescr = getMyTeamDescriptor(); + T myLb, myUb; + Counter chunkId; + // xxx current event is now local + omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup(); + int finished = DynamicNextChunk(cg, + omptarget_nvptx_threadPrivateContext->Priv(gtid), myLb, myUb, chunkId, + omptarget_nvptx_threadPrivateContext->CurrentEvent(gtid), + omptarget_nvptx_threadPrivateContext->Chunk(gtid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(gtid)); + + if (finished == FINISHED) { + cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(gtid), + omptarget_nvptx_threadPrivateContext->EventsNumber(gtid)); + cg.Release (omptarget_nvptx_threadPrivateContext->Priv(gtid), + omptarget_nvptx_threadPrivateContext->CurrentEvent(gtid)); + + return DISPATCH_FINISHED; + } + + // not finished (either not finished or last chunk) + *plower = myLb; + *pupper = myUb; + *pstride = 1; + + PRINT(LD_LOOP,"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n", + GetNumberOfOmpThreads(gtid), GetNumberOfThreadsInBlock(), + P64(*plower), P64(*pupper), P64(*pstride)); + return DISPATCH_NOTFINISHED; +} + +INLINE static void dispatch_fini() +{ + // nothing +} + +//////////////////////////////////////////////////////////////////////////////// +// end of template class that encapsulate all the helper functions +//////////////////////////////////////////////////////////////////////////////// +} ; + + +//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (dyn loops) +//////////////////////////////////////////////////////////////////////////////// + +// init +EXTERN void __kmpc_dispatch_init_4(kmp_Indent * loc, int32_t gtid, + int32_t schedule, int32_t lb, int32_t ub, int32_t st, int32_t chunk) +{ + PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); + omptarget_nvptx_LoopSupport::dispatch_init((kmp_sched_t) schedule, + lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_4u(kmp_Indent * loc, int32_t gtid, + int32_t schedule, uint32_t lb, uint32_t ub, int32_t st, int32_t chunk) +{ + PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); + omptarget_nvptx_LoopSupport::dispatch_init((kmp_sched_t) schedule, + lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8(kmp_Indent * loc, int32_t gtid, + int32_t schedule, int64_t lb, int64_t ub, int64_t st, int64_t chunk) +{ + PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); + omptarget_nvptx_LoopSupport::dispatch_init((kmp_sched_t) schedule, + lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8u(kmp_Indent * loc, int32_t gtid, + int32_t schedule, uint64_t lb, uint64_t ub, int64_t st, int64_t chunk) +{ + PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); + 
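+  // Each width-specific entry point presumably instantiates the loop-support
+  // template with the (T, ST) pair implied by its suffix: (int32_t, int32_t)
+  // for _4, (uint32_t, int32_t) for _4u, (int64_t, int64_t) for _8, and
+  // (uint64_t, int64_t) for _8u. This call therefore dispatches with
+  // T = uint64_t and ST = int64_t.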
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init((kmp_sched_t) schedule,
+    lb, ub, st, chunk);
+}
+
+// next
+EXTERN int __kmpc_dispatch_next_4(kmp_Indent * loc, int32_t gtid,
+  int32_t * p_last, int32_t * p_lb, int32_t * p_ub, int32_t * p_st)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(p_last,
+    p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_4u(kmp_Indent * loc, int32_t gtid,
+  int32_t * p_last, uint32_t * p_lb, uint32_t * p_ub, int32_t * p_st)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(p_last,
+    p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8(kmp_Indent * loc, int32_t gtid,
+  int32_t * p_last, int64_t * p_lb, int64_t * p_ub, int64_t * p_st)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(p_last,
+    p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8u(kmp_Indent * loc, int32_t gtid,
+  int32_t * p_last, uint64_t * p_lb, uint64_t * p_ub, int64_t * p_st)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(p_last,
+    p_lb, p_ub, p_st);
+}
+
+// fini
+EXTERN void __kmpc_dispatch_fini_4 (kmp_Indent * loc, int32_t gtid)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_4u (kmp_Indent * loc, int32_t gtid)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8 (kmp_Indent * loc, int32_t gtid)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8u (kmp_Indent * loc, int32_t gtid)
+{
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
+
+EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid,
+  int32_t schedtype, int32_t *plastiter, int32_t *plower, int32_t *pupper,
+  int32_t *pstride, int32_t incr, int32_t chunk)
+{
+  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(schedtype,
+    plower, pupper, pstride, chunk);
+}
+
+EXTERN void __kmpc_for_static_init_4u (kmp_Indent *loc, int32_t global_tid,
+  int32_t schedtype, int32_t *plastiter, uint32_t *plower, uint32_t *pupper,
+  int32_t *pstride, int32_t incr, int32_t chunk)
+{
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(schedtype,
+    plower, pupper, pstride, chunk);
+}
+
+EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid,
+  int32_t schedtype, int32_t *plastiter, int64_t *plower, int64_t *pupper,
+  int64_t *pstride, int64_t incr, int64_t chunk)
+{
+  PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(schedtype,
+    plower, pupper, pstride, chunk);
+}
+
+EXTERN void __kmpc_for_static_init_8u (kmp_Indent *loc, int32_t global_tid,
+  int32_t schedtype, int32_t *plastiter, uint64_t *plower, uint64_t *pupper,
+  int64_t *pstride, int64_t incr, int64_t chunk)
+{
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(schedtype,
+    plower, pupper, pstride, chunk);
+}
+
+
+EXTERN void
__kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) +{ + PRINT0(LD_IO, "call kmpc_for_static_fini\n"); +} Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -0,0 +1,247 @@ +//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. +// +//===----------------------------------------------------------------------===// + +#ifndef __OMPTARGET_NVPTX_H +#define __OMPTARGET_NVPTX_H + +// std includes +#include +#include + +// cuda includes +#include +#include + +// local includes +#include "../../../deviceRTLs/nvptx/src/option.h" // choices we have +#include "../../../deviceRTLs/nvptx/src/counter_group.h" +#include "../../../deviceRTLs/nvptx/src/debug.h" // debug +#include "../../../deviceRTLs/nvptx/src/interface.h" // interfaces with omp, compiler, and user +#include "../../../deviceRTLs/nvptx/src/support.h" + +#define OMPTARGET_NVPTX_VERSION 1.1 + +// used by the library for the interface with the app +#define DISPATCH_FINISHED 0 +#define DISPATCH_NOTFINISHED 1 + +// used by dynamic scheduling +#define FINISHED 0 +#define NOT_FINISHED 1 +#define LAST_CHUNK 2 + + +#define TEAM_MASTER 0 +#define BARRIER_COUNTER 0 +#define ORDERED_COUNTER 1 + +//////////////////////////////////////////////////////////////////////////////// +// global ICV + +typedef struct omptarget_nvptx_GlobalICV { + double gpuCycleTime; // currently statically determined, should be set by host + uint8_t cancelPolicy; // 1 bit: enabled (true) or disabled (false) +} omptarget_nvptx_GlobalICV; + +//////////////////////////////////////////////////////////////////////////////// +// task ICV and (implicit & explicit) task state + +class omptarget_nvptx_TaskDescr { + public: + // methods for flags + INLINE omp_sched_t GetRuntimeSched(); + INLINE void SetRuntimeSched(omp_sched_t sched); + INLINE int IsDynamic() { return data.items.flags & TaskDescr_IsDynamic; } + INLINE void SetDynamic() { data.items.flags = data.items.flags | TaskDescr_IsDynamic; } + INLINE void ClearDynamic() { data.items.flags = data.items.flags & (~TaskDescr_IsDynamic); } + INLINE int InParallelRegion() { return data.items.flags & TaskDescr_InPar; } + INLINE int IsParallelConstruct() { return data.items.flags & TaskDescr_IsParConstr; } + INLINE int IsTaskConstruct() { return ! 
IsParallelConstruct(); } + // methods for other fields + INLINE uint16_t & NThreads() { return data.items.nthreads; } + INLINE uint16_t & ThreadLimit() { return data.items.threadlimit; } + INLINE uint16_t & ThreadId() { return data.items.threadId; } + INLINE uint16_t & ThreadsInTeam() { return data.items.threadsInTeam; } + INLINE uint64_t & RuntimeChunkSize() { return data.items.runtimeChunkSize; } + INLINE omptarget_nvptx_TaskDescr * GetPrevTaskDescr() { return prev; } + INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { prev = taskDescr; } + // init & copy + INLINE void InitLevelZeroTaskDescr(); + INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum); + INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); + + private: + /* bits for flags: (6 used, 2 free) + 3 bits (SchedMask) for runtime schedule + 1 bit (IsDynamic) for dynamic schedule (false = static) + 1 bit (InPar) if this thread has encountered one or more parallel region + 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) + */ + static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); + static const uint8_t TaskDescr_IsDynamic = 0x8; + static const uint8_t TaskDescr_InPar = 0x10; + static const uint8_t TaskDescr_IsParConstr = 0x20; + + union { // both have same size + uint64_t vect[2]; + struct TaskDescr_items { + uint8_t flags; // 6 bit used (see flag above) + uint8_t unused; + uint16_t nthreads; // thread num for subsequent parallel regions + uint16_t threadlimit; // thread limit ICV + uint16_t threadId; // thread id + uint16_t threadsInTeam; // threads in current team + uint64_t runtimeChunkSize; // runtime chunk size + } items; + } data; + omptarget_nvptx_TaskDescr *prev; +}; + +// build on kmp +typedef struct omptarget_nvptx_ExplicitTaskDescr { + omptarget_nvptx_TaskDescr taskDescr; // omptarget_nvptx task description (must be first) + kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) +} omptarget_nvptx_ExplicitTaskDescr; + +class omptarget_nvptx_TeamDescr; +class omptarget_nvptx_GlobalICV; + +//////////////////////////////////////////////////////////////////////////////// +// thread private data (struct of arrays for better coalescing) +// tid refers here to the global thread id +// do not support multiple concurrent kernel a this time +class omptarget_nvptx_ThreadPrivateContext { +public: + // task + INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int gtid) { return & levelOneTaskDescr[gtid]; } + INLINE void SetTopLevelTaskDescr(int gtid, omptarget_nvptx_TaskDescr *taskICV) { topTaskDescr[gtid] = taskICV; } + INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int gtid); + // parallel + INLINE uint16_t & NumThreadsForNextParallel(int gtid) { return tnumForNextPar[gtid]; } + // sync + INLINE Counter & Priv(int gtid) { return priv[gtid]; } + INLINE void IncrementPriv(int gtid, Counter val) { priv[gtid] += val; } + // schedule (for dispatch) + INLINE kmp_sched_t & ScheduleType(int gtid) { return schedule[gtid]; } + INLINE int64_t & Chunk(int gtid) { return chunk[gtid]; } + INLINE int64_t & LoopUpperBound(int gtid) { return loopUpperBound[gtid]; } + // state for dispatch with dyn/guided + INLINE Counter & CurrentEvent(int gtid) { 
return currEvent_or_nextLowerBound[gtid]; } + INLINE Counter & EventsNumber(int gtid) { return eventsNum_or_stride[gtid]; } + // state for dispatch with static + INLINE Counter & NextLowerBound(int gtid) { return currEvent_or_nextLowerBound[gtid]; } + INLINE Counter & Stride(int gtid) { return eventsNum_or_stride[gtid]; } + + INLINE void SetTeamContext(omptarget_nvptx_TeamDescr *teamContext) { + omptarget_nvptx_teamContext = teamContext; + } + INLINE void SetGlobalICV(omptarget_nvptx_GlobalICV *globalICV) { + omptarget_nvptx_globalICV = globalICV; + } + INLINE omptarget_nvptx_TeamDescr *TeamContext() { return omptarget_nvptx_teamContext; } + INLINE omptarget_nvptx_GlobalICV *GlobalICV() { return omptarget_nvptx_globalICV; } + + INLINE void InitThreadPrivateContext(int gtid); + +private: + // task ICV for implict threads in the only parallel region + omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_NUM_OMP_THREADS]; + // pointer where to find the current task ICV (top of the stack) + omptarget_nvptx_TaskDescr *topTaskDescr[MAX_NUM_OMP_THREADS]; + // parallel + uint16_t tnumForNextPar[MAX_NUM_OMP_THREADS]; + // sync + Counter priv[MAX_NUM_OMP_THREADS]; + // schedule (for dispatch) + kmp_sched_t schedule[MAX_NUM_OMP_THREADS]; // remember schedule type for #for + int64_t chunk[MAX_NUM_OMP_THREADS]; + int64_t loopUpperBound[MAX_NUM_OMP_THREADS]; + // state for dispatch with dyn/guided OR static (never use both at a time) + Counter currEvent_or_nextLowerBound[MAX_NUM_OMP_THREADS]; + Counter eventsNum_or_stride[MAX_NUM_OMP_THREADS]; + + // Pointers to other omp data in same instance + // These are initialized in kernel_init + omptarget_nvptx_TeamDescr *omptarget_nvptx_teamContext; + omptarget_nvptx_GlobalICV *omptarget_nvptx_globalICV; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Descriptor of a parallel region (worksharing in general) + +class omptarget_nvptx_WorkDescr { + + public: + // access to data + INLINE omptarget_nvptx_CounterGroup & CounterGroup() { return cg; } + INLINE omptarget_nvptx_TaskDescr * WorkTaskDescr() { return & masterTaskICV; } + // init + INLINE void InitWorkDescr(); + +private: + omptarget_nvptx_CounterGroup cg; // for barrier (no other needed) + omptarget_nvptx_TaskDescr masterTaskICV; + bool hasCancel; +}; + + +//////////////////////////////////////////////////////////////////////////////// +// thread private data (struct of arrays for better coalescing) + +class omptarget_nvptx_TeamDescr { + public: + // access to data + INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { return &levelZeroTaskDescr; } + INLINE omptarget_nvptx_WorkDescr & WorkDescr() { return workDescrForActiveParallel; } + INLINE omp_lock_t *CriticalLock() {return &criticalLock; } + // init + INLINE void InitTeamDescr(); + + private: + omptarget_nvptx_TaskDescr levelZeroTaskDescr ; // icv for team master initial thread + omptarget_nvptx_WorkDescr workDescrForActiveParallel; // one, ONLY for the active par + omp_lock_t criticalLock; +}; + + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +extern __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + + +//////////////////////////////////////////////////////////////////////////////// +// get private data structures +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TeamDescr & 
getMyTeamDescriptor(); +INLINE omptarget_nvptx_WorkDescr & getMyWorkDescriptor(); +INLINE omptarget_nvptx_TaskDescr * getMyTopTaskDescriptor(); +INLINE omptarget_nvptx_TaskDescr * getMyTopTaskDescriptor(int globalThreadId); + + +//////////////////////////////////////////////////////////////////////////////// +// inlined implementation +//////////////////////////////////////////////////////////////////////////////// + +#include "../../../deviceRTLs/nvptx/src/supporti.h" +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptxi.h" +#include "../../../deviceRTLs/nvptx/src/counter_groupi.h" + +#endif Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -0,0 +1,70 @@ +//===--- omptarget-nvptx.cu - NVPTX OpenMP GPU initialization ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the initialization code for the GPU +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +__device__ omptarget_nvptx_TeamDescr omptarget_nvptx_device_teamContexts[MAX_INSTANCES][MAX_NUM_TEAMS]; +__device__ omptarget_nvptx_ThreadPrivateContext omptarget_nvptx_device_threadPrivateContext[MAX_INSTANCES]; +__device__ omptarget_nvptx_GlobalICV omptarget_nvptx_device_globalICV[MAX_INSTANCES]; + +__shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + +//////////////////////////////////////////////////////////////////////////////// +// init entry points +//////////////////////////////////////////////////////////////////////////////// + + +EXTERN void __kmpc_kernel_init(int OmpHandle, int ThreadLimit) +{ + PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", OMPTARGET_NVPTX_VERSION); + ASSERT0(LT_FUSSY, OmpHandle >=0 && OmpHandle < MAX_INSTANCES, + "omp handle out of bounds"); + omptarget_nvptx_threadPrivateContext = &omptarget_nvptx_device_threadPrivateContext[OmpHandle]; + omptarget_nvptx_threadPrivateContext->SetTeamContext( + &omptarget_nvptx_device_teamContexts[OmpHandle][0]); + omptarget_nvptx_threadPrivateContext->SetGlobalICV( + &omptarget_nvptx_device_globalICV[OmpHandle]); + + // init thread private + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(globalThreadId); + + int threadIdInBlock = GetThreadIdInBlock(); + if (threadIdInBlock == TEAM_MASTER) { + PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); + // init global icv + omptarget_nvptx_threadPrivateContext->GlobalICV()->gpuCycleTime = 1.0 / 745000000.0; // host reports 745 mHz + omptarget_nvptx_threadPrivateContext->GlobalICV()->cancelPolicy = FALSE; // currently false only + // init team context + omptarget_nvptx_TeamDescr & currTeamDescr = getMyTeamDescriptor(); + currTeamDescr.InitTeamDescr(); + // this thread will start execution... has to update its task ICV + // to points to the level zero task ICV. 
That ICV was init in + // InitTeamDescr() + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(globalThreadId, + currTeamDescr.LevelZeroTaskDescr()); + + // set number of threads and thread limit in team to started value + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + globalThreadId); + currTaskDescr->NThreads() = GetNumberOfThreadsInBlock(); + currTaskDescr->ThreadLimit() = ThreadLimit; + } +} + Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -0,0 +1,164 @@ +//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. +// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// Task Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() +{ + // sched starts from 1..4; encode it as 0..3; so add 1 here + uint8_t rc = (data.items.flags & TaskDescr_SchedMask) +1; + return (omp_sched_t) rc; +} + +INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) +{ + // sched starts from 1..4; encode it as 0..3; so add 1 here + uint8_t val = ((uint8_t) sched) -1; + // clear current sched + data.items.flags &= ~TaskDescr_SchedMask; + // set new sched + data.items.flags |= val; +} + +INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() +{ + // xxx slow method + /* flag: + default sched is static, + dyn is off (unused now anyway, but may need to sample from host ?) 
+ not in parallel + */ + data.items.flags = 0; + data.items.nthreads = GetNumberOfProcsInTeam();; // threads: whatever was alloc by kernel + data.items.threadId = 0; // is master + data.items.threadsInTeam = 1; // sequential + data.items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 +} + +INLINE void omptarget_nvptx_TaskDescr::CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr) +{ + data.vect[0] = sourceTaskDescr->data.vect[0]; + data.vect[1] = sourceTaskDescr->data.vect[1]; +} + +INLINE void omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) +{ + CopyData(sourceTaskDescr); + prev = sourceTaskDescr->prev; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr) +{ + CopyData(parentTaskDescr); + prev = parentTaskDescr; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr) +{ + CopyParent(parentTaskDescr); + data.items.flags = data.items.flags & ~TaskDescr_IsParConstr; + ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task"); +} + +INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( + omptarget_nvptx_TaskDescr *masterTaskDescr, + uint16_t tnum) +{ + CopyParent(masterTaskDescr); + // overrwrite specific items; + data.items.flags |= TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel + data.items.threadsInTeam = tnum; // set number of threads +} + +INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr) +{ + Copy(workTaskDescr); + // overrwrite specific items; + data.items.threadId = GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) +} + +//////////////////////////////////////////////////////////////////////////////// +// Thread Private Context +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TaskDescr *omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int gtid) +{ + ASSERT0(LT_FUSSY, gtid < MAX_NUM_OMP_THREADS, + "Getting top level, gtid is larger than allocated data structure size"); + return topTaskDescr[gtid]; +} + +INLINE void omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int gtid) +{ + // levelOneTaskDescr is init when starting the parallel region + // top task descr is NULL (team master version will be fixed separately) + topTaskDescr[gtid] = NULL; + // no num threads value has been pushed + tnumForNextPar[gtid] = 0; + // priv counter init to zero + priv[gtid] = 0; + // the following don't need to be init here; they are init when using dyn sched + // current_Event, events_Number, chunk, num_Iterations, schedule +} + +//////////////////////////////////////////////////////////////////////////////// +// Work Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_WorkDescr::InitWorkDescr() +{ + cg.Clear(); // start and stop to zero too + // threadsInParallelTeam does not need to be init (done in start parallel) + hasCancel = FALSE; +} + +//////////////////////////////////////////////////////////////////////////////// +// Team Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() +{ + levelZeroTaskDescr.InitLevelZeroTaskDescr(); + workDescrForActiveParallel.InitWorkDescr(); + //omp_init_lock(criticalLock); +} + +//////////////////////////////////////////////////////////////////////////////// +// Get private data structure for 
thread +//////////////////////////////////////////////////////////////////////////////// + +// Utility routines for CUDA threads +INLINE omptarget_nvptx_TeamDescr & getMyTeamDescriptor() +{ + return omptarget_nvptx_threadPrivateContext->TeamContext()[GetOmpTeamId()]; +} + +INLINE omptarget_nvptx_WorkDescr & getMyWorkDescriptor() +{ + omptarget_nvptx_TeamDescr & currTeamDescr = getMyTeamDescriptor(); + return currTeamDescr.WorkDescr(); +} + +INLINE omptarget_nvptx_TaskDescr * getMyTopTaskDescriptor(int globalThreadId) +{ + return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(globalThreadId); +} + +INLINE omptarget_nvptx_TaskDescr * getMyTopTaskDescriptor() +{ + return getMyTopTaskDescriptor(GetGlobalThreadId()); +} + Index: libomptarget/deviceRTLs/nvptx/src/option.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -0,0 +1,78 @@ +//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// GPU default options +// +//===----------------------------------------------------------------------===// +#ifndef _OPTION_H_ +#define _OPTION_H_ + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// following two defs must match absolute limit hardwired in the host RTL +#define TEAMS_ABSOLUTE_LIMIT 512 /* omptx limit (must match teamsAbsoluteLimit) */ +#define THREAD_ABSOLUTE_LIMIT 1024 /* omptx limit (must match threadAbsoluteLimit) */ + +// max number of blocks depend on the kernel we are executing - pick default here +#define MAX_NUM_TEAMS TEAMS_ABSOLUTE_LIMIT +#define WARPSIZE 32 +#define MAX_NUM_WARPS (MAX_NUM_TEAMS * THREAD_ABSOLUTE_LIMIT / WARPSIZE) +#define MAX_NUM_THREADS MAX_NUM_WARPS * WARPSIZE + +#ifdef OMPTHREAD_IS_WARP + // assume here one OpenMP thread per CUDA warp + #define MAX_NUM_OMP_THREADS MAX_NUM_WARPS +#else + // assume here one OpenMP thread per CUDA thread + #define MAX_NUM_OMP_THREADS MAX_NUM_THREADS +#endif + +#define MAX_INSTANCES 16 + +//////////////////////////////////////////////////////////////////////////////// +// algo options +//////////////////////////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////////////////////////// +// data options +//////////////////////////////////////////////////////////////////////////////// + + +// decide if counters are 32 or 64 bit +#define Counter unsigned long long + +// aee: KMP defines kmp_int to be 32 or 64 bits depending on the target. 
+// think we don't need it here (meaning we can be always 64 bit compatible) +/* +#ifdef KMP_I8 + typedef kmp_int64 kmp_int; +#else + typedef kmp_int32 kmp_int; +#endif +*/ + +//////////////////////////////////////////////////////////////////////////////// +// misc options (by def everythig here is device) +//////////////////////////////////////////////////////////////////////////////// + +#define EXTERN extern "C" __device__ +#define INLINE __inline__ __device__ +#define NOINLINE __noinline__ __device__ +#ifndef TRUE + #define TRUE 1 +#endif +#ifndef FALSE + #define FALSE 0 +#endif + +#endif Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -0,0 +1,244 @@ +//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Parallel implemention in the GPU. Here is the pattern: +// +// while (not finished) { +// +// if (master) { +// sequential code, decide which par loop to do, or if finished +// __kmpc_kernel_prepare_parallel() // exec by master only +// } +// syncthreads // A +// __kmpc_kernel_parallel() // exec by all +// if (this thread is included in the parallel) { +// switch () for all parallel loops +// __kmpc_kernel_end_parallel() // exec only by threads in parallel +// } +// +// +// The reason we don't exec end_parallel for the threads not included +// in the parallel loop is that for each barrier in the parallel +// region, these non-included threads will cycle through the +// syncthread A. Thus they must preserve their current threadId that +// is larger than thread in team. +// +// To make a long story short... +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes parallel (1 static level only) +//////////////////////////////////////////////////////////////////////////////// + +// return number of cuda threads that participate to parallel +// calculation has to consider simd implementation in nvptx +// i.e. 
(num omp threads * num lanes) +// +// cudathreads = +// if(num_threads != 0) { +// if(thread_limit > 0) { +// min (num_threads*numLanes ; thread_limit*numLanes); +// } else { +// min (num_threads*numLanes; blockDim.x) +// } +// } else { +// if (thread_limit != 0) { +// min (thread_limit*numLanes; blockDim.x) +// } else { // no thread_limit, no num_threads, use all cuda threads +// blockDim.x; +// } +// } +EXTERN int __kmpc_kernel_prepare_parallel(int NumThreads, int NumLanes) +{ + PRINT0(LD_IO , "call to __kmpc_kernel_prepare_parallel\n"); + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(globalThreadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + if (currTaskDescr->InParallelRegion()) { + PRINT0(LD_PAR, "already in parallel: go seq\n"); + + // todo: support nested parallelism + return FALSE; + } + + uint16_t CudaThreadsForParallel = 0; + uint16_t NumThreadsClause = + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel( + globalThreadId); + + // we cannot have more than block size + uint16_t CudaThreadsAvail = GetNumberOfThreadsInBlock(); + + // this is different from ThreadAvail of OpenMP because we may be + // using some of the CUDA threads as SIMD lanes + + if (NumThreadsClause != 0) { + // reset request to avoid propagating to successive #parallel + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel( + globalThreadId) = 0; + + // assume that thread_limit*numlanes is already <= CudaThreadsAvail + // because that is already checked on the host side (CUDA offloading rtl) + if (currTaskDescr->ThreadLimit() != 0) + CudaThreadsForParallel = + NumThreadsClause*NumLanes < currTaskDescr->ThreadLimit()*NumLanes ? + NumThreadsClause*NumLanes : currTaskDescr->ThreadLimit()*NumLanes; + else { + CudaThreadsForParallel = (NumThreadsClause*NumLanes > CudaThreadsAvail) ? + CudaThreadsAvail : NumThreadsClause*NumLanes; + } + } else { + if (currTaskDescr->ThreadLimit() != 0) { + CudaThreadsForParallel = + (currTaskDescr->ThreadLimit()*NumLanes > CudaThreadsAvail) ? + CudaThreadsAvail : currTaskDescr->ThreadLimit()*NumLanes; + } else + CudaThreadsForParallel = GetNumberOfThreadsInBlock(); + } + + ASSERT(LT_FUSSY, CudaThreadsForParallel > 0, "bad thread request of %d threads", CudaThreadsForParallel); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == TEAM_MASTER, "only team master can create parallel"); + + // set number of threads on work descriptor + // this is different from the number of cuda threads required for the parallel + // region + omptarget_nvptx_WorkDescr & workDescr = getMyWorkDescriptor(); + workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, + CudaThreadsForParallel/NumLanes); + // init counters (copy start to init) + workDescr.CounterGroup().Reset(); + + return CudaThreadsForParallel; +} + +// works only for active parallel loop... 
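+// Editor's note: illustrative, editor-added sketch (kept under "#if 0", not
+// part of the patch) of the master/worker pattern described in the comment
+// at the top of this file. The shared variables and the inclusion test are
+// hypothetical compiler-generated state, not runtime APIs.
+#if 0
+__device__ void target_region_skeleton() {
+  __shared__ int cudaThreadsInParallel;
+  __shared__ int finished;
+  if (GetThreadIdInBlock() == TEAM_MASTER) finished = FALSE;
+  __syncthreads();
+  while (!finished) {
+    if (GetThreadIdInBlock() == TEAM_MASTER) {
+      // sequential part: decide which parallel region to run next,
+      // or set finished = TRUE when there is no more work
+      cudaThreadsInParallel =
+          __kmpc_kernel_prepare_parallel(/*NumThreads=*/0, /*NumLanes=*/1);
+    }
+    __syncthreads();                         // barrier "A" in the comment above
+    __kmpc_kernel_parallel(/*numLanes=*/1);  // executed by every thread
+    if (GetThreadIdInBlock() < cudaThreadsInParallel) {
+      // switch over the outlined parallel regions would go here
+      __kmpc_kernel_end_parallel();          // only threads in the parallel
+    }
+    __syncthreads();
+  }
+}
+#endif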
+EXTERN void __kmpc_kernel_parallel(int numLanes) +{ + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); + // init work descriptor from workdesccr + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(globalThreadId); + omptarget_nvptx_WorkDescr & workDescr = getMyWorkDescriptor(); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr()); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(globalThreadId, newTaskDescr); + // init private from int value + workDescr.CounterGroup().Init( + omptarget_nvptx_threadPrivateContext->Priv(globalThreadId)); + PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of %d threads\n", + newTaskDescr->ThreadId(), newTaskDescr->NThreads()); + + // each thread sets its omp thread ID when entering a parallel region + // based on the number of simd lanes and its cuda thread ID + if (numLanes > 1) { + // the compiler is requesting lanes for #simd execution + // WARNING: assume thread number in #parallel is a multiple of numLanes + newTaskDescr->ThreadId() /= numLanes; + //newTaskDescr->ThreadsInTeam(); // = newTaskDescr->ThreadsInTeam()/numLanes; + } +// } else { +// // not a for with a simd inside: use only one lane +// // we may have started thread_limit*simd_info CUDA threads +// // and we need to set the number of threads to thread_limit value +// // FIXME: is this always the case, even if numLanes > 1? +//// newTaskDescr->ThreadId() = threadIdx.x; +// //newTaskDescr->ThreadsInTeam();// = newTaskDescr->ThreadLimit(); +// } +} + + + +EXTERN void __kmpc_kernel_end_parallel() +{ + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); + // pop stack + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(globalThreadId); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(globalThreadId, + currTaskDescr->GetPrevTaskDescr()); +} + + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes sequential +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); + + // assume this is only called for nested parallel + int globalThreadId = GetGlobalThreadId(); + + // unlike actual parallel, threads in the same team do not share + // the workTaskDescr in this case and num threads is fixed to 1 + + // get current task + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(globalThreadId); + + // allocate new task descriptor and copy value from current one, set prev to it + omptarget_nvptx_TaskDescr *newTaskDescr = (omptarget_nvptx_TaskDescr *) + SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), (char *) "new seq parallel task"); + newTaskDescr->CopyParent(currTaskDescr); + + // tweak values for serialized parallel case: + // - each thread becomes ID 0 in its serialized parallel, and + // - there is only one thread per team + newTaskDescr->ThreadId() = 0; + newTaskDescr->ThreadsInTeam() = 1; + + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(globalThreadId, newTaskDescr); +} + +EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to 
__kmpc_end_serialized_parallel\n"); + + // pop stack + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(globalThreadId); + // set new top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + globalThreadId, + currTaskDescr->GetPrevTaskDescr()); + // free + SafeFree(currTaskDescr, (char *) "new seq parallel task"); +} + +//////////////////////////////////////////////////////////////////////////////// +// push params +//////////////////////////////////////////////////////////////////////////////// + + +EXTERN void __kmpc_push_num_threads (kmp_Indent * loc, int32_t gtid, + int32_t num_threads) +{ + PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); + // only the team master updates the state + gtid = GetGlobalThreadId(); + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(gtid) = num_threads; +} + +// Do not do nothing: the host guarantees we started the requested number of +// teams and we only need inspection gridDim + +EXTERN void __kmpc_push_num_teams (kmp_Indent * loc, int32_t gtid, + int32_t num_teams, int32_t thread_limit) +{ + PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams); + ASSERT0(LT_FUSSY, FALSE, "should never have anything with new teams on device"); +} + Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -0,0 +1,1247 @@ +//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of reduction with KMPC interface. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +//EXTERN int32_t __gpu__warpBlockRedu_fixed4_add(int32_t); + +EXTERN void omp_reduction_op_gpu(char *, char *); + + +//cannot implement atomic_start and atomic_end for GPU. 
Report runtime error +EXTERN void __kmpc_atomic_start() { + printf("__kmpc_atomic_start not supported\n"); \ + asm("trap;"); \ + return; \ +} + +EXTERN void __kmpc_atomic_end() { + printf("__kmpc_atomic_end not supported\n"); \ + asm("trap;"); \ + return; \ +} + +//EXTERN kmp_ReductFctPtr *gpu_callback = (kmp_ReductFctPtr *)omp_reduction_op_gpu; + +EXTERN +int32_t __gpu_block_reduce(){ + if (omp_get_num_threads() != blockDim.x) + return 0; + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { //assume swapSize is 32 + return 0; + } + return 1; +} + +EXTERN +int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, void *reduce_array_size, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lck) { + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(globalThreadId); + int numthread; + if (currTaskDescr->IsParallelConstruct()) { + numthread = omp_get_num_threads(); + } else { + numthread = omp_get_num_teams(); + } + + if (numthread == 1) + return 1; + else if (!__gpu_block_reduce()) + return 2; + else { + if (threadIdx.x == 0) + return 1; + else + return 0; + } +// return 2; + /** + * Only when all the threads in a block are doing reduction, + * the warpBlockRedu is used. Otherwise atomic. + * check the data type, too. + * A special case: when the size of thread group is one, + * do reduction directly. + **/ + + // Note: this code provokes warning because it follows a "return" + + //since there is no thread interface yet, just infer from the + // result of ballot +#if 0 + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { //assume swapSize is 32 + return 2; + } + +#if 0 + if (threadIdx.x == 0) { + if ((void *)reductFct != (void *)omp_reduction_op_gpu) { + printf("function pointer value is not correct\n"); + } else { + printf("function pointer value is correct\n"); + } + } +#endif + + //printf("function pointer %p %d %p\n", reductFct, reduce_size, omp_reduction_op_gpu); + if (reduce_size == 0) { + (*reductFct)((char*)reduce_data, (char*)reduce_data); + } else { + //omp_reduction_op_gpu((char*)reduce_data, (char*)reduce_data); + (*gpu_callback)((char*)reduce_data, (char*)reduce_data); + } + + //int **myp = (int **) reduce_data; + // the results are with thread 0. 
Reduce to the shared one + if (threadIdx.x == 0) { + //printf("function pointer %p %p\n", reductFct, omp_reduction_op); + // printf("my result %d\n", *myp[0]); + return 1; + } else { + return 0; + } +#endif +} + +EXTERN +int32_t __kmpc_reduce_combined(kmp_Indent *loc) { + if (threadIdx.x == 0){ + return 2; + }else{ + return 0; + } +} + +EXTERN +int32_t __kmpc_reduce41(kmp_Indent *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, void *reduce_array_size, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lck) { + return __kmpc_reduce_gpu(loc, global_tid, num_vars, reduce_size, reduce_data, reduce_array_size, reductFct, lck); +} + +EXTERN +int32_t __kmpc_reduce_nowait41(kmp_Indent *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, void *reduce_array_size, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lck) { + int globalThreadId = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(globalThreadId); + int numthread; + if (currTaskDescr->IsParallelConstruct()) { + numthread = omp_get_num_threads(); + } else { + numthread = omp_get_num_teams(); + } + + if (numthread == 1) + return 1; + else if (!__gpu_block_reduce()) + return 2; + else { + if (threadIdx.x == 0) + return 1; + else + return 0; + } + + // Notice: as above, uncomment if 0 once this code below is ready for shipping +#if 0 + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { //assume swapSize is 32 + return 2; + } + + if (threadIdx.x == 0) { + printf("choose block reduction\n"); + } + + (*reductFct)(reduce_data, reduce_data); + //omp_reduction_op((char*)reduce_data, (char*)reduce_data); + + int **myp = (int **) reduce_data; + // the results are with thread 0. Reduce to the shared one + if (threadIdx.x == 0) { + printf("my result %d\n", *myp[0]); + return 1; + } else { + return 0; + } +#endif +} + +EXTERN +void __kmpc_end_reduce( kmp_Indent *loc, int32_t global_tid, kmp_CriticalName *lck ) { +} + +EXTERN +void __kmpc_end_reduce_nowait( kmp_Indent *loc, int32_t global_tid, kmp_CriticalName *lck ) { +} + + + + +/* implement different data type or operations with atomicCAS + */ + +#define omptarget_nvptx_add(x, y) ((x)+(y)) +#define omptarget_nvptx_sub(x, y) ((x)-(y)) +#define omptarget_nvptx_sub_rev(y, x) ((x)-(y)) +#define omptarget_nvptx_mul(x, y) ((x)*(y)) +#define omptarget_nvptx_div(x, y) ((x)/(y)) +#define omptarget_nvptx_div_rev(y, x) ((x)/(y)) +#define omptarget_nvptx_min(x, y) ((x)>(y)?(y):(x)) +#define omptarget_nvptx_max(x, y) ((x)<(y)?(y):(x)) +#define omptarget_nvptx_andb(x, y) ((x) & (y)) +#define omptarget_nvptx_orb(x, y) ((x) | (y)) +#define omptarget_nvptx_xor(x, y) ((x) ^ (y)) +#define omptarget_nvptx_shl(x,y) ((x) << (y)) +#define omptarget_nvptx_shr(x,y) ((x) >> (y)) +#define omptarget_nvptx_andl(x, y) ((x) && (y)) +#define omptarget_nvptx_orl(x, y) ((x) || (y)) +#define omptarget_nvptx_eqv(x, y) ((x) == (y)) +#define omptarget_nvptx_neqv(x, y) ((x) != (y)) + +#if 0 +// keep for debugging +EXTERN +void __kmpc_atomic_fixed4_add(kmp_Indent *id_ref, int32_t gtid, int32_t * lhs, int32_t rhs) { + //if (gtid < 64) + PRINT(LD_LOOP, "thread %d participating in reduction, lhs = %p, rhs = %d\n", gtid, lhs, rhs); + atomicAdd(lhs, rhs); +} +#endif + + +INLINE __device__ float atomicCAS(float *_addr, float _compare, float _val) { + int *addr = (int *) _addr; + int compare = __float_as_int(_compare); + int val = __float_as_int(_val); + return __int_as_float(atomicCAS(addr, compare, val)); +} + +INLINE __device__ double 
atomicCAS(double *_addr, double _compare, double _val) { + unsigned long long int *addr = (unsigned long long int *) _addr; + unsigned long long int compare = __double_as_longlong(_compare); + unsigned long long int val = __double_as_longlong(_val); + return __longlong_as_double(atomicCAS(addr, compare, val)); +} + +INLINE __device__ long long int atomicCAS(long long int *_addr, long long int _compare, long long int _val) { + unsigned long long int *addr = (unsigned long long int *) _addr; + unsigned long long int compare = (unsigned long long int)(_compare); + unsigned long long int val = (unsigned long long int)(_val); + return (long long int) (atomicCAS(addr, compare, val)); +} + +INLINE __device__ int64_t atomicCAS(int64_t *_addr, int64_t _compare, int64_t _val) { + unsigned long long int *addr = (unsigned long long int *) _addr; + unsigned long long int compare = (unsigned long long int)(_compare); + unsigned long long int val = (unsigned long long int)(_val); + return (int64_t) (atomicCAS(addr, compare, val)); +} + +INLINE __device__ uint64_t atomicCAS(uint64_t *_addr, uint64_t _compare, uint64_t _val) { + unsigned long long int *addr = (unsigned long long int *) _addr; + unsigned long long int compare = (unsigned long long int)(_compare); + unsigned long long int val = (unsigned long long int)(_val); + return (uint64_t) (atomicCAS(addr, compare, val)); +} + +INLINE __device__ float complex atomicCAS(float complex *_addr, float complex _compare, float complex _val) { + double *addr = (double *) _addr; + double compare = (double)(_compare); + double val = (double)(_val); + return (float complex) (atomicCAS(addr, compare, val)); +} + +#define ATOMIC_GENOP_NATIVE(_name, _dtype, _op, _cudaop) \ + EXTERN void __kmpc_atomic_##_name##_##_op\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs) { \ + PRINT(LD_LOOP, "Reduction: thead %d\n", gtid); \ + atomic##_cudaop(lhs, rhs); \ +} \ + \ + EXTERN _dtype __kmpc_atomic_##_name##_##_op##_cpt\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs, int flag) { \ + _dtype old = atomic##_cudaop(lhs, rhs); \ + if (flag) { \ + return omptarget_nvptx_##_op(old, rhs); \ + } else {\ + return old; \ + } \ +} + +/*for types that are supported directly by atomicCAS */ +#define ATOMIC_GENOP_DIRECT(_name, _dtype, _op) \ + EXTERN void __kmpc_atomic_##_name##_##_op\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs) { \ + PRINT(LD_LOOP, "Reduction: thead %d\n", gtid); \ + _dtype *temp_lhs = lhs; \ + _dtype oldvalue = *temp_lhs; \ + _dtype saved ; \ + _dtype newvalue ; \ + do { \ + saved = oldvalue; \ + newvalue = (_dtype)omptarget_nvptx_##_op(saved, rhs); \ + oldvalue = atomicCAS(temp_lhs, saved, newvalue); \ + } while (saved != oldvalue) ;\ +} \ + \ + EXTERN _dtype __kmpc_atomic_##_name##_##_op##_cpt\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs, int flag) { \ + _dtype *temp_lhs = lhs; \ + _dtype oldvalue = *temp_lhs; \ + _dtype saved ; \ + _dtype newvalue ; \ + do { \ + saved = oldvalue; \ + newvalue = (_dtype)omptarget_nvptx_##_op(saved, rhs); \ + oldvalue = atomicCAS(temp_lhs, saved, newvalue); \ + } while (saved != oldvalue) ;\ + if (flag) return newvalue; \ + else return oldvalue ; \ +} + +#define ATOMIC_GENOP_DIRECT_REV(_name, _dtype, _op) \ + EXTERN void __kmpc_atomic_##_name##_##_op##_rev\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs) { \ + _dtype *temp_lhs = lhs; \ + _dtype oldvalue = *temp_lhs; \ + _dtype saved ; \ + _dtype newvalue ; \ + do { \ + saved = oldvalue; \ + newvalue = 
(_dtype)omptarget_nvptx_##_op(rhs, saved); \ + oldvalue = atomicCAS(temp_lhs, saved, newvalue); \ + } while (saved != oldvalue) ;\ +} \ + \ + EXTERN _dtype __kmpc_atomic_##_name##_##_op##_cpt##_rev\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs, int flag) { \ + _dtype *temp_lhs = lhs; \ + _dtype oldvalue = *temp_lhs; \ + _dtype saved ; \ + _dtype newvalue ; \ + do { \ + saved = oldvalue; \ + newvalue = (_dtype)omptarget_nvptx_##_op(rhs, saved); \ + oldvalue = atomicCAS(temp_lhs, saved, newvalue); \ + } while (saved != oldvalue) ;\ + if (flag) return newvalue; \ + else return oldvalue ; \ +} + + +INLINE __device__ void dc_add(double complex *lhs, double complex rhs) { + double *ptrl = (double *)lhs; + double *ptrr = (double *) &rhs; + ptrl[0] += ptrr[0]; + ptrl[1] += ptrr[1]; +} + +INLINE __device__ void dc_sub(double complex *lhs, double complex rhs) { + double *ptrl = (double *)lhs; + double *ptrr = (double *) &rhs; + ptrl[0] -= ptrr[0]; + ptrl[1] -= ptrr[1]; +} + +INLINE __device__ void dc_mul(double complex *lhs, double complex rhs) { + double *ptrl = (double *)lhs; + double *ptrr = (double *) &rhs; + double r1 = ptrl[0], r2 = ptrr[0]; + double i1 = ptrl[1], i2 = ptrr[1]; + ptrl[0] = r1*r2-i1*i2; + ptrl[1] = r1*i2+r2*i1; +} + +INLINE __device__ void dc_div(double complex *lhs, double complex rhs) { + double *ptrl = (double *)lhs; + double *ptrr = (double *) &rhs; + double r1 = ptrl[0], r2 = ptrr[0]; + double i1 = ptrl[1], i2 = ptrr[1]; + ptrl[0] = (r1*r2+i1*i2)/(r2*r2+i2*i2); + ptrl[1] = (i1*r2-r1*i2)/(r2*r2+i2*i2); +} + +#define ATOMIC_GENOP_DC(_op) \ + EXTERN void __kmpc_atomic_cmplx8_##_op\ + (kmp_Indent *id_ref, int32_t gtid, double _Complex * lhs, double _Complex rhs) { \ + printf("double complex atomic opertion not supported\n"); \ + asm("trap;"); \ + return; \ + }\ + EXTERN double _Complex __gpu_warpBlockRedu_cmplx8_##_op(double _Complex rhs) { \ + __shared__ double _Complex lhs; \ + if (threadIdx.x == 0 ) \ + lhs = rhs; \ + __syncthreads(); \ + for (int i= 1; i +INLINE __device__ OpType Compute(OpType a, OpType b) // a is old value, b is new value +{ + OpType res = 0; + if (binop == omptarget_nvptx_inc) res = a + b; + if (binop == omptarget_nvptx_dec) res = a - b; + if (binop == omptarget_nvptx_add) res = a + b; + if (binop == omptarget_nvptx_sub) res = a - b; + if (binop == omptarget_nvptx_sub_rev) res = b - a; + if (binop == omptarget_nvptx_mul) res = a * b; + if (binop == omptarget_nvptx_div) res = a / b; + if (binop == omptarget_nvptx_div_rev) res = b / a; + if (binop == omptarget_nvptx_min) res = a < b ? a : b; + if (binop == omptarget_nvptx_max) res = a > b ? 
a : b; + if (binop == omptarget_nvptx_rd) res = a; // read + if (binop == omptarget_nvptx_wr) res = b; // write and swap are the same + if (binop == omptarget_nvptx_swp) res = b; // write and swap are the same + if (binop == omptarget_nvptx_andb) res = a & b; + if (binop == omptarget_nvptx_orb) res = a | b; + if (binop == omptarget_nvptx_xor) res = a ^ b; + if (binop == omptarget_nvptx_andl) res = a && b; + if (binop == omptarget_nvptx_orl) res = a || b; + if (binop == omptarget_nvptx_eqv) res = a == b; + if (binop == omptarget_nvptx_neqv) res = a != b; + if (binop == omptarget_nvptx_shl) res = a << b; + if (binop == omptarget_nvptx_shl_rev) res = b << a; + if (binop == omptarget_nvptx_shr) res = a >> b; + if (binop == omptarget_nvptx_shr_rev) res = b >> a; + + return res; +} + + +template<> +INLINE __device__ float Compute(float a, float b) +{ + return a+b; +} + +template<> +INLINE __device__ float Compute(float a, float b) +{ + return a-b; +} + +template<> +INLINE __device__ float Compute(float a, float b) +{ + return a*b; +} + +template<> +INLINE __device__ float Compute(float a, float b) +{ + return a/b; +} + +template<> +INLINE __device__ float Compute(float a, float b) +{ + return a +INLINE __device__ float Compute(float a, float b) +{ + return a>b?a:b; +} + +template<> +INLINE __device__ double Compute(double a, double b) +{ + return a+b; +} + +template<> +INLINE __device__ double Compute(double a, double b) +{ + return a-b; +} + +template<> +INLINE __device__ double Compute(double a, double b) +{ + return a*b; +} + +template<> +INLINE __device__ double Compute(double a, double b) +{ + return a/b; +} + +template<> +INLINE __device__ double Compute(double a, double b) +{ + return a +INLINE __device__ double Compute(double a, double b) +{ + return a>b?a:b; +} + +#if 0 +template < + omptarget_nvptx_BINOP_t binop // enum describing the operation +> +INLINE __device__ float Compute(float a, float b) // a is old value, b is new value +{ + OpType res = 0; + if (binop == omptarget_nvptx_add) res = a + b; + if (binop == omptarget_nvptx_sub) res = a - b; + if (binop == omptarget_nvptx_mul) res = a * b; + if (binop == omptarget_nvptx_div) res = a / b; + if (binop == omptarget_nvptx_min) res = a < b ? a : b; + if (binop == omptarget_nvptx_max) res = a > b ? 
a : b; + return res; +} +#endif + + +//////////////////////////////////////////////////////////////////////////////// +// common atomic slicing functions (modifying only a part of a word) +//////////////////////////////////////////////////////////////////////////////// + +template < + typename MemType, // type of the underlying atomic memory operation + typename OpType // type of the operation performed +> +INLINE __device__ void ComputeAtomic_PrepareSlice( + OpType *addr, // original address + MemType **memAddrPtr, // truncated address to MemType boundary + MemType *memBitShiftRightPtr, // bits to shift to move val to rightmost position + MemType *memValMaskInPlacePtr) // mask of val in proper position +{ + //ASSERT(LT_FUSSY, sizeof(OpType) 0x3; long long -> 0x7 + unsigned long memAddrMask = sizeof(MemType) - 1; + // compute the addr of the atomic variable truncated to alignment of memType + *memAddrPtr = (MemType *) + ((unsigned long) addr & ~ memAddrMask); + // compute the number of bit shift to move the target atomic value in + // the rightmost position + unsigned long byteOffsetInMem = (unsigned long) addr & memAddrMask; + + /* for big-endian */ + //unsigned long lastByteOffsetInMem = byteOffsetInMem + sizeof(OpType) -1; +// unsigned long byteShiftRight = (sizeof(MemType) - 1) - lastByteOffsetInMem; + //*memBitShiftRightPtr = (MemType) (byteShiftRight << 3); // 3: byte to bits + + + /* for little-endian */ + unsigned long byteShiftRight = byteOffsetInMem; + *memBitShiftRightPtr = (MemType) (byteShiftRight << 3); // 3: byte to bits + + // mask to isolate target atomic value located in rightmost position + MemType memValMask = ((MemType) 1 << (sizeof(OpType) << 3)) -1; + // mask to isolate target atomic value located in place + *memValMaskInPlacePtr = memValMask << *memBitShiftRightPtr; +/* + printf( + "Atomic of size %d in mem size %d: addr 0x%llx, truncated addr 0x%llx, shift right %lld, mask in place 0x%llx\n", + sizeof(MemType), sizeof(OpType), (uint64_t) addr, (uint64_t) *memAddrPtr, + (uint64_t) *memBitShiftRightPtr, (uint64_t) *memValMaskInPlacePtr); +*/ +} + +template < + typename MemType, // type of the underlying atomic memory operation + typename OpType, // type of the operation performed + omptarget_nvptx_BINOP_t binop // enum describing the operation +> +INLINE __device__ MemType ComputeAtomic_ComputeSlice( + MemType oldMemVal, // old value + OpType val, // value to compute with + MemType memBitShiftRight, // bits to shift to move val to rightmost position + MemType memValMaskInPlace // mask of val in proper position + ) +{ + OpType oldValtmp; + OpType newValtmp; + // select target atomic val + MemType oldMemVal_targetVal = oldMemVal & memValMaskInPlace; + MemType oldMemVal_otherVal = oldMemVal & ~ memValMaskInPlace; + // shift target atomic val to rightmost place: this is the old value + + //type conversion?? + oldValtmp = (OpType) (oldMemVal_targetVal >> memBitShiftRight); + // perform op + + newValtmp = Compute(oldValtmp, val); + + // insert new value in old world mem + + //type conversion?? 
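+  // Editor's note: worked example, not part of the original patch. For the
+  // "fixed1" case (OpType = int8_t, MemType = int32_t) with an address whose
+  // low two bits are 2, ComputeAtomic_PrepareSlice produces (little-endian):
+  //   memAddrMask       = sizeof(int32_t) - 1   = 3
+  //   memAddr           = addr & ~3             (word-aligned address)
+  //   memBitShiftRight  = (addr & 3) << 3       = 16
+  //   memValMaskInPlace = 0xFF << 16            = 0x00FF0000
+  // This function then extracts the byte with (oldMemVal & 0x00FF0000) >> 16,
+  // applies the operation, and re-inserts the result below before the caller
+  // retries the word-wide atomicCAS.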
+ MemType newMemVal_targetVal = ((MemType) newValtmp) << memBitShiftRight; + newMemVal_targetVal &= memValMaskInPlace; + MemType newMemVal = oldMemVal_otherVal | newMemVal_targetVal; + return newMemVal; +} + + + +#define ATOMIC_GENOP_PARTIAL(_name, _dtype, _op, _memType) \ + EXTERN void __kmpc_atomic_##_name##_##_op\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs) { \ + _memType *memAddr ; \ + _memType memBitShiftRightPtr; \ + _memType memValMaskInPlacePtr; \ + ComputeAtomic_PrepareSlice<_memType, _dtype> \ + (lhs, &memAddr, &memBitShiftRightPtr, &memValMaskInPlacePtr); \ + _memType oldMemVal, newMemVal; \ + oldMemVal = *memAddr; \ + _memType savedMemVal; \ + do { \ + savedMemVal = oldMemVal; \ + newMemVal = ComputeAtomic_ComputeSlice <_memType, _dtype, omptarget_nvptx_##_op > \ + (oldMemVal, rhs, memBitShiftRightPtr, memValMaskInPlacePtr); \ + oldMemVal = atomicCAS(memAddr, savedMemVal, newMemVal); \ + } while (savedMemVal != oldMemVal); \ +} \ +\ + EXTERN _dtype __kmpc_atomic_##_name##_##_op##_cpt\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs, int flag) { \ + _memType *memAddr ; \ + _memType memBitShiftRightPtr; \ + _memType memValMaskInPlacePtr; \ + ComputeAtomic_PrepareSlice<_memType, _dtype> \ + (lhs, &memAddr, &memBitShiftRightPtr, &memValMaskInPlacePtr); \ + _memType oldMemVal, newMemVal; \ + oldMemVal = *memAddr; \ + _memType savedMemVal; \ + do { \ + savedMemVal = oldMemVal; \ + newMemVal = ComputeAtomic_ComputeSlice <_memType, _dtype, omptarget_nvptx_##_op > \ + (oldMemVal, rhs, memBitShiftRightPtr, memValMaskInPlacePtr); \ + oldMemVal = atomicCAS(memAddr, savedMemVal, newMemVal); \ + } while (savedMemVal != oldMemVal); \ + if (flag) \ + return (_dtype) ((newMemVal & memValMaskInPlacePtr) >> memBitShiftRightPtr); \ + else \ + return (_dtype) ((oldMemVal & memValMaskInPlacePtr) >> memBitShiftRightPtr); \ +} + + +#define ATOMIC_GENOP_PARTIAL_REV(_name, _dtype, _op, _memType) \ + EXTERN void __kmpc_atomic_##_name##_##_op##_rev\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs) { \ + _memType *memAddr ; \ + _memType memBitShiftRightPtr; \ + _memType memValMaskInPlacePtr; \ + ComputeAtomic_PrepareSlice<_memType, _dtype> \ + (lhs, &memAddr, &memBitShiftRightPtr, &memValMaskInPlacePtr); \ + _memType oldMemVal, newMemVal; \ + oldMemVal = *memAddr; \ + _memType savedMemVal; \ + do { \ + savedMemVal = oldMemVal; \ + newMemVal = ComputeAtomic_ComputeSlice <_memType, _dtype, omptarget_nvptx_##_op > \ + (oldMemVal, rhs, memBitShiftRightPtr, memValMaskInPlacePtr); \ + oldMemVal = atomicCAS(memAddr, savedMemVal, newMemVal); \ + } while (savedMemVal != oldMemVal); \ +} \ +\ + EXTERN _dtype __kmpc_atomic_##_name##_##_op##_cpt_rev\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype rhs, int flag) { \ + _memType *memAddr ; \ + _memType memBitShiftRightPtr; \ + _memType memValMaskInPlacePtr; \ + ComputeAtomic_PrepareSlice<_memType, _dtype> \ + (lhs, &memAddr, &memBitShiftRightPtr, &memValMaskInPlacePtr); \ + _memType oldMemVal, newMemVal; \ + oldMemVal = *memAddr; \ + _memType savedMemVal; \ + do { \ + savedMemVal = oldMemVal; \ + newMemVal = ComputeAtomic_ComputeSlice <_memType, _dtype, omptarget_nvptx_##_op > \ + (oldMemVal, rhs, memBitShiftRightPtr, memValMaskInPlacePtr); \ + oldMemVal = atomicCAS(memAddr, savedMemVal, newMemVal); \ + } while (savedMemVal != oldMemVal); \ + if (flag) \ + return (_dtype) ((newMemVal & memValMaskInPlacePtr) >> memBitShiftRightPtr); \ + else \ + return (_dtype) ((oldMemVal & memValMaskInPlacePtr) >> 
memBitShiftRightPtr); \ +} + + + +#define ATOMIC_GENOP_ALL4(_name, _tname, _optype, _memtype) \ + _name(_tname, _optype, add, _memtype); \ + _name(_tname, _optype, sub, _memtype); \ + _name##_REV(_tname, _optype, sub_rev, _memtype); \ + _name(_tname, _optype, mul, _memtype); \ + _name(_tname, _optype, div, _memtype); \ + _name##_REV(_tname, _optype, div_rev, _memtype); \ + _name(_tname, _optype, min, _memtype); \ + _name(_tname, _optype, max, _memtype); \ + _name(_tname, _optype, andb, _memtype); \ + _name(_tname, _optype, orb, _memtype); \ + _name(_tname, _optype, xor, _memtype); \ + _name(_tname, _optype, andl, _memtype); \ + _name(_tname, _optype, orl, _memtype); \ + _name(_tname, _optype, eqv, _memtype); \ + _name(_tname, _optype, neqv, _memtype); \ + _name(_tname, _optype, shl, _memtype); \ + _name(_tname, _optype, shr, _memtype); + + +ATOMIC_GENOP_ALL4(ATOMIC_GENOP_PARTIAL, fixed1, int8_t, int32_t); +ATOMIC_GENOP_ALL4(ATOMIC_GENOP_PARTIAL, fixed1u, uint8_t, int32_t); +ATOMIC_GENOP_ALL4(ATOMIC_GENOP_PARTIAL, fixed2u, uint16_t, int32_t); +ATOMIC_GENOP_ALL4(ATOMIC_GENOP_PARTIAL, fixed2, int16_t, int32_t); + +/** cooperative reduction + * make use of warp, shared variable, and __syncthreads + **/ + + + + +template +INLINE __device__ T myshfldown(T val, unsigned int delta, int size=warpSize) { + return __shfl_down(val, delta, size); +#if 0 + T ret = 0; + int localv; + int remotev; + /* not finished */ + switch(sizeof(T)) { + case 1: + case 2: + localv = reinterpret_cast(val); + remotev = __shfl_down(localv, delta, size); + ret = reinterpret_cast(remotev); + break; + break; + + } + return ret; +#endif +} + +#if 0 +template<> +INLINE __device__ float myshfldown(float val, unsigned int delta, int size) { + int t = __float_as_int(val); + int t1 = __shfl_down(t, delta, size); + float ret = __int_as_float(t1); + return ret; +} +#endif + +template<> +INLINE __device__ int myshfldown(int val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + +template<> +INLINE __device__ unsigned int myshfldown(unsigned int val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + +template<> +INLINE __device__ int64_t myshfldown(int64_t val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + +template<> +INLINE __device__ uint64_t myshfldown(uint64_t val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + + +template<> +INLINE __device__ float myshfldown(float val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + +template<> +INLINE __device__ double myshfldown(double val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + +template<> +INLINE __device__ unsigned long long myshfldown(unsigned long long val, unsigned int delta, int size) { + return __shfl_down(val, delta, size); +} + + + +template +__inline__ __device__ +T reduInitVal() { + switch(binop) { + case omptarget_nvptx_inc: + case omptarget_nvptx_dec: + case omptarget_nvptx_add: + case omptarget_nvptx_sub: + case omptarget_nvptx_sub_rev: + return (T) 0; + case omptarget_nvptx_mul: + case omptarget_nvptx_div: + return (T) 1; + default: + //ASSERT(0); + return (T) 0; + } +} + +template +__inline__ __device__ +T warpReduceSum(T val, unsigned int size) { + for (int offset = size/2; offset > 0; offset /= 2) + val = Compute(val, myshfldown(val, offset, size)); + return val; +} + +//#define MYGSIZE warpSize +#define MYGSIZE 32 + +template +__inline__ __device__ T warpBlockReduction(T inputval) { + 
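+  // Overview of the block reduction implemented below:
+  //  - If the block has fewer than MYGSIZE threads, every thread writes its
+  //    value to shared memory and thread 0 combines them serially.
+  //  - Otherwise, the trailing "remainder" threads (blockDim.x modulo MYGSIZE)
+  //    seed the shared array and the unused slots are filled with the
+  //    operation's identity value; each full warp then reduces with
+  //    shuffle-downs, lane 0 folds its warp's partial into shared[], and
+  //    warp 0 reduces the per-warp partials.
+  // Only thread 0 returns the reduced value; every other thread returns 0.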
__shared__ T shared[MYGSIZE]; + + unsigned int remainder = blockDim.x & (MYGSIZE-1);; + unsigned int start_r = blockDim.x - remainder; + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + if (blockDim.x < MYGSIZE) { + shared[threadIdx.x] = inputval; + } else { + if (threadIdx.x >= start_r) { + shared[threadIdx.x - start_r] = inputval; + } else if (threadIdx.x < MYGSIZE && threadIdx.x >= remainder) { + shared[threadIdx.x] = reduInitVal(); + } + } + __syncthreads(); + + if (blockDim.x < MYGSIZE) { + if (threadIdx.x == 0) { + T val = shared[0]; + for(unsigned i= 1; i < blockDim.x; i++) { + val = Compute(val, shared[i]); + } + return val; + } + return (T) 0; + } + + if (threadIdx.x < start_r) { + T val = warpReduceSum(inputval, MYGSIZE); + if (lane == 0) { + shared[wid] = Compute(shared[wid], val); + } + } + __syncthreads(); + + if (wid == 0) { + T val = warpReduceSum(shared[threadIdx.x], MYGSIZE); + if (threadIdx.x == 0) { +// printf("inside %d\n", val); + return val; + } + } + return (T) 0; +} + + +#define WARPBLOCK_GENREDU(_name, _dtype, _op) \ + EXTERN _dtype __gpu_warpBlockRedu_##_name##_##_op\ + (_dtype rhs) { \ + return warpBlockReduction<_dtype, omptarget_nvptx_##_op>(rhs); \ +} + +#define WARPBLOCK_GENREDU_ALLOP(_name, _dtype) \ + WARPBLOCK_GENREDU(_name, _dtype, add); \ + WARPBLOCK_GENREDU(_name, _dtype, sub); \ + WARPBLOCK_GENREDU(_name, _dtype, mul); \ + WARPBLOCK_GENREDU(_name, _dtype, div); \ + WARPBLOCK_GENREDU(_name, _dtype, min); \ + WARPBLOCK_GENREDU(_name, _dtype, max); \ + WARPBLOCK_GENREDU(_name, _dtype, andb); \ + WARPBLOCK_GENREDU(_name, _dtype, orb); \ + WARPBLOCK_GENREDU(_name, _dtype, xor); \ + WARPBLOCK_GENREDU(_name, _dtype, andl); \ + WARPBLOCK_GENREDU(_name, _dtype, orl); \ + WARPBLOCK_GENREDU(_name, _dtype, eqv); \ + WARPBLOCK_GENREDU(_name, _dtype, neqv); \ + WARPBLOCK_GENREDU(_name, _dtype, shl); \ + WARPBLOCK_GENREDU(_name, _dtype, shr); + + +WARPBLOCK_GENREDU_ALLOP(fixed1, int8_t); +WARPBLOCK_GENREDU_ALLOP(fixed1u, uint8_t); +WARPBLOCK_GENREDU_ALLOP(fixed2, int16_t); +WARPBLOCK_GENREDU_ALLOP(fixed2u, uint16_t); +WARPBLOCK_GENREDU_ALLOP(fixed4, int32_t); +WARPBLOCK_GENREDU_ALLOP(fixed4u, uint32_t); +WARPBLOCK_GENREDU_ALLOP(fixed8, int64_t); +WARPBLOCK_GENREDU_ALLOP(fixed8u, uint64_t); + +#define WARPBLOCK_GENREDU_ALLOP_F(_name, _dtype) \ + WARPBLOCK_GENREDU(_name, _dtype, add); \ + WARPBLOCK_GENREDU(_name, _dtype, sub); \ + WARPBLOCK_GENREDU(_name, _dtype, mul); \ + WARPBLOCK_GENREDU(_name, _dtype, div); \ + WARPBLOCK_GENREDU(_name, _dtype, min); \ + WARPBLOCK_GENREDU(_name, _dtype, max); +WARPBLOCK_GENREDU_ALLOP_F(float4, float); +WARPBLOCK_GENREDU_ALLOP_F(float8, double); + + +/************************************** +* runtime support for array reduction * +***************************************/ + +#define ARRAYATOMIC_GENOP(_name, _dtype, _op) \ + EXTERN void __array_atomic_##_name##_##_op\ + (kmp_Indent *id_ref, int32_t gtid, _dtype * lhs, _dtype *rhs, int64_t n) { \ + PRINT(LD_LOOP, "Reduction: thead %d\n", gtid); \ + for(int i = 0; i < n/sizeof(_dtype); i++) { \ + __kmpc_atomic_##_name##_##_op(id_ref, gtid, lhs+i, rhs[i]);\ + }\ + }\ + EXTERN void __gpu_array_warpBlockRedu_##_name##_##_op\ + (_dtype *ldata, int64_t n) {\ + for(int i = 0; i < n/sizeof(_dtype); i++) { \ + ldata[i] = __gpu_warpBlockRedu_##_name##_##_op(ldata[i]); \ + } \ +} + +#define ARRAY_GEN_ALLOP_INTEGER(_name, _tname, _optype) \ + _name(_tname, _optype, add) ;\ + _name(_tname, _optype, sub) ;\ + _name(_tname, _optype, mul) ;\ + _name(_tname, _optype, div) ;\ + 
_name(_tname, _optype, min) ;\ + _name(_tname, _optype, max) ;\ + _name(_tname, _optype, andb) ;\ + _name(_tname, _optype, orb) ;\ + _name(_tname, _optype, xor) ;\ + _name(_tname, _optype, shl) ;\ + _name(_tname, _optype, shr) ;\ + _name(_tname, _optype, andl) ;\ + _name(_tname, _optype, orl) ; \ + _name(_tname, _optype, eqv) ; \ + _name(_tname, _optype, neqv) ; \ + +#define ARRAY_GEN_ALLOP_FLOAT(_name, _tname, _optype) \ + _name(_tname, _optype, add) ;\ + _name(_tname, _optype, sub) ;\ + _name(_tname, _optype, mul) ;\ + _name(_tname, _optype, div) ;\ + _name(_tname, _optype, min) ;\ + _name(_tname, _optype, max) ; + + +//ARRAYATOMIC_GENOP(fixed4, int32_t, add); + +#if 1 +ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed1, int8_t); +ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed2, int16_t); +ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed4, int32_t); +ARRAY_GEN_ALLOP_INTEGER(ARRAYATOMIC_GENOP, fixed8, int64_t); +ARRAY_GEN_ALLOP_FLOAT(ARRAYATOMIC_GENOP,float4, float); +ARRAY_GEN_ALLOP_FLOAT(ARRAYATOMIC_GENOP,float8, double); +#endif + Index: libomptarget/deviceRTLs/nvptx/src/stdio.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/stdio.cu @@ -0,0 +1,24 @@ +//===------------- stdio.cu - NVPTX OpenMP Std I/O --------------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Implements standard IO functions. Note that varargs are not supported in +// CUDA, therefore the compiler needs to analyze the arguments passed to +// printf and generate a call to one of the functions defined here. +// +//===----------------------------------------------------------------------===// + + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +EXTERN int __kmpc_printf(const char* str) +{ + return printf(str); +} + + Index: libomptarget/deviceRTLs/nvptx/src/support.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/support.h @@ -0,0 +1,63 @@ +//===--------- support.h - NVPTX OpenMP support functions -------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Wrapper to some functions natively supported by the GPU. 
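+// Covered here: thread/block geometry queries, the mapping from CUDA ids to
+// OpenMP thread/team ids, master checks, and checked wrappers around
+// device-side malloc/free plus an alignment padding helper.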
+// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// get info from machine +//////////////////////////////////////////////////////////////////////////////// + +// get global ids to locate tread/team info (constant regardless of OMP) +INLINE int GetGlobalThreadId(); +INLINE int GetGlobalTeamId(); + +// get global number of ids to size thread/team data structures +INLINE int GetNumberOfGlobalThreadIds(); +INLINE int GetNumberOfGlobalTeamIds(); + +// get OpenMP thread and team ids +INLINE int GetOmpThreadId(int globalThreadId); // omp_thread_num +INLINE int GetOmpTeamId(); // omp_team_num + +// get OpenMP number of threads and team +INLINE int GetNumberOfOmpThreads(int globalThreadId); // omp_num_threads +INLINE int GetNumberOfOmpTeams(); // omp_num_teams + +// get OpenMP number of procs +INLINE int GetNumberOfProcsInTeam(); + +// masters +INLINE int IsTeamMaster(int ompThreadId); +INLINE int IsWarpMaster(int ompThreadId); + + +// get low level ids of resources +INLINE int GetThreadIdInBlock(); +INLINE int GetWarpIdInBlock(); +INLINE int GetBlockIdInKernel(); + +// Get low level number of resources +INLINE int GetNumberOfThreadsInBlock(); +INLINE int GetNumberOfWarpsInBlock(); +INLINE int GetNumberOfBlocksInKernel(); + +//////////////////////////////////////////////////////////////////////////////// +// Memory +//////////////////////////////////////////////////////////////////////////////// + + +// safe alloc and free +INLINE void *SafeMalloc(size_t size, const char *msg); // check if success +INLINE void *SafeFree(void *ptr, const char *msg); +// pad to a alignment (power of 2 only) +INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment); +#define ADD_BYTES(_addr, _bytes) ((void *)((char *)((void *)(_addr))+(_bytes))) +#define SUB_BYTES(_addr, _bytes) ((void *)((char *)((void *)(_addr))-(_bytes))) Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -0,0 +1,192 @@ +//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Wrapper implementation to some functions natively supported by the GPU. 
+// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// support: get info from machine +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// machine: get number of (assuming 1D layout) + +INLINE int GetNumberOfThreadsInBlock() +{ + return blockDim.x; +} + +INLINE int GetNumberOfWarpsInBlock() +{ + ASSERT(LT_FUSSY, GetNumberOfThreadsInBlock() % warpSize == 0, + "expected threads num %d to be a multiple of warp size %d\n", + GetNumberOfThreadsInBlock(), warpSize); + return GetNumberOfThreadsInBlock() / warpSize; +} + +INLINE int GetNumberOfBlocksInKernel() +{ + return gridDim.x; +} + + +//////////////////////////////////////////////////////////////////////////////// +// machine: get ids (assuming 1D layout) + +INLINE int GetThreadIdInBlock() +{ + return threadIdx.x; +} + +INLINE int GetWarpIdInBlock() +{ + ASSERT(LT_FUSSY, GetNumberOfThreadsInBlock() % warpSize == 0, + "expected threads num %d to be a multiple of warp size %d\n", + GetNumberOfThreadsInBlock(), warpSize); + return GetThreadIdInBlock() / warpSize; +} + +INLINE int GetBlockIdInKernel() +{ + return blockIdx.x; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Global thread id used to locate thread info + +INLINE int GetGlobalThreadId() +{ + #ifdef OMPTHREAD_IS_WARP + return GetBlockIdInKernel() * GetNumberOfWarpsInBlock() + GetWarpIdInBlock(); + #else + return GetBlockIdInKernel() * GetNumberOfThreadsInBlock() + GetThreadIdInBlock(); + #endif +} + +INLINE int GetNumberOfGlobalThreadIds() +{ + #ifdef OMPTHREAD_IS_WARP + return GetNumberOfWarpsInBlock() * GetNumberOfBlockInKernel(); + #else + return GetNumberOfThreadsInBlock() * GetNumberOfBlocksInKernel(); + #endif +} + +//////////////////////////////////////////////////////////////////////////////// +// global team id used to locate team info + +INLINE int GetGlobalTeamId() +{ + return GetBlockIdInKernel(); +} + +INLINE int GetNumberOfGlobalTeamIds() +{ + return GetNumberOfBlocksInKernel(); +} + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP Thread id linked to OpenMP + +INLINE int GetOmpThreadId(int globalThreadId) +{ + // omp_thread_num + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(globalThreadId); + int rc = currTaskDescr->ThreadId(); + return rc; +} + +INLINE int GetNumberOfOmpThreads(int globalThreadId) +{ + // omp_num_threads + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(globalThreadId); + + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + + int rc = currTaskDescr->ThreadsInTeam(); + return rc; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Team id linked to OpenMP + +INLINE int GetOmpTeamId() +{ + // omp_team_num + return GetGlobalTeamId(); // assume 1 block per team +} + +INLINE int GetNumberOfOmpTeams() +{ + // omp_num_teams + return GetNumberOfGlobalTeamIds(); // assume 1 block per team +} + + +//////////////////////////////////////////////////////////////////////////////// +// get OpenMP number of procs + +INLINE int GetNumberOfProcsInTeam() +{ + #ifdef OMPTHREAD_IS_WARP + return GetNumberOfWarpsInBlock(); + #else + return GetNumberOfThreadsInBlock(); + #endif +} + + 
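+// Illustrative example (assuming the default build, where OMPTHREAD_IS_WARP is
+// not defined): with gridDim.x == 2 and blockDim.x == 128, the thread at
+// blockIdx.x == 1, threadIdx.x == 5 gets
+// GetGlobalThreadId() == 1 * 128 + 5 == 133, and
+// GetNumberOfGlobalThreadIds() == 2 * 128 == 256.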
+//////////////////////////////////////////////////////////////////////////////// +// Masters + +INLINE int IsTeamMaster(int ompThreadId) +{ + return (ompThreadId == 0); +} + +INLINE int IsWarpMaster(int ompThreadId) +{ + return (ompThreadId % warpSize == 0); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Memory +//////////////////////////////////////////////////////////////////////////////// + +INLINE unsigned long PadBytes( + unsigned long size, + unsigned long alignment) // must be a power of 2 +{ + // compute the necessary padding to satify alignment constraint + ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0, + "alignment %ld is not a power of 2\n", alignment); + return (~(unsigned long) size + 1) & (alignment - 1); +} + +INLINE void *SafeMalloc(size_t size, const char *msg) // check if success +{ + void * ptr = malloc(size); + PRINT(LD_MEM, "malloc data of size %d for %s: 0x%llx\n", size, msg, + P64(ptr)); + ASSERT(LT_SAFETY, ptr, "failed to allocate %d bytes for %s\n", size, msg); + return ptr; +} + +INLINE void *SafeFree(void *ptr, const char *msg) +{ + PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", P64(ptr), msg); + free(ptr); + return NULL; +} Index: libomptarget/deviceRTLs/nvptx/src/sync.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -0,0 +1,109 @@ +//===------------ sync.h - NVPTX OpenMP synchronizations --------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Include all synchronization. +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// KMP Ordered calls +//////////////////////////////////////////////////////////////////////////////// + + +EXTERN void __kmpc_ordered (kmp_Indent * loc, int32_t gtid) +{ + PRINT0(LD_IO, "call kmpc_ordered\n"); +} + + +EXTERN void __kmpc_end_ordered (kmp_Indent * loc, int32_t gtid) +{ + PRINT0(LD_IO, "call kmpc_end_ordered\n"); +} + + + +//////////////////////////////////////////////////////////////////////////////// +// KMP Barriers +//////////////////////////////////////////////////////////////////////////////// + +// a team is a block: we can use CUDA native synchronization mechanism +// FIXME: what if not all threads (warps) participate to the barrier? 
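+// (__syncthreads() assumes every thread of the block reaches the barrier; if
+// some warps have already exited or branch around it, the kernel can hang.)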
+// We may need to implement it differently + +EXTERN int32_t __kmpc_cancel_barrier (kmp_Indent* loc_ref, int32_t gtid) +{ + PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); + __syncthreads(); + PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); + return 0; +} + +// aee this one shoud be discontinued +EXTERN void __kmpc_barrier (kmp_Indent* loc_ref, int32_t gtid) +{ + PRINT0(LD_IO, "call kmpc_barrier\n"); + __syncthreads(); + PRINT0(LD_SYNC, "completed kmpc_barrier\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP MASTER +//////////////////////////////////////////////////////////////////////////////// + +INLINE int32_t IsMaster() +{ + // only the team master updates the state + int gtid = GetGlobalThreadId(); + int ompThreadId = GetOmpThreadId(gtid); + return IsTeamMaster(ompThreadId); +} + +EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid) +{ + PRINT0(LD_IO, "call kmpc_master\n"); + return IsMaster(); +} + +EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid) +{ + PRINT0(LD_IO, "call kmpc_end_master\n"); + ASSERT0(LT_FUSSY, IsMaster(), "expected only master here"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP SINGLE +//////////////////////////////////////////////////////////////////////////////// + +EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid) +{ + PRINT0(LD_IO, "call kmpc_single\n"); + // decide to implement single with master; master get the single + return IsMaster(); +} + +EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid) +{ + PRINT0(LD_IO, "call kmpc_end_single\n"); + // decide to implement single with master: master get the single + ASSERT0(LT_FUSSY, IsMaster(), "expected only master here"); + // sync barrier is explicitely called... so that is not a problem +} + +//////////////////////////////////////////////////////////////////////////////// +// Flush +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_flush(kmp_Indent *loc) +{ + PRINT0(LD_IO, "call kmpc_flush\n"); + // not aware of anything to do for flush +} Index: libomptarget/deviceRTLs/nvptx/src/task.cu =================================================================== --- /dev/null +++ libomptarget/deviceRTLs/nvptx/src/task.cu @@ -0,0 +1,184 @@ +//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Task implementation support. +// +// explicit task structure uses +// omptarget_nvptx task +// kmp_task +// +// where kmp_task is +// - klegacy_TaskDescr <- task pointer +// shared -> X +// routine +// part_id +// descr +// - private (of size given by task_alloc call). Accessed by +// task+sizeof(klegacy_TaskDescr) +// * private data * +// - shared: X. 
Accessed by shared ptr in klegacy_TaskDescr +// * pointer table to shared variables * +// - end +// +//===----------------------------------------------------------------------===// + +#include "../../../deviceRTLs/nvptx/src/omptarget-nvptx.h" + +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( + kmp_Indent *loc, // unused + uint32_t global_tid, // unused + int32_t flag, // unused (because in our impl, all are immediately exec + size_t sizeOfTaskInclPrivate, + size_t sizeOfSharedTable, + kmp_TaskFctPtr taskSub) +{ + PRINT(LD_IO, + "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, fct 0x%llx)\n", + P64(sizeOfTaskInclPrivate), P64(sizeOfSharedTable), P64(taskSub)); + // want task+priv to be a multiple of 8 bytes + size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void*)); + sizeOfTaskInclPrivate += padForTaskInclPriv; + size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; + ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, + "need task descr of size %d to be a multiple of %d\n", + sizeof(omptarget_nvptx_TaskDescr), sizeof(void *)); + size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *) + SafeMalloc(totSize, "explicit task descriptor"); + kmp_TaskDescr *newKmpTaskDescr = & newExplicitTaskDescr->kmpTaskDescr; + ASSERT0(LT_FUSSY, (uint64_t) newKmpTaskDescr == + (uint64_t) ADD_BYTES(newExplicitTaskDescr, sizeof(omptarget_nvptx_TaskDescr)), + "bad size assumptions"); + // init kmp_TaskDescr + newKmpTaskDescr->sharedPointerTable = + (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate); + newKmpTaskDescr->sub = taskSub; + newKmpTaskDescr->destructors = NULL; + PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", + P64(newKmpTaskDescr), P64(newExplicitTaskDescr)); + + return newKmpTaskDescr; +} + +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr, + int32_t depNum, void * depList, int32_t noAliasDepNum, + void * noAliasDepList) +{ + PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *) + SUB_BYTES(newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, & newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = & newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t) newTaskDescr == (uint64_t) newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int gtid = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(gtid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(gtid, newTaskDescr); + + // 3. call sub + PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", + P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr)); + newKmpTaskDescr->sub(0, newKmpTaskDescr); + PRINT(LD_TASK, "return from call task sub 0x%llx()\n", + P64(newKmpTaskDescr->sub)); + + // 4. pop context + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(gtid, parentTaskDescr); + // 5. 
free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); + return 0; +} + +EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) +{ + PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *) + SUB_BYTES(newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, & newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = & newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t) newTaskDescr == (uint64_t) newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int gtid = GetGlobalThreadId(); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(gtid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(gtid, newTaskDescr); + // 3... noting to call... is inline + // 4 & 5 ... done in complete +} + +EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, + uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) +{ + PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", + P64(newKmpTaskDescr)); + // 1. get explict task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *) + SUB_BYTES(newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, & newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = & newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t) newTaskDescr == (uint64_t) newExplicitTaskDescr, + "bad assumptions"); + // 2. get parent + omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); + // 3... noting to call... is inline + // 4. pop context + int gtid = GetGlobalThreadId(); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(gtid, parentTaskDescr); + // 5. 
free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); + +} + +EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, + uint32_t global_tid, int32_t depNum, void * depList, + int32_t noAliasDepNum, void * noAliasDepList) +{ + PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to __kmpc_taskyield()\n"); + // do nothing +} + +EXTERN void __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid) +{ + PRINT0(LD_IO, "call to __kmpc_taskwait()\n"); + // nothing to do as all our tasks are executed as final +} Index: libomptarget/exports =================================================================== --- /dev/null +++ libomptarget/exports @@ -0,0 +1,18 @@ +VERS1.0 { + global: + __tgt_register_lib; + __tgt_unregister_lib; + __tgt_target_data_begin; + __tgt_target_data_end; + __tgt_target_data_update; + __tgt_target; + __tgt_target_teams; + __tgt_target_data_begin_nowait; + __tgt_target_data_end_nowait; + __tgt_target_data_update_nowait; + __tgt_target_nowait; + __tgt_target_teams_nowait; + local: + *; +}; + Index: libomptarget/plugins/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/plugins/CMakeLists.txt @@ -0,0 +1,59 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build plugins for the user system if available. +# +##===----------------------------------------------------------------------===## + +# void build_generic64(string tmachine, string tmachine_name, string tmachine_libname); +# - build a plugin for a generic 64-bit target based on libffi. +# - tmachine: name of the machine processor as used in the cmake build system. +# - tmachine_name: name of the machine to be printed with the debug messages. +# - tmachine_libname: machine name to be appended to the plugin library name. +macro(build_generic64 tmachine tmachine_name tmachine_libname tmachine_triple) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + if(LIBOMPTARGET_DEP_LIBFFI_FOUND) + + libomptarget_say("Building ${tmachine_name} offloading plugin.") + + include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + add_library("omptarget.rtl.${tmachine_libname}" SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-64bit/src/rtl.cpp) + + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + dl + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + + # Report to the parent scope that we are building a plugin. 
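+  # build_generic64 is expected to be expanded from each generic plugin
+  # subdirectory (ppc64, ppc64le, x86_64), so PARENT_SCOPE propagates the
+  # updated triple list back to this directory, which re-exports it to the
+  # top-level project at the end of this file.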
+ set(LIBOMPTARGET_SYSTEM_TARGETS + "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) + + else(LIBOMPTARGET_DEP_LIBFFI_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") + endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) +else() + libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.") +endif() +endmacro() + +add_subdirectory(cuda) +add_subdirectory(ppc64) +add_subdirectory(ppc64le) +add_subdirectory(x86_64) + +# Make sure the parent scope can see the plugins that will be created. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) + Index: libomptarget/plugins/cuda/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/plugins/cuda/CMakeLists.txt @@ -0,0 +1,42 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a CUDA machine if available. +# +##===----------------------------------------------------------------------===## + +if(LIBOMPTARGET_DEP_CUDA_FOUND) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux") + + libomptarget_say("Building CUDA offloading plugin.") + + # Define the suffix for the runtime messaging dumps. + add_definitions(-DTARGET_NAME=CUDA) + + if(CMAKE_BUILD_TYPE MATCHES Debug) + add_definitions(-DCUDA_ERROR_REPORT) + endif() + + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) + + add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) + target_link_libraries(omptarget.rtl.cuda + ${LIBOMPTARGET_DEP_CUDA_LIBRARIES} + cuda + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + + # Report to the parent scope that we are building a plugin for CUDA. + set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE) + else() + libomptarget_say("Not building CUDA offloading plugin: only support CUDA in linux x86_64 or ppc64le hosts.") + endif() +else() + libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.") +endif() Index: libomptarget/plugins/cuda/src/rtl.cpp =================================================================== --- /dev/null +++ libomptarget/plugins/cuda/src/rtl.cpp @@ -0,0 +1,600 @@ +//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// RTL for CUDA machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "omptarget.h" + +#ifndef TARGET_NAME +#define TARGET_NAME Generic-64bit +#endif + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) 
DEBUGP("Target " GETNAME(TARGET_NAME) " RTL",__VA_ARGS__) + +// Utility for retrieving and printing CUDA error string +#ifdef CUDA_ERROR_REPORT +#define CUDA_ERR_STRING(err) \ + do { \ + const char* errStr; \ + cuGetErrorString (err, &errStr); \ + DP("CUDA error is: %s\n", errStr); \ + } while (0) +#else +#define CUDA_ERR_STRING(err) {} +#endif + +// NVPTX image start encodes a struct that also includes the host entries begin +// and end pointers. The host entries are used by the runtime to accelerate +// the retrieval of the target entry pointers +struct __tgt_nvptx_device_image_start{ + void *RealStart; // Pointer to actual NVPTX elf image + char *TgtName; // Name of the target of the image + __tgt_offload_entry *HostStart; // Pointer to the host entries start + __tgt_offload_entry *HostEnd; // Pointer to the host entries end +}; + +/// Account the memory allocated per device +struct AllocMemEntryTy{ + int64_t TotalSize; + std::vector > Ptrs; + + AllocMemEntryTy() : TotalSize(0) {} +}; + +/// Keep entries table per device +struct FuncOrGblEntryTy{ + __tgt_target_table Table; + std::vector<__tgt_offload_entry> Entries; +}; + +/// Use a single entity to encode a kernel and a set of flags +struct KernelTy{ + CUfunction Func; + int SimdInfo; + + // keep track of cuda pointer to write to it when thread_limit value + // changes (check against last value written to ThreadLimit + CUdeviceptr ThreadLimitPtr; + int ThreadLimit; + + KernelTy(CUfunction _Func, int _SimdInfo, CUdeviceptr _ThreadLimitPtr) + : Func(_Func), SimdInfo(_SimdInfo), ThreadLimitPtr(_ThreadLimitPtr) { + ThreadLimit = 0; //default (0) signals that it was not initialized + }; +}; + +/// List that contains all the kernels. +/// FIXME: we may need this to be per device and per library. +std::list KernelsList; + +/// Class containing all the device information +class RTLDeviceInfoTy{ + std::vector FuncGblEntries; + +public: + int NumberOfDevices; + std::vector Modules; + std::vector Contexts; + std::vector ThreadsPerBlock; + std::vector BlocksPerGrid; + + // Record entry point associated with device + void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry ){ + assert( device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + + E.Entries.push_back(entry); + } + + // Return true if the entry is associated with device + bool findOffloadEntry(int32_t device_id, void *addr){ + assert( device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + + for(unsigned i=0; iImageStart); + err = cuModuleLoadDataEx (&cumod, image->ImageStart, 0, NULL, NULL); + if (err != CUDA_SUCCESS) + { + DP ("Error when loading CUDA module\n"); + CUDA_ERR_STRING (err); + return NULL; + } + + DP ("CUDA module successfully loaded!\n"); + DeviceInfo.Modules.push_back(cumod); + + // Here, we take advantage of the data that is appended after img_end to get + // the symbols' name we need to load. This data consist of the host entries + // begin and end as well as the target name (see the offloading linker script + // creation in clang compiler). + // Find the symbols in the module by name. 
The name can be obtain by + // concatenating the host entry name with the target name + + __tgt_offload_entry *HostBegin = image->EntriesBegin; + __tgt_offload_entry *HostEnd = image->EntriesEnd; + + for( __tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { + + if( !e->addr ){ + // FIXME: Probably we should fail when something like this happen, the + // host should have always something in the address to uniquely identify + // the target region. + DP("Analyzing host entry '' (size = %lld)...\n", + (unsigned long long)e->size); + + __tgt_offload_entry entry = *e; + DeviceInfo.addOffloadEntry(device_id, entry); + continue; + } + + if( e->size ){ + + __tgt_offload_entry entry = *e; + + CUdeviceptr cuptr; + size_t cusize; + err = cuModuleGetGlobal(&cuptr,&cusize,cumod,e->name); + + if (err != CUDA_SUCCESS){ + DP("loading global '%s' (Failed)\n",e->name); + CUDA_ERR_STRING (err); + return NULL; + } + + if ((int32_t)cusize != e->size){ + DP("loading global '%s' - size mismatch (%lld != %lld)\n",e->name, + (unsigned long long)cusize, + (unsigned long long)e->size); + CUDA_ERR_STRING (err); + return NULL; + } + + DP("Entry point %ld maps to global %s (%016lx)\n",e-HostBegin,e->name,(long)cuptr); + entry.addr = (void*)cuptr; + + DeviceInfo.addOffloadEntry(device_id, entry); + + continue; + } + + CUfunction fun; + err = cuModuleGetFunction (&fun, cumod, e->name); + + if (err != CUDA_SUCCESS){ + DP("loading '%s' (Failed)\n",e->name); + CUDA_ERR_STRING (err); + return NULL; + } + + DP("Entry point %ld maps to %s (%016lx)\n",e-HostBegin,e->name,(Elf64_Addr)fun); + + // default value + int8_t SimdInfoVal = 1; + + // obtain and save simd_info value for target region + const char suffix[] = "_simd_info"; + char * SimdInfoName = (char *) malloc((strlen(e->name)+strlen(suffix))* + sizeof(char)); + sprintf(SimdInfoName, "%s%s", e->name, suffix); + + CUdeviceptr SimdInfoPtr; + size_t cusize; + err = cuModuleGetGlobal(&SimdInfoPtr,&cusize,cumod,SimdInfoName); + if (err == CUDA_SUCCESS) { + if ((int32_t)cusize != sizeof(int8_t)){ + DP("loading global simd_info '%s' - size mismatch (%lld != %lld)\n", SimdInfoName, (unsigned long long)cusize,(unsigned long long)sizeof(int8_t)); + CUDA_ERR_STRING (err); + return NULL; + } + + err = cuMemcpyDtoH(&SimdInfoVal,(CUdeviceptr)SimdInfoPtr,cusize); + if (err != CUDA_SUCCESS) + { + DP("Error when copying data from device to host. 
Pointers: " + "host = 0x%016lx, device = 0x%016lx, size = %lld\n",(Elf64_Addr)&SimdInfoVal, (Elf64_Addr)SimdInfoPtr,(unsigned long long)cusize); + CUDA_ERR_STRING (err); + return NULL; + } + if (SimdInfoVal < 1) { + DP("Error wrong simd_info value specified in cubin file: %d\n", SimdInfoVal); + return NULL; + } + } + + // obtain cuda pointer to global tracking thread limit + const char SuffixTL[] = "_thread_limit"; + char * ThreadLimitName = (char *) malloc((strlen(e->name)+strlen(SuffixTL))* + sizeof(char)); + sprintf(ThreadLimitName, "%s%s", e->name, SuffixTL); + + CUdeviceptr ThreadLimitPtr; + err = cuModuleGetGlobal(&ThreadLimitPtr,&cusize,cumod,ThreadLimitName); + if (err != CUDA_SUCCESS) { + DP("retrieving pointer for %s global\n", ThreadLimitName); + CUDA_ERR_STRING (err); + return NULL; + } + if ((int32_t)cusize != sizeof(int32_t)) { + DP("loading global thread_limit '%s' - size mismatch (%lld != %lld)\n", ThreadLimitName, (unsigned long long)cusize,(unsigned long long)sizeof(int32_t)); + CUDA_ERR_STRING (err); + return NULL; + } + + // encode function and kernel + KernelsList.push_back(KernelTy(fun, SimdInfoVal, ThreadLimitPtr)); + + __tgt_offload_entry entry = *e; + entry.addr = (void*)&KernelsList.back(); + DeviceInfo.addOffloadEntry(device_id, entry); + } + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size){ + + //Set the context we are using + CUresult err = cuCtxSetCurrent (DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) + { + DP("Error while trying to set CUDA current context\n"); + CUDA_ERR_STRING (err); + return NULL; + } + + CUdeviceptr ptr; + err = cuMemAlloc(&ptr, size); + if (err != CUDA_SUCCESS) + { + DP("Error while trying to allocate %d\n", err); + CUDA_ERR_STRING (err); + return NULL; + } + + void *vptr = (void*) ptr; + return vptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, int64_t size){ + //Set the context we are using + CUresult err = cuCtxSetCurrent (DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) + { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + + err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); + if (err != CUDA_SUCCESS) + { + DP("Error when copying data from host to device. Pointers: " + "host = 0x%016lx, device = 0x%016lx, size = %lld\n", + (Elf64_Addr)hst_ptr, (Elf64_Addr)tgt_ptr, (unsigned long long)size); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, int64_t size){ + //Set the context we are using + CUresult err = cuCtxSetCurrent (DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) + { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + + err = cuMemcpyDtoH(hst_ptr,(CUdeviceptr)tgt_ptr,size); + if (err != CUDA_SUCCESS) + { + DP("Error when copying data from device to host. 
Pointers: " + "host = 0x%016lx, device = 0x%016lx, size = %lld\n", + (Elf64_Addr)hst_ptr, (Elf64_Addr)tgt_ptr, (unsigned long long)size); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void* tgt_ptr){ + //Set the context we are using + CUresult err = cuCtxSetCurrent (DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) + { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + + err = cuMemFree((CUdeviceptr)tgt_ptr); + if (err != CUDA_SUCCESS) + { + DP("Error when freeing CUDA memory\n"); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, + void *tgt_entry_ptr, void **tgt_args, int32_t arg_num, + int32_t team_num, int32_t thread_limit) +{ + //Set the context we are using + CUresult err = cuCtxSetCurrent (DeviceInfo.Contexts[device_id]); + if (err != CUDA_SUCCESS) + { + DP("Error when setting CUDA context\n"); + CUDA_ERR_STRING (err); + return OFFLOAD_FAIL; + } + + // All args are references + std::vector args(arg_num); + + for(int32_t i=0; iSimdInfo > DeviceInfo.ThreadsPerBlock[device_id])? + DeviceInfo.ThreadsPerBlock[device_id] : + thread_limit*KernelInfo->SimdInfo; + + // update thread limit content in gpu memory if un-initialized or changed + if (KernelInfo->ThreadLimit == 0 || KernelInfo->ThreadLimit != thread_limit) { + // always capped by maximum number of threads in a block: even if 1 OMP thread + // is 1 independent CUDA thread, we may have up to max block size OMP threads + // if the user request thread_limit(tl) with tl > max block size, we + // only start max block size CUDA threads + if (thread_limit > DeviceInfo.ThreadsPerBlock[device_id]) + thread_limit = DeviceInfo.ThreadsPerBlock[device_id]; + + KernelInfo->ThreadLimit = thread_limit; + err = cuMemcpyHtoD(KernelInfo->ThreadLimitPtr,&thread_limit,sizeof(int32_t)); + + if (err != CUDA_SUCCESS) { + DP("Error when setting thread limit global\n"); + return OFFLOAD_FAIL; + } + } + + int blocksPerGrid = team_num>0 ? team_num : + DeviceInfo.BlocksPerGrid[device_id]; + int nshared = 0; + + // Run on the device + DP("launch kernel with %d blocks and %d threads\n", blocksPerGrid, cudaThreadsPerBlock); + + err = cuLaunchKernel(KernelInfo->Func, + blocksPerGrid, 1, 1, cudaThreadsPerBlock, 1, 1, nshared, 0, &args[0], 0); + if( err != CUDA_SUCCESS ) + { + DP("Device kernel launching failed!\n"); + CUDA_ERR_STRING (err); + assert(err == CUDA_SUCCESS && "Unable to launch target execution!" 
); + return OFFLOAD_FAIL; + } + + DP("Execution of entry point at %016lx successful!\n", + (Elf64_Addr)tgt_entry_ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, int32_t arg_num) +{ + // use one team and one thread + // fix thread num + int32_t team_num = 1; + int32_t thread_limit = 0; // use default + return __tgt_rtl_run_target_team_region(device_id, + tgt_entry_ptr, tgt_args, arg_num, team_num, thread_limit); +} + + +#ifdef __cplusplus +} +#endif Index: libomptarget/plugins/exports =================================================================== --- /dev/null +++ libomptarget/plugins/exports @@ -0,0 +1,15 @@ +VERS1.0 { + global: + __tgt_rtl_device_type; + __tgt_rtl_number_of_devices; + __tgt_rtl_init_device; + __tgt_rtl_load_binary; + __tgt_rtl_data_alloc; + __tgt_rtl_data_submit; + __tgt_rtl_data_retrieve; + __tgt_rtl_data_delete; + __tgt_rtl_run_target_team_region; + __tgt_rtl_run_target_region; + local: + *; +}; Index: libomptarget/plugins/generic-64bit/src/rtl.cpp =================================================================== --- /dev/null +++ libomptarget/plugins/generic-64bit/src/rtl.cpp @@ -0,0 +1,314 @@ +//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// RTL for generic 64-bit machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "omptarget.h" + +#ifndef TARGET_NAME +#define TARGET_NAME Generic-64bit +#endif + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) 
DEBUGP("Target " GETNAME(TARGET_NAME) " RTL",__VA_ARGS__) + +#define NUMBER_OF_DEVICES 4 +#define OFFLOADSECTIONNAME ".omp_offloading.entries" + +/// Array of Dynamic libraries loaded for this target +struct DynLibTy{ + char *FileName; + void* Handle; +}; + +/// Account the memory allocated per device +struct AllocMemEntryTy{ + int64_t TotalSize; + std::vector > Ptrs; + + AllocMemEntryTy() : TotalSize(0) {} +}; + +/// Keep entries table per device +struct FuncOrGblEntryTy{ + __tgt_target_table Table; +}; + +/// Class containing all the device information +class RTLDeviceInfoTy{ + std::vector FuncGblEntries; + +public: + std::list DynLibs; + + // Record entry point associated with device + void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin, __tgt_offload_entry *end){ + assert( device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + + E.Table.EntriesBegin = begin; + E.Table.EntriesEnd = end; + } + + // Return true if the entry is associated with device + bool findOffloadEntry(int32_t device_id, void *addr){ + assert( device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + + + + for(__tgt_offload_entry *i= E.Table.EntriesBegin, + *e= E.Table.EntriesEnd; iaddr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table + __tgt_target_table *getOffloadEntriesTable(int32_t device_id){ + assert( device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + + return &E.Table; + } + + RTLDeviceInfoTy(int32_t num_devices){ + FuncGblEntries.resize(num_devices); + } + + ~RTLDeviceInfoTy(){ + // Close dynamic libraries + for(std::list::iterator + ii = DynLibs.begin(), ie = DynLibs.begin(); ii!=ie; ++ii) + if(ii->Handle){ + dlclose(ii->Handle); + remove(ii->FileName); + } + } +}; + +static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES); + + +#ifdef __cplusplus +extern "C" { +#endif + +int __tgt_rtl_device_type(int32_t device_id){ + + if( device_id < NUMBER_OF_DEVICES) + return 21; // EM_PPC64 + + return 0; +} + +int __tgt_rtl_number_of_devices(){ + return NUMBER_OF_DEVICES; +} + +int32_t __tgt_rtl_init_device(int32_t device_id){ + return OFFLOAD_SUCCESS; // success +} + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, __tgt_device_image *image){ + + DP("Dev %d: load binary from 0x%llx image\n", device_id, + (long long)image->ImageStart); + + assert(device_id>=0 && device_idImageEnd - (size_t)image->ImageStart; + size_t NumEntries = (size_t) (image->EntriesEnd - image->EntriesBegin); + DP("Expecting to have %ld entries defined.\n", (long)NumEntries); + + // We do not need to set the ELF version because the caller of this function + // had to do that to decide the right runtime to use + + //Obtain elf handler + Elf *e = elf_memory ((char*)image->ImageStart, ImageSize); + if(!e){ + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return NULL; + } + + if( elf_kind(e) != ELF_K_ELF){ + DP("Invalid Elf kind!\n"); + elf_end(e); + return NULL; + } + + //Find the entries section offset + Elf_Scn *section = 0; + Elf64_Off entries_offset = 0; + + size_t shstrndx; + + if (elf_getshdrstrndx (e , &shstrndx )) { + DP("Unable to get ELF strings index!\n"); + elf_end(e); + return NULL; + } + + while ((section = elf_nextscn(e,section))) { + GElf_Shdr hdr; + gelf_getshdr(section, &hdr); + + if 
(!strcmp(elf_strptr(e,shstrndx,hdr.sh_name),OFFLOADSECTIONNAME)){ + entries_offset = hdr.sh_addr; + break; + } + } + + if (!entries_offset) { + DP("Entries Section Offset Not Found\n"); + elf_end(e); + return NULL; + } + + DP("Offset of entries section is (%016lx).\n", entries_offset); + + // load dynamic library and get the entry points. We use the dl library + // to do the loading of the library, but we could do it directly to avoid the + // dump to the temporary file. + // + // 1) Create tmp file with the library contents + // 2) Use dlopen to load the file and dlsym to retrieve the symbols + char tmp_name[] = "/tmp/tmpfile_XXXXXX"; + int tmp_fd = mkstemp (tmp_name); + + if( tmp_fd == -1 ){ + elf_end(e); + return NULL; + } + + FILE *ftmp = fdopen(tmp_fd, "wb"); + + if( !ftmp ){ + elf_end(e); + return NULL; + } + + fwrite(image->ImageStart,ImageSize,1,ftmp); + fclose(ftmp); + + DynLibTy Lib = { tmp_name, dlopen(tmp_name,RTLD_LAZY) }; + + if(!Lib.Handle){ + DP("target library loading error: %s\n",dlerror()); + elf_end(e); + return NULL; + } + + struct link_map *libInfo = (struct link_map *)Lib.Handle; + + // The place where the entries info is loaded is the library base address + // plus the offset determined from the ELF file. + Elf64_Addr entries_addr = libInfo->l_addr + entries_offset; + + DP("Pointer to first entry to be loaded is (%016lx).\n", entries_addr); + + // Table of pointers to all the entries in the target + __tgt_offload_entry *entries_table = (__tgt_offload_entry*)entries_addr; + + + __tgt_offload_entry *entries_begin = &entries_table[0]; + __tgt_offload_entry *entries_end = entries_begin + NumEntries; + + if(!entries_begin){ + DP("Can't obtain entries begin\n"); + elf_end(e); + return NULL; + } + + DP("Entries table range is (%016lx)->(%016lx)\n",(Elf64_Addr)entries_begin,(Elf64_Addr)entries_end) + DeviceInfo.createOffloadTable(device_id,entries_begin,entries_end); + + elf_end(e); + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size){ + void *ptr = malloc(size); + return ptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, int64_t size){ + memcpy(tgt_ptr,hst_ptr,size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, int64_t size){ + memcpy(hst_ptr,tgt_ptr,size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void* tgt_ptr){ + free(tgt_ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit) +{ + // ignore team num and thread limit + + // Use libffi to launch execution + ffi_cif cif; + + // All args are references + std::vector args_types(arg_num, &ffi_type_pointer); + std::vector args(arg_num); + + for(int32_t i=0; i + +#ifndef _OMPTARGET_H_ +#define _OMPTARGET_H_ + +#define OFFLOAD_SUCCESS (0) +#define OFFLOAD_FAIL (~0) + +#define OFFLOAD_DEVICE_DEFAULT -1 +#define OFFLOAD_DEVICE_CONSTRUCTOR -2 +#define OFFLOAD_DEVICE_DESTRUCTOR -3 + +/// Data attributes for each data reference used in an OpenMP target region. 
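+/// These values are bit flags that are combined per argument; note that
+/// tgt_map_delete (0x18) includes the tgt_map_release bit (0x08). A "tofrom"
+/// mapping is presumably encoded as (tgt_map_to | tgt_map_from).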
+enum tgt_map_type{ + tgt_map_alloc = 0x00, // allocate memory in the device for this reference + tgt_map_to = 0x01, // copy the data to the device but do not update the host memory + tgt_map_from = 0x02, // copy the data to the host but do not update the device memory + tgt_map_always = 0x04, + tgt_map_release = 0x08, + tgt_map_delete = 0x18, + tgt_map_pointer = 0x20, + tgt_map_extra = 0x40 +}; + +/// This struct is a record of an entry point or global. For a function +/// entry point the size is expected to be zero +struct __tgt_offload_entry{ + void *addr; // Pointer to the offload entry info (function or global) + char *name; // Name of the function or global + int64_t size; // Size of the entry info (0 if it a function) +}; + +/// This struct is a record of the device image information +struct __tgt_device_image{ + void *ImageStart; // Pointer to the target code start + void *ImageEnd; // Pointer to the target code end + __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries + __tgt_offload_entry *EntriesEnd; // End of the table with all the entries (non inclusive) +}; + +/// This struct is a record of all the host code that may be offloaded to a target. +struct __tgt_bin_desc{ + int32_t NumDevices; // Number of devices supported + __tgt_device_image *DeviceImages; // Arrays of device images (one per device) + __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries + __tgt_offload_entry *EntriesEnd; // End of the table with all the entries (non inclusive) +}; + +/// This struct contains the offload entries identified by the target runtime +struct __tgt_target_table{ + __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries + __tgt_offload_entry *EntriesEnd; // End of the table with all the entries (non inclusive) +}; + +#ifdef __cplusplus +extern "C" { +#endif + +void omp_set_default_device(int device_num); +int omp_get_default_device(void); +int omp_get_num_devices(void); +int omp_is_initial_device(void); + +/// adds a target shared library to the target execution image +void __tgt_register_lib(__tgt_bin_desc *desc); + +/// removes a target shared library to the target execution image +void __tgt_unregister_lib(__tgt_bin_desc *desc); + +// creates host to the target data mapping, store it in the +// libtarget.so internal structure (an entry in a stack of data maps) and +// passes the data to the device; +void __tgt_target_data_begin(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types); +void __tgt_target_data_begin_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList); + +// passes data from the target, release target memory and destroys the +// host-target mapping (top entry from the stack of data maps) created by +// the last __tgt_target_data_begin +void __tgt_target_data_end(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types); +void __tgt_target_data_end_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList); + +/// passes data to/from the target +void __tgt_target_data_update(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types); +void 
__tgt_target_data_update_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList); + +// performs the same actions as data_begin in case arg_num is non-zero +// and initiates run of offloaded region on target platform; if arg_num +// is non-zero after the region execution is done it also performs the +// same action as data_update and data_end aboveThe following types are +// used; this function return 0 if it was able to transfer the execution +// to a target and an int different from zero otherwise +int __tgt_target(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types); +int __tgt_target_nowait(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList); + +int __tgt_target_teams(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t num_teams, int32_t thread_limit); +int __tgt_target_teams_nowait(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t num_teams, int32_t thread_limit, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList); + +#ifdef __cplusplus +} +#endif + +#ifdef OMPTARGET_DEBUG +# define DEBUGP(prefix, ...) { fprintf(stderr, "%s --> ", prefix); \ + fprintf(stderr, __VA_ARGS__); } +#else +# define DEBUGP(prefix, ...) {} +#endif + +#ifdef __cplusplus + #define EXTERN extern "C" +#else + #define EXTERN extern +#endif + +#endif // _OMPTARGET_H_ Index: libomptarget/src/omptarget.cpp =================================================================== --- /dev/null +++ libomptarget/src/omptarget.cpp @@ -0,0 +1,999 @@ +//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Header file global to this project */ +#include "omptarget.h" +#include "targets_info.h" + +#define DP(...) DEBUGP("Libomptarget", __VA_ARGS__) + +extern targets_info_table targets_info; + +struct RTLInfoTy; + +/// Map between host data and target data +struct HostDataToTargetTy { + long HstPtrBase; // host info + long HstPtrBegin; + long HstPtrEnd; // non-inclusive + + long TgtPtrBegin; // target info + long TgtPtrEnd; // non-inclusive (aee: not needed???) 
+
+  long RefCount;
+
+  HostDataToTargetTy() :
+    HstPtrBase(0), HstPtrBegin(0), HstPtrEnd(0),
+    TgtPtrBegin(0), TgtPtrEnd(0), RefCount(0) {}
+  HostDataToTargetTy(long BP, long B, long E, long TB, long TE) :
+    HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E),
+    TgtPtrBegin(TB), TgtPtrEnd(TE), RefCount(1) {}
+};
+
+typedef std::list<HostDataToTargetTy> HostDataToTargetListTy;
+
+struct DeviceTy {
+  int32_t DeviceID;
+  RTLInfoTy *RTL;
+  int32_t RTLDeviceID;
+
+  bool IsInit;
+  HostDataToTargetListTy HostDataToTargetMap;
+  std::list<void *> PendingConstrDestrHostPtrList;
+
+  DeviceTy(RTLInfoTy *RTL) : DeviceID(-1), RTL(RTL), RTLDeviceID(-1),
+    IsInit(false), HostDataToTargetMap(), PendingConstrDestrHostPtrList() {}
+
+  void *getOrAllocTgtPtr(void* HstPtrBegin, void* HstPtrBase, long Size,
+    long &IsNew, long UpdateRefCount = true);
+  void *getTgtPtrBegin(void* HstPtrBegin, long Size, long &IsLast,
+    long UpdateRefCount = true);
+  void deallocTgtPtr(void* TgtPtrBegin, long Size, long ForceDelete);
+
+  // calls to RTL
+  int32_t init();
+  __tgt_target_table *load_binary(void* Img);
+
+  int32_t data_submit(void* TgtPtrBegin, void* HstPtrBegin, int64_t Size);
+  int32_t data_retrieve(void* HstPtrBegin, void* TgtPtrBegin, int64_t Size);
+
+  int32_t run_region(void* TgtEntryPtr, void** TgtVarsPtr, int32_t TgtVarsSize);
+  int32_t run_team_region(void* TgtEntryPtr, void** TgtVarsPtr,
+    int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit);
+};
+
+struct RTLInfoTy{
+  typedef int32_t (device_type_ty)(int32_t);
+  typedef int32_t (number_of_devices_ty)();
+  typedef int32_t (init_device_ty)(int32_t);
+  typedef __tgt_target_table* (load_binary_ty)(int32_t, void*);
+  typedef void* (data_alloc_ty)(int32_t, int64_t);
+  typedef int32_t (data_submit_ty)(int32_t, void*, void*, int64_t);
+  typedef int32_t (data_retrieve_ty)(int32_t, void*, void*, int64_t);
+  typedef int32_t (data_delete_ty)(int32_t, void*);
+  typedef int32_t (run_region_ty)(int32_t, void*, void**, int32_t);
+  typedef int32_t (run_team_region_ty)(int32_t, void*, void**, int32_t, int32_t, int32_t);
+
+  int32_t Idx;              // RTL index; the index equals the number of devices
+                            // of other RTLs that were registered before this one.
+  int32_t NumberOfDevices;  // Number of devices this RTL deals with.
+  std::vector<DeviceTy *> Devices;  // one per device (NumberOfDevices in total)
+
+  void *LibraryHandler;
+
+  // Functions implemented in the RTL.
+  device_type_ty *device_type;
+  number_of_devices_ty *number_of_devices;
+  init_device_ty *init_device;
+  load_binary_ty *load_binary;
+  data_alloc_ty *data_alloc;
+  data_submit_ty *data_submit;
+  data_retrieve_ty *data_retrieve;
+  data_delete_ty *data_delete;
+  run_region_ty *run_region;
+  run_team_region_ty *run_team_region;
+};
+
+/// RTLs identified in the system
+typedef std::list<RTLInfoTy> RTLsTy;
+static RTLsTy RTLs;
+
+/// Map between device type (ELF machine id) and RTL
+typedef std::map<uint16_t, RTLInfoTy *> DTypeToRTLMapTy;
+static DTypeToRTLMapTy DTypeToRTLMap;
+
+/// Map between Device ID (i.e. OpenMP device id) and its DeviceTy
+typedef std::vector<DeviceTy> DevicesTy;
+static DevicesTy Devices;
+
+/// Map between the host entry begin and the translation table. Each
+/// registered library gets one TranslationTable.
+/// Use the map from __tgt_offload_entry so that we can quickly determine
+/// whether we are trying to re-register an existing lib or really have a new one.
+struct TranslationTable{
+  __tgt_target_table HostTable;
+
+  // Image assigned to a given device.
+  std::vector<__tgt_device_image*> TargetsImages; // One image per device ID.
+
+  // Table of entry points or NULL if it was not already computed.
+  std::vector<__tgt_target_table*> TargetsTable; // One table per device ID.
+};
+typedef std::map<__tgt_offload_entry*, TranslationTable> HostEntriesBeginToTransTableTy;
+static HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
+
+/// Map between the host ptr and a table index
+struct TableMap{
+  TranslationTable *Table; // table associated with the host ptr
+  uint32_t Index; // index at which the host ptr's translated entry is found
+  TableMap() : Table(0), Index(0) {}
+  TableMap(TranslationTable *table, uint32_t index) : Table(table), Index(index) {}
+};
+typedef std::map<void *, TableMap> HostPtrToTableMapTy;
+static HostPtrToTableMapTy HostPtrToTableMap;
+
+
+////////////////////////////////////////////////////////////////////////////////
+// getters and setters
+//
+// aee: non compliant. This needs to be integrated in KMPC; we can keep it
+// here for the moment.
+
+static int DefaultDevice = 0;
+
+// aee: non-compliant
+void omp_set_default_device(int device_num){
+  DefaultDevice = device_num;
+}
+
+// aee: non-compliant
+int omp_get_default_device(void){
+  return DefaultDevice;
+}
+
+int omp_get_num_devices(void) {
+  return Devices.size();
+}
+
+int omp_is_initial_device(void){
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// functionality for device
+
+// Return the target pointer begin (where the data will be moved to).
+void *DeviceTy::getTgtPtrBegin(void* HstPtrBegin, long Size, long &IsLast,
+    long UpdateRefCount)
+{
+  long hp = (long) HstPtrBegin;
+  IsLast = false;
+
+  for (HostDataToTargetListTy::iterator
+      ii=HostDataToTargetMap.begin(),
+      ie=HostDataToTargetMap.end(); ii!=ie ; ++ii) {
+    HostDataToTargetTy &HT = *ii;
+    if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd){
+      if ((hp + Size) > HT.HstPtrEnd) {
+        DP("WARNING: Array contain pointer but does not contain the complete section\n");
+      }
+
+      IsLast = !(HT.RefCount > 1);
+
+      if (HT.RefCount > 1 && UpdateRefCount)
+        --HT.RefCount;
+
+      long tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin);
+      return (void*)tp;
+    }
+  }
+
+  return NULL;
+}
+
+// Return the target pointer begin, allocating target memory if needed.
+void *DeviceTy::getOrAllocTgtPtr(void* HstPtrBegin, void* HstPtrBase,
+    long Size, long &IsNew, long UpdateRefCount)
+{
+  long hp = (long) HstPtrBegin;
+  IsNew = false;
+
+  // Check if the pointer is already contained in an existing entry.
+  for (HostDataToTargetListTy::iterator
+      ii=HostDataToTargetMap.begin(),
+      ie=HostDataToTargetMap.end(); ii!=ie ; ++ii) {
+    HostDataToTargetTy &HT = *ii;
+
+    // Is it contained?
+ if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd){ + if ((hp + Size) > HT.HstPtrEnd) { + DP("WARNING: Array contain pointer but does not contain the complete section\n"); + } + if (UpdateRefCount) + ++HT.RefCount; + long tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin); + return (void*)tp; + } + } + + //It is not contained we should create a new entry for it + IsNew = true; + long tp = (long) RTL->data_alloc(RTLDeviceID, Size); + HostDataToTargetMap.push_front( + HostDataToTargetTy((long)HstPtrBase, hp, hp+Size, tp, tp+Size)); + return (void*)tp; +} + + +void DeviceTy::deallocTgtPtr(void* HstPtrBegin, long Size, long ForceDelete) +{ + long hp = (long)HstPtrBegin; + + // Check if the pointer is contained in any sub-nodes + for (HostDataToTargetListTy::iterator + ii=HostDataToTargetMap.begin(), + ie=HostDataToTargetMap.end(); ii!=ie ; ++ii) { + HostDataToTargetTy &HT = *ii; + + // It is contained? + if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) { + if ((hp + Size) > HT.HstPtrEnd) { + DP("WARNING: Array contain pointer but does not contain the complete section\n"); + } + if (ForceDelete) HT.RefCount = 1; + if (--HT.RefCount <= 0) { + assert(HT.RefCount == 0 && "did not expect a negative ref count"); + DP("Deleting tgt data 0x%016llx of size %lld\n", + (long long) HT.TgtPtrBegin, + (long long)Size); + RTL->data_delete(RTLDeviceID, (void *) HT.TgtPtrBegin); + HostDataToTargetMap.erase(ii); + } + return; + } + } + DP("Section to delete (hst addr 0x%llx) does not exist in the allocated memory\n", + (unsigned long long) hp); +} + +// init device +int32_t DeviceTy::init() +{ + int32_t rc = RTL->init_device(RTLDeviceID); + if (rc == OFFLOAD_SUCCESS) { + IsInit = true; + } + return rc; +} + +// load binary to device +__tgt_target_table *DeviceTy::load_binary(void* Img) +{ + return RTL->load_binary(RTLDeviceID, Img); +} + +// submit data to device +int32_t DeviceTy::data_submit(void* TgtPtrBegin, void* HstPtrBegin, + int64_t Size) +{ + return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); +} + +// retrieve data from device +int32_t DeviceTy::data_retrieve(void* HstPtrBegin, void* TgtPtrBegin, + int64_t Size) +{ + return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); +} + +// run region on device +int32_t DeviceTy::run_region(void* TgtEntryPtr, void** TgtVarsPtr, + int32_t TgtVarsSize) +{ + return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize); +} + +// run team region on device +int32_t DeviceTy::run_team_region(void* TgtEntryPtr, void** TgtVarsPtr, + int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit) +{ + return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtVarsSize, NumTeams, ThreadLimit); +} + +//////////////////////////////////////////////////////////////////////////////// +// functionality for registering libs + +static void RegisterImageIntoTranslationTable(TranslationTable &TT, RTLInfoTy &RTL, + __tgt_device_image *image){ + + // sane size, as when we increase the one, we also increase the other + assert(TT.TargetsTable.size() == TT.TargetsImages.size() + && "We should have as many images as we have tables!"); + + // Resize the Targets Table and Images to accommodate the new targets if required + unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices; + + if (TT.TargetsTable.size() < TargetsTableMinimumSize){ + TT.TargetsImages.resize(TargetsTableMinimumSize, 0); + TT.TargetsTable.resize(TargetsTableMinimumSize, 0); + } + + // Register the image in all devices for this target type + for(int32_t i=0; 
iEntriesBegin){ + DP("Invalid Host entries!\n"); + return; + } + + DP("Registering entries starting at %016lx...\n", + (Elf64_Addr)desc->EntriesBegin->addr); + + HostEntriesBeginToTransTableTy::iterator TransTableIt = + HostEntriesBeginToTransTable.find(desc->EntriesBegin); + + if (TransTableIt != HostEntriesBeginToTransTable.end()){ + DP("Already registered!\n"); + return; + } + + // Initialize translation table for this + TranslationTable &TransTable = HostEntriesBeginToTransTable[desc->EntriesBegin]; + TransTable.HostTable.EntriesBegin = desc->EntriesBegin; + TransTable.HostTable.EntriesEnd = desc->EntriesEnd; + + // Scan all the device images + for(int32_t i=0; iNumDevices; ++i){ + + __tgt_device_image &img = desc->DeviceImages[i]; + char *img_begin = (char*)img.ImageStart; + char *img_end = (char*)img.ImageEnd; + size_t img_size = img_end - img_begin; + + //Obtain elf handler + Elf *e = elf_memory(img_begin, img_size); + if (!e){ + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + continue; + } + + //Check if ELF is the right kind + if (elf_kind(e) != ELF_K_ELF){ + DP("Unexpected ELF type!\n"); + continue; + } + Elf64_Ehdr *eh64 = elf64_getehdr(e); + Elf32_Ehdr *eh32 = elf32_getehdr(e); + uint16_t MachineID; + if (eh64 && !eh32) + MachineID = eh64->e_machine; + else if (eh32 && !eh64) + MachineID = eh32->e_machine; + else{ + DP("Ambiguous ELF header!\n"); + continue; + } + + DTypeToRTLMapTy::iterator RTLHandler = DTypeToRTLMap.find(MachineID); + + // We already have an handler for this device's RTL? + // If so insert the Image <-> RTL Map entry and continue + if (RTLHandler != DTypeToRTLMap.end()){ + + // We were unable to find a runtime for this device before... + if (RTLHandler->second == 0) + continue; + + RegisterImageIntoTranslationTable(TransTable, *RTLHandler->second, &img); + continue; + } + + // Locate the library name and create an handler + void *dynlib_handle = 0; + + DP("Looking for RTL libraries: with machine id %d\n", MachineID); + for(int32_t j=0; jNumberOfDevices, device); + for (int32_t device_id = 0; device_id < RTL->NumberOfDevices; device_id++) { + // global device ID + Devices[start + device_id].DeviceID = start + device_id; + // RTL local device ID + Devices[start + device_id].RTLDeviceID = device_id; + + // Save pointer to device in RTL in case we want to unregister the RTL + RTL->Devices.push_back(&Devices[start + device_id]); + } + + DP("Registering image %016lx with RTL!\n", (Elf64_Addr)img_begin); + RegisterImageIntoTranslationTable(TransTable, *RTL, &img); + } + + DP("Done register entries!\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +/// unloads a target shared library +EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc){ + DP("Unloading target library!\n"); + return; +} + +/// Internal function to do the mapping and transfer the data to the device +static void target_data_begin(DeviceTy & Device, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + // process each input + for(int32_t i=0; i (tgt:%016lx)\n", (long)arg_sizes[i], + (long)HstPtrBegin, (long)TgtPtrBegin); + Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + } + + if (arg_types[i] & tgt_map_pointer /* && Pointer_IsNew */){ + DP("Update pointer (%016lx) -> [%016lx]\n", + (long)Pointer_TgtPtrBegin, (long)TgtPtrBegin); + uint64_t Delta = (uint64_t) HstPtrBegin - (uint64_t) HstPtrBase; + void *TgrPtrBase_Value = (void *)((uint64_t) TgtPtrBegin - Delta); + 
Device.data_submit(Pointer_TgtPtrBegin, &TgrPtrBase_Value, sizeof(void*)); + } + } +} + +EXTERN void __tgt_target_data_begin_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList) +{ + __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// creates host to the target data mapping, store it in the +/// libtarget.so internal structure (an entry in a stack of data maps) +/// and passes the data to the device; +EXTERN void __tgt_target_data_begin(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + DP("Entering data begin region for device %d with %d mappings\n", + device_id, arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + DP("Use default device id %d\n", device_id); + } + if (Devices.size() <= (size_t)device_id){ + DP("Device ID %d does not have a matching RTL.\n", device_id); + return; + } + + // Get device info + DeviceTy & Device = Devices[device_id]; + // Init the device if not done before + if (!Device.IsInit) { + if (Device.init() != OFFLOAD_SUCCESS) { + DP("failed to init device %d\n", device_id); + return; + } + } + + target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types); +} + + +/// Internal function to undo the mapping and retrieve the data from the device +static void target_data_end(DeviceTy & Device, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + // process each input + for(int32_t i=0; i (hst:%016lx)\n", (long)arg_sizes[i], + (long)TgtPtrBegin, (long)HstPtrBegin); + Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]); + } + if (IsLast || ForceDelete) { + Device.deallocTgtPtr(HstPtrBegin, arg_sizes[i], ForceDelete); + } + } +} + +/// passes data from the target, release target memory and destroys +/// the host-target mapping (top entry from the stack of data maps) +/// created by the last __tgt_target_data_begin +EXTERN void __tgt_target_data_end(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + DP("Entering data end region with %d mappings\n", arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + if (Devices.size() <= (size_t)device_id){ + DP("Device ID %d does not have a matching RTL.\n", device_id); + return; + } + + DeviceTy & Device = Devices[device_id]; + if (!Device.IsInit) { + DP("uninit device: ignore"); + return; + } + + target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types); + +} + +EXTERN void __tgt_target_data_end_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList) +{ + __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// passes data to/from the target +EXTERN void __tgt_target_data_update(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + DP("Entering data update with %d mappings\n", arg_num); + + // No devices available? 
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + if (Devices.size() <= (size_t)device_id){ + DP("Device ID %d does not have a matching RTL.\n", device_id); + return; + } + + // Get device info + DeviceTy & Device = Devices[device_id]; + if (!Device.IsInit) { + DP("uninit device: ignore"); + return; + } + + // process each input + for(int32_t i=0; i (hst:%016lx)\n", (long)arg_sizes[i], + (long)TgtPtrBegin, (long)HstPtrBegin); + Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]); + } + if (arg_types[i] & tgt_map_to) { + DP("Moving %ld bytes (hst:%016lx) -> (tgt:%016lx)\n", (long)arg_sizes[i], + (long)HstPtrBegin, (long)TgtPtrBegin); + Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + } + } +} + +EXTERN void __tgt_target_data_update_nowait(int32_t device_id, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList) +{ + __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// performs the same actions as data_begin in case arg_num is +/// non-zero and initiates run of offloaded region on target platform; +/// if arg_num is non-zero after the region execution is done it also +/// performs the same action as data_update and data_end aboveThe +/// following types are used; this function return 0 if it was able to +/// transfer the execution to a target and an int different from zero +/// otherwise +static int target(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t team_num, int32_t thread_limit, + int IsTeamConstruct, int IsConstrDestrRecursiveCall) +{ + DP("Entering target region with entry point %016lx and device Id %d\n", + (Elf64_Addr)host_ptr, device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + // got a new constructor/destructor? + if (device_id == OFFLOAD_DEVICE_CONSTRUCTOR || + device_id == OFFLOAD_DEVICE_DESTRUCTOR) { + DP("Got a constructor/destructor\n"); + for(unsigned D=0; D=0 && (size_t)device_id::iterator + ii=Device.PendingConstrDestrHostPtrList.begin(), + ie=Device.PendingConstrDestrHostPtrList.end(); ii!=ie ; ++ii) { + void *ConstrDestrHostPtr = *ii; + int rc = target(device_id, ConstrDestrHostPtr, 0, NULL, NULL, NULL, NULL, 1, 1, + true /*team*/, true /*recursive*/); + if (rc != OFFLOAD_SUCCESS) { + DP("failed to run constr/destr... enqueue it\n"); + return OFFLOAD_FAIL; + } + } + DP("done with pending constr/destr\n"); + Device.PendingConstrDestrHostPtrList.clear(); + } + + // Find the table information in the map or look it up in the translation tables + TableMap *TM = 0; + HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr); + if (TableMapIt == HostPtrToTableMap.end()){ + // We don't have a map. 
So search all the registered libraries + for (HostEntriesBeginToTransTableTy::iterator + ii = HostEntriesBeginToTransTable.begin(), + ie = HostEntriesBeginToTransTable.end(); !TM && ii!=ie ; ++ii){ + // get the translation table (which contains all the good info) + TranslationTable *TransTable = &ii->second; + // iterate over all the host table entries to see if we can locate the host_ptr + __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; + __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; + __tgt_offload_entry *cur = begin; + for (uint32_t i=0; cur < end; ++cur, ++i) { + if (cur->addr != host_ptr) + continue; + // we got a match, now fill the HostPtrToTableMap so that we + // may avoid this search next time. + TM = &HostPtrToTableMap[host_ptr]; + TM->Table = TransTable; + TM->Index = i; + break; + } + } + } else { + TM = &TableMapIt->second; + } + // No map for this host pointer found! + if (!TM){ + DP("Host ptr %016lx does not have a matching target pointer.\n", + (Elf64_Addr)host_ptr); + return OFFLOAD_FAIL; + } + + // get target table + assert(TM->Table->TargetsTable.size() > (size_t)device_id + && "Not expecting a device ID outside the tables bounds!"); + __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; + // if first call, need to move the data + if (!TargetTable) + { + // 1) get image + assert(TM->Table->TargetsImages.size() > (size_t)device_id + && "Not expecting a device ID outside the tables bounds!"); + __tgt_device_image *img = TM->Table->TargetsImages[device_id]; + if (!img){ + DP("No image loaded for device id %d.\n", device_id); + return OFFLOAD_FAIL; + } + // 2) load image into the target table + TargetTable = TM->Table->TargetsTable[device_id] = Device.load_binary(img); + // Unable to get table for this image: invalidate image and fail + if (!TargetTable){ + DP("Unable to generate entries table for device id %d.\n", device_id); + TM->Table->TargetsImages[device_id] = 0; + return OFFLOAD_FAIL; + } + + // Verify if the two tables sizes match + size_t hsize = TM->Table->HostTable.EntriesEnd - + TM->Table->HostTable.EntriesBegin; + size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; + + // Invalid image for this host entries! 
+ if (hsize != tsize){ + DP("Host and Target tables mismatch for device id %d [%lx != %lx].\n", + device_id, hsize, tsize); + TM->Table->TargetsImages[device_id] = 0; + TM->Table->TargetsTable[device_id] = 0; + return OFFLOAD_FAIL; + } + assert(TM->Index < hsize && + "Not expecting index greater than the table size"); + + // process global data that needs to be mapped + __tgt_target_table *HostTable = &TM->Table->HostTable; + for(__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, + *CurrHostEntry = HostTable->EntriesBegin, + *EntryDeviceEnd = TargetTable->EntriesEnd; + CurrDeviceEntry != EntryDeviceEnd; + CurrDeviceEntry++, CurrHostEntry++) { + if (CurrDeviceEntry->size != 0) { + // has data + assert(CurrDeviceEntry->size == CurrHostEntry->size && "data size mismatch"); + long IsLast; + assert(Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size, + IsLast, false) == NULL && "data in declared target should not be already mapped"); + // add entry to map + DP("add mapping from host 0x%llx to 0x%llx with size %lld\n\n", + (unsigned long long) CurrHostEntry->addr, + (unsigned long long)CurrDeviceEntry->addr, + (unsigned long long)CurrDeviceEntry->size); + Device.HostDataToTargetMap.push_front(HostDataToTargetTy( + (long)CurrHostEntry->addr, (long)CurrHostEntry->addr, + (long)CurrHostEntry->addr + CurrHostEntry->size, + (long)CurrDeviceEntry->addr, + (long)CurrDeviceEntry->addr+CurrDeviceEntry->size)); + } + } + } + + //Move data to device + target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types); + + std::vector tgt_args; + + for(int32_t i=0; iEntriesBegin[TM->Index].addr, TM->Index); + if (IsTeamConstruct) { + rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], tgt_args.size(), team_num, thread_limit); + } else { + rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], tgt_args.size()); + } + + if (rc) + return OFFLOAD_FAIL; + + //Move data from device + target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types); + return OFFLOAD_SUCCESS; +} + +EXTERN int __tgt_target(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) +{ + return target(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, 0, 0, false /*team*/, false /*recursive*/); +} + +EXTERN int __tgt_target_nowait(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList) +{ + return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN int __tgt_target_teams(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t team_num, int32_t thread_limit) +{ + return target(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, team_num, thread_limit, + true /*team*/, false /*recursive*/); +} + +EXTERN int __tgt_target_teams_nowait(int32_t device_id, void *host_ptr, int32_t arg_num, + void** args_base, void **args, int64_t *arg_sizes, int32_t *arg_types, + int32_t team_num, int32_t thread_limit, + int32_t depNum, void * depList, int32_t noAliasDepNum, void * noAliasDepList) +{ + return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, team_num, thread_limit); +} + + 
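
The __tgt_target* entry points above are normally invoked from compiler-generated code, so it may help to see the calling convention spelled out. The sketch below is illustrative only and not part of the patch: it shows how a single map(tofrom:) scalar could be passed to __tgt_target by hand. The symbol hypothetical_host_entry is made up; in real code it would be the address of the outlined target region registered through __tgt_register_lib.

    // Illustrative sketch: hand-rolled equivalent of the code Clang would emit
    // for "#pragma omp target map(tofrom: x)".
    #include <stdint.h>
    #include "omptarget.h"

    extern void *hypothetical_host_entry; // assumption: address of the outlined region

    int launch_example(int &x) {
      void   *args_base[] = { &x };
      void   *args[]      = { &x };
      int64_t arg_sizes[] = { sizeof(int) };
      int32_t arg_types[] = { tgt_map_to | tgt_map_from };

      // A zero return value (OFFLOAD_SUCCESS) means the region ran on a device;
      // a non-zero value tells the caller to run the host fallback version.
      return __tgt_target(OFFLOAD_DEVICE_DEFAULT, hypothetical_host_entry,
                          1, args_base, args, arg_sizes, arg_types);
    }

Internally, target() above performs the same mapping work as __tgt_target_data_begin before launching the entry and the same work as __tgt_target_data_end afterwards.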
+////////////////////////////////////////////////////////////////////////////////
+// Temporary for debugging (matching the ones in omptarget-nvptx).
+// REMOVE XXXXXX (here and in omptarget-nvptx)
+
+EXTERN void __kmpc_kernel_print(char *title)
+{
+  DP(" %s\n", title);
+}
+
+EXTERN void __kmpc_kernel_print_int8(char *title, int64_t data)
+{
+  DP(" %s val=%lld\n", title, (long long)data);
+}
+
Index: libomptarget/src/targets_info.h
===================================================================
--- /dev/null
+++ libomptarget/src/targets_info.h
@@ -0,0 +1,28 @@
+//===-------- targets_info.h - Information about Target RTLs ------ C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Map between ELF machine IDs and the RTL library that supports it
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+#ifndef _TARGETS_INFO_H_
+#define _TARGETS_INFO_H_
+
+struct targets_info_table_entry{
+  uint16_t Machine_Elf_ID;
+  const char *Machine_RTL_Lib;
+};
+struct targets_info_table{
+  int32_t Number_of_Entries;
+  targets_info_table_entry *Entries;
+};
+
+#endif
Index: libomptarget/src/targets_info.cpp
===================================================================
--- /dev/null
+++ libomptarget/src/targets_info.cpp
@@ -0,0 +1,27 @@
+//===-------- targets_info.cpp - Information about Target RTLs ---- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+// +//===----------------------------------------------------------------------===// +// +// Map between ELF machine IDs and the RTL library that supports it +// +//===----------------------------------------------------------------------===// + + +#include "targets_info.h" + +static targets_info_table_entry targets_info_entries[] = { + { 21 /* EM_PPC64 */ , "libomptarget.rtl.ppc64.so"}, + { 62 /* EM_X86_64*/ , "libomptarget.rtl.x86_64.so"}, + { 190 /* EM_CUDA */ , "libomptarget.rtl.cuda.so"} +}; + +targets_info_table targets_info = { + sizeof(targets_info_entries) / sizeof(targets_info_table_entry), + &targets_info_entries[0] +}; + Index: libomptarget/test/CMakeLists.txt =================================================================== --- /dev/null +++ libomptarget/test/CMakeLists.txt @@ -0,0 +1,93 @@ +# CMakeLists.txt file for unit testing OpenMP Library +include(FindPythonInterp) +include(CheckTypeSize) +if(NOT PYTHONINTERP_FOUND) + libomptarget_warning_say("Could not find Python.") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() +endif() + +set(LIBOMPTARGET_TEST_CFLAGS "" CACHE STRING + "Extra compiler flags to send to the test compiler") + +if(${LIBOMPTARGET_STANDALONE_BUILD}) + # Make sure we can use the console pool for recent cmake and ninja > 1.5 + if(CMAKE_VERSION VERSION_LESS 3.1.20141117) + set(cmake_3_2_USES_TERMINAL) + else() + set(cmake_3_2_USES_TERMINAL USES_TERMINAL) + endif() + set(LIBOMPTARGET_TEST_C_COMPILER ${CMAKE_C_COMPILER} CACHE STRING + "C compiler to use for testing OpenMP offloading library") + set(LIBOMPTARGET_TEST_CXX_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING + "C++ compiler to use for testing OpenMP offloading library") + set(LIBOMPTARGET_TEST_OPENMP_FLAG -fopenmp CACHE STRING + "OpenMP compiler flag to use for testing OpenMP offloading library") + set(LIBOMPTARGET_LLVM_LIT_EXECUTABLE "" CACHE STRING + "Path to llvm-lit") + find_program(LIT_EXECUTABLE NAMES llvm-lit ${LIBOMPTARGET_LLVM_LIT_EXECUTABLE}) + if(NOT LIT_EXECUTABLE) + libomptarget_say("Cannot find llvm-lit.") + libomptarget_say("Please put llvm-lit in your PATH or set LIBOMPTARGET_LLVM_LIT_EXECUTABLE to its full path") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() + endif() + + set(LIBOMPTARGET_FILECHECK_EXECUTABLE "" CACHE STRING + "Path to FileCheck") + find_program(LIBOMPTARGET_FILECHECK NAMES FileCheck ${LIBOMPTARGET_FILECHECK_EXECUTABLE}) + if(NOT LIBOMPTARGET_FILECHECK) + libomptarget_say("Cannot find FileCheck.") + libomptarget_say("Please put FileCheck in your PATH or set LIBOMPTARGET_FILECHECK_EXECUTABLE to its full path") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() + endif() + + # Set lit arguments + # The -j 1 lets the actual tests run with the entire machine. + # We have one test thread that spawns the tests serially. This allows + # Each test to use the entire machine. 
+ set(LIBOMPTARGET_LIT_ARGS_DEFAULT "-sv --show-unsupported --show-xfail -j 1") + if(MSVC OR XCODE) + set(LIBOMPTARGET_LIT_ARGS_DEFAULT "${LIBOMPTARGET_LIT_ARGS_DEFAULT} --no-progress-bar") + endif() + set(LIBOMPTARGET_LIT_ARGS "${LIBOMPTARGET_LIT_ARGS_DEFAULT}" CACHE STRING + "Default options for lit") + separate_arguments(LIBOMPTARGET_LIT_ARGS) + add_custom_target(check-libomptarget + COMMAND ${PYTHON_EXECUTABLE} ${LIT_EXECUTABLE} ${LIBOMPTARGET_LIT_ARGS} ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS omptarget + COMMENT "Running libomptarget tests" + ${cmake_3_2_USES_TERMINAL} + ) + + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "" CACHE STRING + "Path to folder containing omp.h") + set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "" CACHE STRING + "Path to folder containing libomp.h") +else() + # LLVM source tree build, test just-built clang + if(NOT MSVC) + set(LIBOMPTARGET_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) + set(LIBOMPTARGET_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++) + set(LIBOMPTARGET_FILECHECK ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck) + else() + libomptarget_error_say("Not prepared to run tests on Windows systems.") + endif() + set(LIBOMPTARGET_TEST_OPENMP_FLAG -fopenmp=libomp) + # Use add_lit_testsuite() from LLVM CMake. This also depends on OpenMP + # implementation because it uses omp.h. + add_lit_testsuite(check-libomptarget + "Running libomptarget tests" + ${CMAKE_CURRENT_BINARY_DIR} + ARGS "-j 1" + DEPENDS omptarget omp + ) + + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${LIBOMPTARGET_BINARY_DIR}/../runtime/src") +endif() + +# Configure the lit.site.cfg.in file +set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!") +configure_file(lit.site.cfg.in lit.site.cfg @ONLY) + Index: libomptarget/test/lit.cfg =================================================================== --- /dev/null +++ libomptarget/test/lit.cfg @@ -0,0 +1,116 @@ +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: +# Configuration file for the 'lit' test runner. + +import os +import lit.formats + +# Tell pylint that we know config and lit_config exist somewhere. +if 'PYLINT_IMPORT' in os.environ: + config = object() + lit_config = object() + +def append_dynamic_library_path(name, value, sep): + if name in config.environment: + config.environment[name] = value + sep + config.environment[name] + else: + config.environment[name] = value + +# name: The name of this test suite. +config.name = 'libomptarget' + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c', '.cpp', '.cc'] + +# test_source_root: The root path where tests are located. 
+config.test_source_root = os.path.dirname(__file__)
+
+# test_exec_root: The root object directory where output is placed.
+config.test_exec_root = config.libomptarget_obj_root
+
+# test format
+config.test_format = lit.formats.ShTest()
+
+# compiler flags
+config.test_cflags = config.test_openmp_flag + \
+    " -I " + config.test_source_root + \
+    " -I " + config.omp_header_directory + \
+    " -L " + config.library_dir
+
+if config.omp_host_rtl_directory:
+    config.test_cflags = config.test_cflags + " -L " + \
+        config.omp_host_rtl_directory
+
+config.test_cflags = config.test_cflags + " " + config.test_extra_cflags
+
+# Set up the environment so the dynamic libraries are found at runtime.
+if config.operating_system == 'Windows':
+    append_dynamic_library_path('PATH', config.library_dir, ";")
+    append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";")
+elif config.operating_system == 'Darwin':
+    append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":")
+    append_dynamic_library_path('DYLD_LIBRARY_PATH', \
+        config.omp_host_rtl_directory, ":")
+    config.test_cflags += " -Wl,-rpath," + config.library_dir
+    config.test_cflags += " -Wl,-rpath," + config.omp_host_rtl_directory
+else: # Unices
+    append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
+    append_dynamic_library_path('LD_LIBRARY_PATH', \
+        config.omp_host_rtl_directory, ":")
+
+# Substitutions:
+# - for targets that exist in the system, create the actual commands;
+# - for valid targets that are not available on this system, expand to a
+#   harmless echo so that the same test can be used for different targets.
+
+# Scan all the valid targets.
+for libomptarget_target in config.libomptarget_all_targets:
+    # Is this target available in the current system? If so, create compile,
+    # run and check commands. Otherwise create commands that just echo a message.
+ if libomptarget_target in config.libomptarget_system_targets: + config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ + libomptarget_target, \ + "%libomptarget-compilexx-and-run-" + libomptarget_target + \ + " | " + config.libomptarget_filecheck + " %s")) + config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ + libomptarget_target, \ + "%libomptarget-compile-and-run-" + libomptarget_target + \ + " | " + config.libomptarget_filecheck + " %s")) + config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ + libomptarget_target, \ + "%clangxx-" + libomptarget_target + " %s -o %t-" + \ + libomptarget_target + " && %t-" + libomptarget_target)) + config.substitutions.append(("%libomptarget-compile-and-run-" + \ + libomptarget_target, \ + "%clang-" + libomptarget_target + " %s -o %t-" + \ + libomptarget_target + " && %t-" + libomptarget_target)) + config.substitutions.append(("%clangxx-" + libomptarget_target, \ + "%clangxx %cflags -omptargets=" + libomptarget_target)) + config.substitutions.append(("%clang-" + libomptarget_target, \ + "%clang %cflags -omptargets=" + libomptarget_target)) + config.substitutions.append(("%fcheck-" + libomptarget_target, \ + config.libomptarget_filecheck + " %s")) + else: + config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compile-and-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%clang-" + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%clangxx-" + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%fcheck-" + libomptarget_target, \ + "echo ignored-command")) + +config.substitutions.append(("%clangxx", config.test_cxx_compiler)) +config.substitutions.append(("%clang", config.test_c_compiler)) +config.substitutions.append(("%openmp_flag", config.test_openmp_flag)) +config.substitutions.append(("%cflags", config.test_cflags)) + Index: libomptarget/test/lit.site.cfg.in =================================================================== --- /dev/null +++ libomptarget/test/lit.site.cfg.in @@ -0,0 +1,20 @@ +@AUTO_GEN_COMMENT@ + +config.test_c_compiler = "@LIBOMPTARGET_TEST_C_COMPILER@" +config.test_cxx_compiler = "@LIBOMPTARGET_TEST_CXX_COMPILER@" +config.test_openmp_flag = "@LIBOMPTARGET_TEST_OPENMP_FLAG@" +# For the moment we still need to pass libomptarget explicitly. Once the driver +# patch, lands, this is not required anymore. +config.test_extra_cflags = "-lomptarget @LIBOMPTARGET_TEST_CFLAGS@" +config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" +config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" +config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" +config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" +config.operating_system = "@CMAKE_SYSTEM_NAME@" +config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split() +config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split() +config.libomptarget_filecheck = "@LIBOMPTARGET_FILECHECK@" + +# Let the main config do the real work. 
+lit_config.load_config(config, "@LIBOMPTARGET_BASE_DIR@/test/lit.cfg")
+
Index: libomptarget/test/offloading/offloading_success.c
===================================================================
--- /dev/null
+++ libomptarget/test/offloading/offloading_success.c
@@ -0,0 +1,21 @@
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+int main(void) {
+  int isHost = 0;
+
+  #pragma omp target
+  {
+    isHost = omp_is_initial_device();
+  }
+
+  // The compiler does not yet support launching the target region on the device.
+  // CHECK: Target region executed on the host
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  return !isHost;
+}
Index: libomptarget/test/offloading/offloading_success.cpp
===================================================================
--- /dev/null
+++ libomptarget/test/offloading/offloading_success.cpp
@@ -0,0 +1,21 @@
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+int main(void) {
+  int isHost = 0;
+
+  #pragma omp target
+  {
+    isHost = omp_is_initial_device();
+  }
+
+  // The compiler does not yet support launching the target region on the device.
+  // CHECK: Target region executed on the host
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  return !isHost;
+}
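
The two offloading_success tests only check omp_is_initial_device(). A natural follow-up test, sketched here for illustration and not part of this patch, would exercise the data-mapping path (target_data_begin/target_data_end) through an explicit map(tofrom:) clause. It reuses the RUN-line substitutions defined in lit.cfg; restricting it to x86_64-pc-linux-gnu is an assumption, and the CHECK line assumes the region still falls back to the host, as in the tests above.

    // RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
    #include <stdio.h>
    #include <omp.h>

    int main(void) {
      int a[64];
      for (int i = 0; i < 64; ++i)
        a[i] = i;

      // map(tofrom:) copies the array to the device before the region and back
      // afterwards, exercising the mapping path described earlier.
      #pragma omp target map(tofrom: a)
      {
        for (int i = 0; i < 64; ++i)
          a[i] += 1;
      }

      int errors = 0;
      for (int i = 0; i < 64; ++i)
        if (a[i] != i + 1)
          ++errors;

      // CHECK: errors = 0
      printf("errors = %d\n", errors);
      return errors;
    }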