diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
--- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
+++ b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
@@ -15,7 +15,7 @@
 if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
   set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
 elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
-  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER})
+  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_CXX_COMPILER})
 else()
   return()
 endif()
diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h
--- a/openmp/libomptarget/deviceRTLs/common/debug.h
+++ b/openmp/libomptarget/deviceRTLs/common/debug.h
@@ -129,15 +129,18 @@
 #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
 #include "common/support.h"
 
+#pragma omp declare target
 template <typename... Arguments>
 NOINLINE static void log(const char *fmt, Arguments... parameters) {
   printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
          (int)GetWarpId(), (int)GetLaneId(), parameters...);
 }
+#pragma omp end declare target
 
 #endif
 #if OMPTARGET_NVPTX_TEST
 
+#pragma omp declare target
 template <typename... Arguments>
 NOINLINE static void check(bool cond, const char *fmt,
                            Arguments... parameters) {
@@ -148,6 +151,7 @@
 }
 
 NOINLINE static void check(bool cond) { assert(cond); }
+#pragma omp end declare target
 #endif
 
 // set flags that are tested (inclusion properties)
diff --git a/openmp/libomptarget/deviceRTLs/common/device_environment.h b/openmp/libomptarget/deviceRTLs/common/device_environment.h
--- a/openmp/libomptarget/deviceRTLs/common/device_environment.h
+++ b/openmp/libomptarget/deviceRTLs/common/device_environment.h
@@ -15,10 +15,14 @@
 
 #include "target_impl.h"
 
+#pragma omp declare target
+
 struct omptarget_device_environmentTy {
   int32_t debug_level;
 };
 
 extern DEVICE omptarget_device_environmentTy omptarget_device_environment;
 
+#pragma omp end declare target
+
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -34,6 +34,8 @@
 #define BARRIER_COUNTER 0
 #define ORDERED_COUNTER 1
 
+#pragma omp declare target
+
 // arguments needed for L0 parallelism only.
 class omptarget_nvptx_SharedArgs {
 public:
@@ -273,9 +275,9 @@
 /// Memory manager for statically allocated memory.
 class omptarget_nvptx_SimpleMemoryManager {
 private:
-  ALIGN(128) struct MemDataTy {
+  struct MemDataTy {
     volatile unsigned keys[OMP_STATE_COUNT];
-  } MemData[MAX_SM];
+  } MemData[MAX_SM] ALIGN(128);
 
   INLINE static uint32_t hash(unsigned key) {
     return key & (OMP_STATE_COUNT - 1);
@@ -326,6 +328,8 @@
 getMyTopTaskDescriptor(bool isSPMDExecutionMode);
 INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
 
+#pragma omp end declare target
+
 ////////////////////////////////////////////////////////////////////////////////
 // inlined implementation
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
@@ -13,6 +13,8 @@
 
 #include "common/target_atomic.h"
 
+#pragma omp declare target
+
 ////////////////////////////////////////////////////////////////////////////////
 // Task Descriptor
 ////////////////////////////////////////////////////////////////////////////////
@@ -226,3 +228,5 @@
   usedMemIdx = i;
   return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
 }
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
@@ -13,6 +13,8 @@
 #include "interface.h"
 #include "common/debug.h"
 
+#pragma omp declare target
+
 EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
                                         int32_t cancelVal) {
   PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);
@@ -26,3 +28,5 @@
   // disabled
   return 0;
 }
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu b/openmp/libomptarget/deviceRTLs/common/src/critical.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/critical.cu
@@ -13,6 +13,8 @@
 #include "interface.h"
 #include "common/debug.h"
 
+#pragma omp declare target
+
 EXTERN
 void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
                      kmp_CriticalName *lck) {
@@ -26,3 +28,5 @@
   PRINT0(LD_IO, "call to kmpc_end_critical()\n");
   omp_unset_lock((omp_lock_t *)lck);
 }
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -12,6 +12,8 @@
 #include "common/omptarget.h"
 #include "target_impl.h"
 
+#pragma omp declare target
+
 // Return true if this is the master thread.
 INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
   return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
@@ -275,3 +277,4 @@
   omptarget_nvptx_simpleMemoryManager.Release();
 }
 
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -15,6 +15,8 @@
 #include "common/target_atomic.h"
 #include "target_impl.h"
 
+#pragma omp declare target
+
 EXTERN double omp_get_wtick(void) {
   double rc = __kmpc_impl_get_wtick();
   PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc);
@@ -362,3 +364,5 @@
   PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);
   return rc;
 }
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@@ -16,6 +16,8 @@
 #include "target_impl.h"
 #include "common/target_atomic.h"
 
+#pragma omp declare target
+
 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
 // template class that encapsulate all the helper functions
@@ -754,3 +756,5 @@
 EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
   PRINT0(LD_IO, "call kmpc_for_static_fini\n");
 }
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
@@ -13,6 +13,8 @@
 #include "common/omptarget.h"
 #include "common/device_environment.h"
 
+#pragma omp declare target
+
 ////////////////////////////////////////////////////////////////////////////////
 // global device environment
 ////////////////////////////////////////////////////////////////////////////////
@@ -66,3 +68,5 @@
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
 DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -13,6 +13,8 @@
 #include "common/omptarget.h"
 #include "target_impl.h"
 
+#pragma omp declare target
+
 ////////////////////////////////////////////////////////////////////////////////
 // global data tables
 ////////////////////////////////////////////////////////////////////////////////
@@ -157,3 +159,5 @@
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n");
   return isSPMDMode();
 }
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -35,6 +35,8 @@
 #include "common/omptarget.h"
 #include "target_impl.h"
 
+#pragma omp declare target
+
 ////////////////////////////////////////////////////////////////////////////////
 // support for parallel that goes parallel (1 static level only)
 ////////////////////////////////////////////////////////////////////////////////
@@ -300,3 +302,5 @@
                                   int proc_bind) {
   PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
 }
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -14,6 +14,8 @@
 #include "common/target_atomic.h"
 #include "target_impl.h"
 
+#pragma omp declare target
+
 EXTERN
 void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
 
@@ -312,3 +314,4 @@
   return 0;
 }
 
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -14,6 +14,8 @@
 #include "common/debug.h"
 #include "common/omptarget.h"
 
+#pragma omp declare target
+
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
@@ -264,3 +266,4 @@
   return static_cast<char *>(ReductionScratchpadPtr) + 256;
 }
 
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
@@ -13,6 +13,8 @@
 #include "common/omptarget.h"
 #include "target_impl.h"
 
+#pragma omp declare target
+
 ////////////////////////////////////////////////////////////////////////////////
 // KMP Ordered calls
 ////////////////////////////////////////////////////////////////////////////////
@@ -135,3 +137,5 @@
   PRINT0(LD_IO, "call __kmpc_syncwarp\n");
   __kmpc_impl_syncwarp(Mask);
 }
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/task.cu b/openmp/libomptarget/deviceRTLs/common/src/task.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/task.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/task.cu
@@ -29,6 +29,8 @@
 
 #include "common/omptarget.h"
 
+#pragma omp declare target
+
 EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
     kmp_Ident *loc,     // unused
     uint32_t global_tid, // unused
@@ -214,3 +216,5 @@
 
   __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);
 }
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/state-queue.h b/openmp/libomptarget/deviceRTLs/common/state-queue.h
--- a/openmp/libomptarget/deviceRTLs/common/state-queue.h
+++ b/openmp/libomptarget/deviceRTLs/common/state-queue.h
@@ -23,6 +23,8 @@
 
 #include "target_impl.h"
 
+#pragma omp declare target
+
 template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
 private:
   ElementType elements[SIZE];
@@ -46,6 +48,8 @@
   INLINE ElementType *Dequeue();
 };
 
+#pragma omp end declare target
+
 #include "state-queuei.h"
 
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/common/state-queuei.h b/openmp/libomptarget/deviceRTLs/common/state-queuei.h
--- a/openmp/libomptarget/deviceRTLs/common/state-queuei.h
+++ b/openmp/libomptarget/deviceRTLs/common/state-queuei.h
@@ -19,6 +19,8 @@
 #include "state-queue.h"
 #include "common/target_atomic.h"
 
+#pragma omp declare target
+
 template <typename ElementType, uint32_t SIZE>
 INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
   return __kmpc_atomic_add((unsigned int *)&tail, 1u);
@@ -88,3 +90,5 @@
   DoneServing(slot, id);
   return element;
 }
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h
--- a/openmp/libomptarget/deviceRTLs/common/support.h
+++ b/openmp/libomptarget/deviceRTLs/common/support.h
@@ -16,6 +16,8 @@
 #include "interface.h"
 #include "target_impl.h"
 
+#pragma omp declare target
+
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
@@ -95,4 +97,6 @@
 DEVICE unsigned int *GetTeamsReductionTimestamp();
 DEVICE char *GetTeamsReductionScratchpad();
 
+#pragma omp end declare target
+
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/common/target_atomic.h b/openmp/libomptarget/deviceRTLs/common/target_atomic.h
--- a/openmp/libomptarget/deviceRTLs/common/target_atomic.h
+++ b/openmp/libomptarget/deviceRTLs/common/target_atomic.h
@@ -15,6 +15,28 @@
 
 #include "target_impl.h"
 
+#pragma omp declare target
+
+// FIXME: Forward declaration
+// unsigned
+extern unsigned atomicAdd(unsigned *address, unsigned val);
+extern unsigned atomicInc(unsigned *address, unsigned val);
+extern unsigned atomicMax(unsigned *address, unsigned val);
+extern unsigned atomicExch(unsigned *address, unsigned val);
+extern unsigned atomicCAS(unsigned *address, unsigned compare, unsigned val);
+// unsigned long long
+extern unsigned long long atomicAdd(unsigned long long *address,
+                                    unsigned long long val);
+extern unsigned long long atomicInc(unsigned long long *address,
+                                    unsigned long long val);
+extern unsigned long long atomicMax(unsigned long long *address,
+                                    unsigned long long val);
+extern unsigned long long atomicExch(unsigned long long *address,
+                                     unsigned long long val);
+extern unsigned long long atomicCAS(unsigned long long *address,
+                                    unsigned long long compare,
+                                    unsigned long long val);
+
 template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
   return atomicAdd(address, val);
 }
@@ -35,4 +57,6 @@
   return atomicCAS(address, compare, val);
 }
 
+#pragma omp end declare target
+
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -26,6 +26,8 @@
 #include "nvptx/src/nvptx_interface.h"
 #endif
 
+#pragma omp declare target
+
 ////////////////////////////////////////////////////////////////////////////////
 // OpenMP interface
 ////////////////////////////////////////////////////////////////////////////////
@@ -448,4 +450,6 @@
 EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                               int16_t is_shared);
 
+#pragma omp end declare target
+
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -10,6 +10,19 @@
 #
 ##===----------------------------------------------------------------------===##
 
+# TODO: This part needs to be refined when libomptarget is going to support
+# Windows!
+if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
+  set(aux_triple x86_64-unknown-linux-gnu)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
+  set(aux_triple powerpc64le-unknown-linux-gnu)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
+  set(aux_triple aarch64-unknown-linux-gnu)
+else()
+  libomptarget_say("Not building CUDA offloading device RTL: unknow host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
+  return()
+endif()
+
 get_filename_component(devicertl_base_directory
   ${CMAKE_CURRENT_SOURCE_DIR}
   DIRECTORY)
@@ -79,61 +92,79 @@
     )
 
     # Set flags for LLVM Bitcode compilation.
-    set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}
+    set(bc_flags -S -x c++
+                 -target nvptx64
+                 -Xclang -emit-llvm-bc
+                 -Xclang -aux-triple -Xclang ${aux_triple}
+                 -fopenmp -Xclang -fopenmp-is-device
+                 -D__CUDACC__
                  -I${devicertl_base_directory}
-                 -I${devicertl_nvptx_directory}/src)
+                 -I${devicertl_nvptx_directory}/src
+                 -I${CUDA_TOOLKIT_ROOT_DIR}/include)
 
     if(${LIBOMPTARGET_NVPTX_DEBUG})
-      set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1)
+      list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1)
     else()
-      set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0)
+      list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
     endif()
 
     # Create target to build all Bitcode libraries.
     add_custom_target(omptarget-nvptx-bc)
 
-    # Generate a Bitcode library for all the compute capabilities the user requested.
+    # That's all PTX versions we know for now
+    set(nvptx_ptx_list 50 60 61 62 63 64 65 70 71)
+
+    # Generate a Bitcode library for all the compute capabilities the user
+    # requested and all PTX version we know for now.
     foreach(sm ${nvptx_sm_list})
-      set(cuda_arch --cuda-gpu-arch=sm_${sm})
-
-      # Compile CUDA files to bitcode.
-      set(bc_files "")
-      foreach(src ${cuda_src_files})
-        get_filename_component(infile ${src} ABSOLUTE)
-        get_filename_component(outfile ${src} NAME)
-
-        add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
-          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION}
-            -c ${infile} -o ${outfile}-sm_${sm}.bc
-          DEPENDS ${infile}
-          IMPLICIT_DEPENDS CXX ${infile}
-          COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc"
-          VERBATIM
+      set(cuda_arch -Xclang -target-cpu -Xclang sm_${sm})
+
+      foreach(ptx_num ${nvptx_ptx_list})
+        set(ptx_version -Xclang -target-feature -Xclang +ptx${ptx_num})
+
+        set(bc_files "")
+        foreach(src ${cuda_src_files})
+          get_filename_component(infile ${src} ABSOLUTE)
+          get_filename_component(outfile ${src} NAME)
+          set(outfile "${outfile}-sm_${sm}-ptx${ptx_num}.bc")
+
+          add_custom_command(OUTPUT ${outfile}
+            COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags}
+              ${cuda_arch} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
+            DEPENDS ${infile}
+            IMPLICIT_DEPENDS CXX ${infile}
+            COMMENT "Building LLVM bitcode ${outfile}"
+            VERBATIM
+          )
+          set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
+
+          list(APPEND bc_files ${outfile})
+        endforeach()
+
+        set(bclib_name "libomptarget-nvptx-sm_${sm}-ptx${ptx_num}.bc")
+
+        # Link to a bitcode library.
+        add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+            COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
+              -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+            DEPENDS ${bc_files}
+            COMMENT "Linking LLVM bitcode ${bclib_name}"
         )
-        set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc)
+        set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
 
-        list(APPEND bc_files ${outfile}-sm_${sm}.bc)
-      endforeach()
+        set(bclib_target_name "omptarget-nvptx-sm_${sm}-ptx${ptx_num}-bc")
+
+        add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
+        add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
 
-      # Link to a bitcode library.
-      add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
-          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
-            -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files}
-          DEPENDS ${bc_files}
-          COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc"
-      )
-      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc)
-
-      add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc)
-      add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc)
-
-      # Copy library to destination.
-      add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD
-                         COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
-                         ${LIBOMPTARGET_LIBRARY_DIR})
-
-      # Install bitcode library under the lib destination folder.
-      install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+        # Copy library to destination.
+        add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+                          COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+                          ${LIBOMPTARGET_LIBRARY_DIR})
+
+        # Install bitcode library under the lib destination folder.
+        install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+      endforeach()
     endforeach()
   endif()
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
@@ -11,8 +11,11 @@
 
 #include <stdint.h>
 
-#define EXTERN extern "C" __device__
+#define EXTERN extern "C"
+
+#pragma omp declare target
 typedef uint32_t __kmpc_impl_lanemask_t;
 typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
+#pragma omp end declare target
 
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -20,11 +20,11 @@
 
 #include "nvptx_interface.h"
 
-#define DEVICE __device__
-#define INLINE __forceinline__ DEVICE
-#define NOINLINE __noinline__ DEVICE
-#define SHARED __shared__
-#define ALIGN(N) __align__(N)
+#define DEVICE
+#define INLINE
+#define NOINLINE
+#define SHARED __attribute__((shared))
+#define ALIGN(N) __attribute__((aligned(N)))
 
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options
@@ -67,6 +67,12 @@
 
 #define OMP_ACTIVE_PARALLEL_LEVEL 128
 
+#pragma omp declare target
+
+// Forward declaration
+extern uint32_t __ffs(uint32_t);
+extern uint32_t __popc(uint32_t);
+
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
   // The maximum number of workers in a kernel.
@@ -146,4 +152,6 @@
 DEVICE void *__kmpc_impl_malloc(size_t);
 DEVICE void __kmpc_impl_free(void *);
 
+#pragma omp end declare target
+
 #endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -16,6 +16,8 @@
 
 #include <cuda.h>
 
+#pragma omp declare target
+
 DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
 }
@@ -55,6 +57,10 @@
   return (double)nsecs * __kmpc_impl_get_wtick();
 }
 
+// FIXME: Forward declaration
+extern unsigned int __activemask();
+extern unsigned int __ballot(unsigned);
+
 // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
 #if CUDA_VERSION >= 9000
@@ -64,6 +70,11 @@
 #endif
 }
 
+// FIXME: Forward declaration
+// These two variants should contain a default argument int width.
+extern int __shfl_sync(unsigned mask, int val, int src_line);
+extern int __shfl(int val, int src_line);
+
 // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                      int32_t SrcLane) {
@@ -74,6 +85,10 @@
 #endif // CUDA_VERSION
 }
 
+// FIXME: Forward declaration
+extern int __shfl_down(int var, unsigned detla, int width);
+extern int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width);
+
 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                           int32_t Var, uint32_t Delta,
                                           int32_t Width) {
@@ -86,6 +101,9 @@
 
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
+// FIXME: Forward declaration
+extern void __syncwarp(int mask);
+
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
 #if CUDA_VERSION >= 9000
   __syncwarp(Mask);
@@ -109,6 +127,11 @@
                : "memory");
 }
 
+// FIXME: Forward declaration
+extern void __threadfence();
+extern void __threadfence_block();
+extern void __threadfence_system();
+
 DEVICE void __kmpc_impl_threadfence() { __threadfence(); }
 DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); }
 DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); }
@@ -158,3 +181,5 @@
 
 DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
 DEVICE void __kmpc_impl_free(void *x) { free(x); }
+
+#pragma omp end declare target