diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -720,33 +720,45 @@
   CC1Args.push_back("-mlink-builtin-bitcode");
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
+  std::string CudaVersionStr;
+
   // New CUDA versions often introduce new instructions that are only supported
   // by a new PTX version, so we need to raise the PTX level to enable them in
   // the NVPTX back-end.
   const char *PtxFeature = nullptr;
   switch (CudaInstallation.version()) {
   case CudaVersion::CUDA_110:
+    CudaVersionStr = "110";
     PtxFeature = "+ptx70";
     break;
   case CudaVersion::CUDA_102:
+    CudaVersionStr = "102";
     PtxFeature = "+ptx65";
     break;
   case CudaVersion::CUDA_101:
+    CudaVersionStr = "101";
     PtxFeature = "+ptx64";
     break;
   case CudaVersion::CUDA_100:
+    CudaVersionStr = "100";
     PtxFeature = "+ptx63";
     break;
   case CudaVersion::CUDA_92:
+    CudaVersionStr = "92";
     PtxFeature = "+ptx61";
     break;
   case CudaVersion::CUDA_91:
+    CudaVersionStr = "91";
     PtxFeature = "+ptx61";
     break;
   case CudaVersion::CUDA_90:
+    CudaVersionStr = "90";
     PtxFeature = "+ptx60";
     break;
   default:
+    // If the CUDA version is unknown, fall back to CUDA 8.0. The same
+    // assumption is made in libomptarget/deviceRTLs.
+    CudaVersionStr = "80";
     PtxFeature = "+ptx42";
   }
   CC1Args.append({"-target-feature", PtxFeature});
@@ -781,8 +793,8 @@
   llvm::sys::path::append(DefaultLibPath, Twine("lib") + CLANG_LIBDIR_SUFFIX);
   LibraryPaths.emplace_back(DefaultLibPath.c_str());
 
-  std::string LibOmpTargetName =
-      "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+  std::string LibOmpTargetName = "libomptarget-nvptx-cuda_" + CudaVersionStr +
+                                 "-" + GpuArch.str() + ".bc";
 
   bool FoundBCLibrary = false;
   for (StringRef LibraryPath : LibraryPaths) {
     SmallString<128> LibOmpTargetFile(LibraryPath);
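Note: taken together, the two hunks above make the driver resolve a bitcode library that encodes both the CUDA version and the GPU arch. A minimal standalone sketch of the naming scheme (buildLibName is a hypothetical helper, not part of the patch):

#include <iostream>
#include <string>

// Hypothetical helper mirroring the concatenation in the hunk above.
std::string buildLibName(const std::string &CudaVersionStr,
                         const std::string &GpuArch) {
  return "libomptarget-nvptx-cuda_" + CudaVersionStr + "-" + GpuArch + ".bc";
}

int main() {
  // A CUDA 10.1 installation targeting sm_70 now looks for:
  std::cout << buildLibName("101", "sm_70") << "\n";
  // prints: libomptarget-nvptx-cuda_101-sm_70.bc
}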
diff --git a/openmp/libomptarget/deviceRTLs/common/allocator.h b/openmp/libomptarget/deviceRTLs/common/allocator.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/allocator.h
@@ -0,0 +1,42 @@
+//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Macros for allocating variables in different address spaces.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_ALLOCATOR_H
+#define OMPTARGET_ALLOCATOR_H
+
+// Follows the pattern in interface.h
+// Clang's sema checks this type carefully; it needs to closely match the one
+// declared in omp.h.
+typedef enum omp_allocator_handle_t {
+  omp_null_allocator = 0,
+  omp_default_mem_alloc = 1,
+  omp_large_cap_mem_alloc = 2,
+  omp_const_mem_alloc = 3,
+  omp_high_bw_mem_alloc = 4,
+  omp_low_lat_mem_alloc = 5,
+  omp_cgroup_mem_alloc = 6,
+  omp_pteam_mem_alloc = 7,
+  omp_thread_mem_alloc = 8,
+  KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
+} omp_allocator_handle_t;
+
+#define __PRAGMA(STR) _Pragma(#STR)
+#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
+
+#define SHARED(NAME)                                                           \
+  NAME [[clang::loader_uninitialized]];                                        \
+  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+
+#define EXTERN_SHARED(NAME)                                                    \
+  NAME;                                                                        \
+  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+
+#endif // OMPTARGET_ALLOCATOR_H
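Note: to make the two macros concrete, a team-shared variable such as execution_param declared as `DEVICE uint32_t SHARED(execution_param);` expands (with DEVICE empty) roughly to:

uint32_t execution_param [[clang::loader_uninitialized]];
#pragma omp allocate(execution_param) allocator(omp_pteam_mem_alloc)

That is, the OpenMP allocate directive with omp_pteam_mem_alloc takes over the role of the CUDA __shared__ qualifier, and [[clang::loader_uninitialized]] suppresses the zero-initialization that a plain C++ definition would otherwise require.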
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -14,11 +14,12 @@
 #ifndef OMPTARGET_H
 #define OMPTARGET_H
 
-#include "target_impl.h"
-#include "common/debug.h" // debug
-#include "interface.h" // interfaces with omp, compiler, and user
+#include "common/allocator.h"
+#include "common/debug.h" // debug
 #include "common/state-queue.h"
 #include "common/support.h"
+#include "interface.h" // interfaces with omp, compiler, and user
+#include "target_impl.h"
 
 #define OMPTARGET_NVPTX_VERSION 1.1
 
@@ -71,8 +72,8 @@
   uint32_t nArgs;
 };
 
-extern DEVICE SHARED omptarget_nvptx_SharedArgs
-    omptarget_nvptx_globalArgs;
+extern DEVICE
+    omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs);
 
 // Worker slot type which is initialized with the default worker slot
 // size of 4*32 bytes.
@@ -94,7 +95,7 @@
   __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
 };
 
-extern DEVICE SHARED DataSharingStateTy DataSharingState;
+extern DEVICE DataSharingStateTy EXTERN_SHARED(DataSharingState);
 
 ////////////////////////////////////////////////////////////////////////////////
 // task ICV and (implicit & explicit) task state
@@ -273,9 +274,9 @@
 /// Memory manager for statically allocated memory.
 class omptarget_nvptx_SimpleMemoryManager {
 private:
-  ALIGN(128) struct MemDataTy {
+  struct MemDataTy {
     volatile unsigned keys[OMP_STATE_COUNT];
-  } MemData[MAX_SM];
+  } MemData[MAX_SM] ALIGN(128);
 
   INLINE static uint32_t hash(unsigned key) {
     return key & (OMP_STATE_COUNT - 1);
@@ -294,18 +295,18 @@
 extern DEVICE omptarget_nvptx_SimpleMemoryManager
     omptarget_nvptx_simpleMemoryManager;
-extern DEVICE SHARED uint32_t usedMemIdx;
-extern DEVICE SHARED uint32_t usedSlotIdx;
-extern DEVICE SHARED uint8_t
-    parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-extern DEVICE SHARED uint16_t threadLimit;
-extern DEVICE SHARED uint16_t threadsInTeam;
-extern DEVICE SHARED uint16_t nThreads;
-extern DEVICE SHARED
-    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
-
-extern DEVICE SHARED uint32_t execution_param;
-extern DEVICE SHARED void *ReductionScratchpadPtr;
+extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx);
+extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx);
+extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
+extern DEVICE uint16_t EXTERN_SHARED(threadLimit);
+extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam);
+extern DEVICE uint16_t EXTERN_SHARED(nThreads);
+extern DEVICE omptarget_nvptx_ThreadPrivateContext *
+    EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);
+
+extern DEVICE uint32_t EXTERN_SHARED(execution_param);
+extern DEVICE void *EXTERN_SHARED(ReductionScratchpadPtr);
 
 ////////////////////////////////////////////////////////////////////////////////
 // work function (outlined parallel/simd functions) and arguments.
@@ -313,8 +314,8 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 typedef void *omptarget_nvptx_WorkFn;
-extern volatile DEVICE SHARED omptarget_nvptx_WorkFn
-    omptarget_nvptx_workFn;
+extern volatile DEVICE
+    omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
 
 ////////////////////////////////////////////////////////////////////////////////
 // get private data structures
diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -316,10 +316,5 @@
   return rc;
 }
 
-EXTERN int omp_is_initial_device(void) {
-  PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n");
-  return 0; // 0 by def on device
-}
-
 // Unspecified on the device.
 EXTERN int omp_get_initial_device(void) {
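Note: omp_is_initial_device() is removed from the device runtime because, once the runtime itself is compiled as OpenMP code, Clang treats the call as a compiler builtin and resolves it at compile time (folding it to 0 inside target regions), so a device-side definition would only conflict. User code should be unaffected; a hedged sketch, assuming an offloading-capable clang:

#include <cstdio>
#include <omp.h>

int main() {
  int initial = 1;
#pragma omp target map(tofrom : initial)
  initial = omp_is_initial_device(); // evaluates to 0 on the device

  std::printf("ran on %s\n", initial ? "host (fallback)" : "device");
}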
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
@@ -11,8 +11,9 @@
 //===----------------------------------------------------------------------===//
 #pragma omp declare target
 
-#include "common/omptarget.h"
+#include "common/allocator.h"
 #include "common/device_environment.h"
+#include "common/omptarget.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // global device environment
@@ -28,44 +29,44 @@
 omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
     omptarget_nvptx_device_State[MAX_SM];
 
-DEVICE omptarget_nvptx_SimpleMemoryManager
-    omptarget_nvptx_simpleMemoryManager;
-DEVICE SHARED uint32_t usedMemIdx;
-DEVICE SHARED uint32_t usedSlotIdx;
+DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
+DEVICE uint32_t SHARED(usedMemIdx);
+DEVICE uint32_t SHARED(usedSlotIdx);
 
-DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-DEVICE SHARED uint16_t threadLimit;
-DEVICE SHARED uint16_t threadsInTeam;
-DEVICE SHARED uint16_t nThreads;
+DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
+DEVICE uint16_t SHARED(threadLimit);
+DEVICE uint16_t SHARED(threadsInTeam);
+DEVICE uint16_t SHARED(nThreads);
 
 // Pointer to this team's OpenMP state object
-DEVICE SHARED
-    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+DEVICE omptarget_nvptx_ThreadPrivateContext *
+    SHARED(omptarget_nvptx_threadPrivateContext);
 
 ////////////////////////////////////////////////////////////////////////////////
 // The team master sets the outlined parallel function in this variable to
 // communicate with the workers. Since it is in shared memory, there is one
 // copy of these variables for each kernel, instance, and team.
 ////////////////////////////////////////////////////////////////////////////////
-volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
+volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);
 
 ////////////////////////////////////////////////////////////////////////////////
 // OpenMP kernel execution parameters
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED uint32_t execution_param;
+DEVICE uint32_t SHARED(execution_param);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Data sharing state
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED DataSharingStateTy DataSharingState;
+DEVICE DataSharingStateTy SHARED(DataSharingState);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scratchpad for teams reduction.
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED void *ReductionScratchpadPtr;
+DEVICE void *SHARED(ReductionScratchpadPtr);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);
 
 #pragma omp end declare target
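Note: parallelLevel is the one variable that spells its pragma out instead of using SHARED(...), presumably because the macro would paste the whole array declarator into the allocate clause, producing `#pragma omp allocate(parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE])`, which is ill-formed: allocate takes plain variable names. An illustrative sketch (values assumed, compiled with -fopenmp):

#include <cstdint>
#include <omp.h>

#define MAX_THREADS_PER_TEAM 1024 // illustrative values only
#define WARPSIZE 32

// Declare the array first, then attach the allocator to the name alone.
uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)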
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -208,8 +208,8 @@
           : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();
-  static SHARED unsigned Bound;
-  static SHARED unsigned ChunkTeamCount;
+  static unsigned SHARED(Bound);
+  static unsigned SHARED(ChunkTeamCount);
 
   // Block progress for teams greater than the current upper
   // limit. We always only allow a number of teams less or equal
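Note: the same macro also works for function-local statics; the allocate directive attaches to the static's definition just as it does at namespace scope. Expanded, the first declaration above reads roughly:

static unsigned Bound [[clang::loader_uninitialized]];
#pragma omp allocate(Bound) allocator(omp_pteam_mem_alloc)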
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -10,6 +10,21 @@
 #
 ##===----------------------------------------------------------------------===##
 
+# TODO: This part needs to be refined when libomptarget supports Windows.
+# TODO: It can also be removed once the clang driver supports device-only
+# compilation.
+if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
+  set(aux_triple x86_64-unknown-linux-gnu)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
+  set(aux_triple powerpc64le-unknown-linux-gnu)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
+  set(aux_triple aarch64-unknown-linux-gnu)
+else()
+  libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
+  return()
+endif()
+
 get_filename_component(devicertl_base_directory
   ${CMAKE_CURRENT_SOURCE_DIR}
   DIRECTORY)
@@ -79,61 +94,83 @@
   )
 
   # Set flags for LLVM Bitcode compilation.
-  set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}
+  set(bc_flags -S -x c++
+               -target nvptx64
+               -Xclang -emit-llvm-bc
+               -Xclang -aux-triple -Xclang ${aux_triple}
+               -fopenmp -Xclang -fopenmp-is-device
+               -D__CUDACC__
+               -fdeclspec
                -I${devicertl_base_directory}
                -I${devicertl_nvptx_directory}/src)
 
   if(${LIBOMPTARGET_NVPTX_DEBUG})
-    set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1)
+    list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1)
   else()
-    set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0)
+    list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
   endif()
 
   # Create target to build all Bitcode libraries.
   add_custom_target(omptarget-nvptx-bc)
 
-  # Generate a Bitcode library for all the compute capabilities the user requested.
+  # This mapping is taken from clang/lib/Driver/ToolChains/Cuda.cpp.
+  # The last element is the default case.
+  set(cuda_version_list 110 102 101 100 92 91 90 80)
+  set(ptx_feature_list 70 65 64 63 61 61 60 42)
+
+  # Generate a Bitcode library for all the compute capabilities the user
+  # requested, and for every PTX version we currently know about.
   foreach(sm ${nvptx_sm_list})
-    set(cuda_arch --cuda-gpu-arch=sm_${sm})
-
-    # Compile CUDA files to bitcode.
-    set(bc_files "")
-    foreach(src ${cuda_src_files})
-      get_filename_component(infile ${src} ABSOLUTE)
-      get_filename_component(outfile ${src} NAME)
-
-      add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
-        COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION}
-          -c ${infile} -o ${outfile}-sm_${sm}.bc
-        DEPENDS ${infile}
-        IMPLICIT_DEPENDS CXX ${infile}
-        COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc"
-        VERBATIM
+    set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+
+    foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
+      set(cuda_flags ${sm_flags})
+      list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
+      list(APPEND cuda_flags "-DCUDA_VERSION=${cuda_version}00")
+
+      set(bc_files "")
+      foreach(src ${cuda_src_files})
+        get_filename_component(infile ${src} ABSOLUTE)
+        get_filename_component(outfile ${src} NAME)
+        set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
+
+        add_custom_command(OUTPUT ${outfile}
+          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags}
+            ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
+          DEPENDS ${infile}
+          IMPLICIT_DEPENDS CXX ${infile}
+          COMMENT "Building LLVM bitcode ${outfile}"
+          VERBATIM
+        )
+        set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
+
+        list(APPEND bc_files ${outfile})
+      endforeach()
+
+      set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc")
+
+      # Link to a bitcode library.
+      add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+        COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
+          -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+        DEPENDS ${bc_files}
+        COMMENT "Linking LLVM bitcode ${bclib_name}"
       )
-      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc)
+      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
 
-      list(APPEND bc_files ${outfile}-sm_${sm}.bc)
-    endforeach()
+      set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc")
+
+      add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
+      add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
 
-    # Link to a bitcode library.
-    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
-      COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
-        -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files}
-      DEPENDS ${bc_files}
-      COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc"
-    )
-    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc)
-
-    add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc)
-    add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc)
-
-    # Copy library to destination.
-    add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
-      ${LIBOMPTARGET_LIBRARY_DIR})
-
-    # Install bitcode library under the lib destination folder.
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+      # Copy library to destination.
+      add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+          ${LIBOMPTARGET_LIBRARY_DIR})
+
+      # Install bitcode library under the lib destination folder.
+      install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+    endforeach()
   endforeach()
 endif()
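Note: the two parallel lists encode the same mapping as the driver switch in Cuda.cpp, and they have to be kept in sync by hand. Expressed in C++ for illustration only (not part of the patch):

// Pairs of (CUDA version, PTX feature level); index i of one list corresponds
// to index i of the other, and the final pair is the fallback.
constexpr int CudaVersions[] = {110, 102, 101, 100, 92, 91, 90, 80};
constexpr int PtxVersions[] = {70, 65, 64, 63, 61, 61, 60, 42};
static_assert(sizeof(CudaVersions) / sizeof(int) ==
                  sizeof(PtxVersions) / sizeof(int),
              "each CUDA version needs exactly one PTX feature level");

CMake's IN ZIP_LISTS (available since CMake 3.17) walks the two lists in lock step, and a length mismatch would silently leave ptx_num empty for the excess entries, so keeping the lists adjacent makes them easier to audit.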
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
@@ -11,7 +11,8 @@
 #include <stdint.h>
 
-#define EXTERN extern "C" __device__
+#define EXTERN extern "C"
+
 typedef uint32_t __kmpc_impl_lanemask_t;
 typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -13,18 +13,16 @@
 #define _TARGET_IMPL_H_
 
 #include <assert.h>
-#include <cuda.h>
 #include <stddef.h>
 #include <stdint.h>
 
 #include "nvptx_interface.h"
 
-#define DEVICE __device__
-#define INLINE __forceinline__ DEVICE
-#define NOINLINE __noinline__ DEVICE
-#define SHARED __shared__
-#define ALIGN(N) __align__(N)
+#define DEVICE
+#define INLINE inline __attribute__((always_inline))
+#define NOINLINE __attribute__((noinline))
+#define ALIGN(N) __attribute__((aligned(N)))
 
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -14,25 +14,6 @@
 #include "target_impl.h"
 #include "common/debug.h"
 
-#include <cuda.h>
-
-// Forward declaration of CUDA primitives which will be evetually transformed
-// into LLVM intrinsics.
-extern "C" {
-unsigned int __activemask();
-unsigned int __ballot(unsigned);
-// The default argument here is based on NVIDIA's website
-// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
-int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE);
-int __shfl(int val, int src_line, int width = WARPSIZE);
-int __shfl_down(int var, unsigned detla, int width);
-int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width);
-void __syncwarp(int mask);
-void __threadfence();
-void __threadfence_block();
-void __threadfence_system();
-}
-
 DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
 }
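Note: the forward declarations above become unnecessary because Clang exposes the underlying operations directly as NVVM builtins when targeting nvptx64, with no CUDA headers involved. A minimal device-only sketch (illustrative, not part of the patch; compile with something like clang -x c++ -target nvptx64 -S):

// Each CUDA primitive maps onto a Clang builtin, e.g.:
//   __ballot(1)     -> __nvvm_vote_ballot(1)
//   __threadfence() -> __nvvm_membar_gl()
extern "C" unsigned active_lanes() {
  return __nvvm_vote_ballot(1); // ballot of "true" across the warp
}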
@@ -74,10 +55,12 @@
 // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION >= 9000
-  return __activemask();
+#if CUDA_VERSION < 9020
+  return __nvvm_vote_ballot(1);
 #else
-  return __ballot(1);
+  unsigned int mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
 #endif
 }
 
@@ -85,9 +68,9 @@
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                      int32_t SrcLane) {
 #if CUDA_VERSION >= 9000
-  return __shfl_sync(Mask, Var, SrcLane);
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
 #else
-  return __shfl(Var, SrcLane);
+  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
 #endif // CUDA_VERSION
 }
 
@@ -95,9 +78,10 @@
                                           int32_t Var, uint32_t Delta,
                                           int32_t Width) {
 #if CUDA_VERSION >= 9000
-  return __shfl_down_sync(Mask, Var, Delta, Width);
+  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta,
+                                   ((WARPSIZE - Width) << 8) | 0x1f);
 #else
-  return __shfl_down(Var, Delta, Width);
+  return __nvvm_shfl_down_i32(Var, Delta, ((WARPSIZE - Width) << 8) | 0x1f);
 #endif // CUDA_VERSION
 }
 
@@ -105,7 +89,7 @@
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
 #if CUDA_VERSION >= 9000
-  __syncwarp(Mask);
+  __nvvm_bar_warp_sync(Mask);
 #else
   // In Cuda < 9.0 no need to sync threads in warps.
 #endif // CUDA_VERSION
 }
 
@@ -126,9 +110,9 @@
                : "memory");
 }
 
-DEVICE void __kmpc_impl_threadfence() { __threadfence(); }
-DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); }
-DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); }
+DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
+DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
+DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
 
 // Calls to the NVPTX layer (assuming 1D layout)
 DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
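Note: one detail worth spelling out is the last operand of the shuffle builtins, a packed control word. Assuming the PTX shfl encoding (lane clamp in bits 4:0, segment mask in bits 12:8), the expression ((WARPSIZE - Width) << 8) | 0x1f reproduces what the CUDA __shfl_* wrappers computed internally. A self-checking sketch:

constexpr int WARPSIZE = 32;

// Control word for shfl.down: segment mask in bits 12:8, lane clamp in 4:0.
constexpr unsigned shflControl(int Width) {
  return ((WARPSIZE - Width) << 8) | 0x1f;
}

static_assert(shflControl(32) == 0x001f, "full warp: no segmentation");
static_assert(shflControl(16) == 0x101f, "independent 16-lane segments");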