diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -712,33 +712,30 @@
   CC1Args.push_back("-mlink-builtin-bitcode");
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
+  std::string CudaVersionStr;
+
   // New CUDA versions often introduce new instructions that are only supported
   // by new PTX version, so we need to raise PTX level to enable them in NVPTX
   // back-end.
   const char *PtxFeature = nullptr;
   switch (CudaInstallation.version()) {
-  case CudaVersion::CUDA_110:
-    PtxFeature = "+ptx70";
-    break;
-  case CudaVersion::CUDA_102:
-    PtxFeature = "+ptx65";
-    break;
-  case CudaVersion::CUDA_101:
-    PtxFeature = "+ptx64";
-    break;
-  case CudaVersion::CUDA_100:
-    PtxFeature = "+ptx63";
-    break;
-  case CudaVersion::CUDA_92:
-    PtxFeature = "+ptx61";
-    break;
-  case CudaVersion::CUDA_91:
-    PtxFeature = "+ptx61";
-    break;
-  case CudaVersion::CUDA_90:
-    PtxFeature = "+ptx60";
+#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER)                                   \
+  case CudaVersion::CUDA_##CUDA_VER:                                           \
+    CudaVersionStr = #CUDA_VER;                                                \
+    PtxFeature = "+ptx" #PTX_VER;                                              \
     break;
+    CASE_CUDA_VERSION(110, 70);
+    CASE_CUDA_VERSION(102, 65);
+    CASE_CUDA_VERSION(101, 64);
+    CASE_CUDA_VERSION(100, 63);
+    CASE_CUDA_VERSION(92, 61);
+    CASE_CUDA_VERSION(91, 61);
+    CASE_CUDA_VERSION(90, 60);
+#undef CASE_CUDA_VERSION
   default:
+    // If unknown CUDA version, we take it as CUDA 8.0. Same assumption is also
+    // made in libomptarget/deviceRTLs.
+    CudaVersionStr = "80";
     PtxFeature = "+ptx42";
   }
   CC1Args.append({"-target-feature", PtxFeature});
@@ -784,8 +781,9 @@
     } else {
       bool FoundBCLibrary = false;
 
-      std::string LibOmpTargetName =
-          "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+      std::string LibOmpTargetName = "libomptarget-nvptx-cuda_" +
+                                     CudaVersionStr + "-" + GpuArch.str() +
+                                     ".bc";
 
       for (StringRef LibraryPath : LibraryPaths) {
         SmallString<128> LibOmpTargetFile(LibraryPath);
diff --git a/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_20.bc b/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_80-sm_20.bc
rename from clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_20.bc
rename to clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_80-sm_20.bc
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -164,7 +164,7 @@
 // RUN:   -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-BCLIB-USER %s
 
-// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-sm_20.bc
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-cuda_80-sm_20.bc
 // CHK-BCLIB-USER: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
 // CHK-BCLIB-NOT: {{error:|warning:}}
 
@@ -177,7 +177,7 @@
 // RUN:   -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-BCLIB-WARN %s
 
-// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-cuda_80-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
 
 /// ###########################################################################
 
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -26,7 +26,8 @@
 #define DEVICE __attribute__((device))
 #define INLINE inline DEVICE
 #define NOINLINE __attribute__((noinline)) DEVICE
-#define SHARED __attribute__((shared))
+#define SHARED(NAME) __attribute__((shared)) NAME
+#define EXTERN_SHARED(NAME) __attribute__((shared)) NAME
 #define ALIGN(N) __attribute__((aligned(N)))
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/common/allocator.h b/openmp/libomptarget/deviceRTLs/common/allocator.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/allocator.h
@@ -0,0 +1,44 @@
+//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Macros for allocating variables in different address spaces.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_ALLOCATOR_H
+#define OMPTARGET_ALLOCATOR_H
+
+#if _OPENMP
+// Follows the pattern in interface.h
+// Clang sema checks this type carefully, needs to closely match that from omp.h
+typedef enum omp_allocator_handle_t {
+  omp_null_allocator = 0,
+  omp_default_mem_alloc = 1,
+  omp_large_cap_mem_alloc = 2,
+  omp_const_mem_alloc = 3,
+  omp_high_bw_mem_alloc = 4,
+  omp_low_lat_mem_alloc = 5,
+  omp_cgroup_mem_alloc = 6,
+  omp_pteam_mem_alloc = 7,
+  omp_thread_mem_alloc = 8,
+  KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
+} omp_allocator_handle_t;
+
+#define __PRAGMA(STR) _Pragma(#STR)
+#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
+
+#define SHARED(NAME)                                                           \
+  NAME [[clang::loader_uninitialized]];                                        \
+  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+
+#define EXTERN_SHARED(NAME)                                                    \
+  NAME;                                                                        \
+  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+#endif
+
+#endif // OMPTARGET_ALLOCATOR_H
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -14,11 +14,12 @@
 #ifndef OMPTARGET_H
 #define OMPTARGET_H
 
-#include "target_impl.h"
-#include "common/debug.h"     // debug
-#include "interface.h" // interfaces with omp, compiler, and user
+#include "common/allocator.h"
+#include "common/debug.h" // debug
 #include "common/state-queue.h"
 #include "common/support.h"
+#include "interface.h" // interfaces with omp, compiler, and user
+#include "target_impl.h"
 
 #define OMPTARGET_NVPTX_VERSION 1.1
 
@@ -71,8 +72,8 @@
   uint32_t nArgs;
 };
 
-extern DEVICE SHARED omptarget_nvptx_SharedArgs
-    omptarget_nvptx_globalArgs;
+extern DEVICE
+    omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs);
 
 // Worker slot type which is initialized with the default worker slot
 // size of 4*32 bytes.
@@ -94,7 +95,7 @@
   __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
 };
 
-extern DEVICE SHARED DataSharingStateTy DataSharingState;
+extern DEVICE DataSharingStateTy EXTERN_SHARED(DataSharingState);
 
 ////////////////////////////////////////////////////////////////////////////////
 // task ICV and (implicit & explicit) task state
@@ -273,9 +274,9 @@
 /// Memory manager for statically allocated memory.
 class omptarget_nvptx_SimpleMemoryManager {
 private:
-  ALIGN(128) struct MemDataTy {
+  struct MemDataTy {
     volatile unsigned keys[OMP_STATE_COUNT];
-  } MemData[MAX_SM];
+  } MemData[MAX_SM] ALIGN(128);
 
   INLINE static uint32_t hash(unsigned key) {
     return key & (OMP_STATE_COUNT - 1);
@@ -294,18 +295,23 @@
 
 extern DEVICE omptarget_nvptx_SimpleMemoryManager
     omptarget_nvptx_simpleMemoryManager;
-extern DEVICE SHARED uint32_t usedMemIdx;
-extern DEVICE SHARED uint32_t usedSlotIdx;
-extern DEVICE SHARED uint8_t
-    parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-extern DEVICE SHARED uint16_t threadLimit;
-extern DEVICE SHARED uint16_t threadsInTeam;
-extern DEVICE SHARED uint16_t nThreads;
-extern DEVICE SHARED
-    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
-
-extern DEVICE SHARED uint32_t execution_param;
-extern DEVICE SHARED void *ReductionScratchpadPtr;
+extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx);
+extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx);
+#if _OPENMP
+extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
+#else
+extern DEVICE
+    uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
+#endif
+extern DEVICE uint16_t EXTERN_SHARED(threadLimit);
+extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam);
+extern DEVICE uint16_t EXTERN_SHARED(nThreads);
+extern DEVICE omptarget_nvptx_ThreadPrivateContext *
+    EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);
+
+extern DEVICE uint32_t EXTERN_SHARED(execution_param);
+extern DEVICE void *EXTERN_SHARED(ReductionScratchpadPtr);
 
 ////////////////////////////////////////////////////////////////////////////////
 // work function (outlined parallel/simd functions) and arguments.
@@ -313,8 +319,8 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 typedef void *omptarget_nvptx_WorkFn;
-extern volatile DEVICE SHARED omptarget_nvptx_WorkFn
-    omptarget_nvptx_workFn;
+extern volatile DEVICE
+    omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
 
 ////////////////////////////////////////////////////////////////////////////////
 // get private data structures
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
@@ -11,8 +11,9 @@
 //===----------------------------------------------------------------------===//
 #pragma omp declare target
 
-#include "common/omptarget.h"
+#include "common/allocator.h"
 #include "common/device_environment.h"
+#include "common/omptarget.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 // global device environment
@@ -28,44 +29,44 @@
     omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
         omptarget_nvptx_device_State[MAX_SM];
 
-DEVICE omptarget_nvptx_SimpleMemoryManager
-    omptarget_nvptx_simpleMemoryManager;
-DEVICE SHARED uint32_t usedMemIdx;
-DEVICE SHARED uint32_t usedSlotIdx;
+DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
+DEVICE uint32_t SHARED(usedMemIdx);
+DEVICE uint32_t SHARED(usedSlotIdx);
 
-DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-DEVICE SHARED uint16_t threadLimit;
-DEVICE SHARED uint16_t threadsInTeam;
-DEVICE SHARED uint16_t nThreads;
+DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
+DEVICE uint16_t SHARED(threadLimit);
+DEVICE uint16_t SHARED(threadsInTeam);
+DEVICE uint16_t SHARED(nThreads);
 // Pointer to this team's OpenMP state object
-DEVICE SHARED
-    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+DEVICE omptarget_nvptx_ThreadPrivateContext *
+    SHARED(omptarget_nvptx_threadPrivateContext);
 
 ////////////////////////////////////////////////////////////////////////////////
 // The team master sets the outlined parallel function in this variable to
 // communicate with the workers.  Since it is in shared memory, there is one
 // copy of these variables for each kernel, instance, and team.
 ////////////////////////////////////////////////////////////////////////////////
-volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
+volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);
 
 ////////////////////////////////////////////////////////////////////////////////
 // OpenMP kernel execution parameters
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED uint32_t execution_param;
+DEVICE uint32_t SHARED(execution_param);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Data sharing state
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED DataSharingStateTy DataSharingState;
+DEVICE DataSharingStateTy SHARED(DataSharingState);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scratchpad for teams reduction.
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED void *ReductionScratchpadPtr;
+DEVICE void *SHARED(ReductionScratchpadPtr);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);
 
 #pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -208,8 +208,8 @@
                          : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();
-  static SHARED unsigned Bound;
-  static SHARED unsigned ChunkTeamCount;
+  static unsigned SHARED(Bound);
+  static unsigned SHARED(ChunkTeamCount);
 
   // Block progress for teams greater than the current upper
   // limit. We always only allow a number of teams less or equal
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -10,6 +10,21 @@
 #
 ##===----------------------------------------------------------------------===##
 
+# TODO: This part needs to be refined when libomptarget is going to support
+# Windows!
+# TODO: This part can also be removed if we can change the clang driver to make
+# it support device only compilation.
+if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
+  set(aux_triple x86_64-unknown-linux-gnu)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
+  set(aux_triple powerpc64le-unknown-linux-gnu)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
+  set(aux_triple aarch64-unknown-linux-gnu)
+else()
+  libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
+  return()
+endif()
+
 get_filename_component(devicertl_base_directory
   ${CMAKE_CURRENT_SOURCE_DIR}
   DIRECTORY)
@@ -79,61 +94,91 @@
     )
 
     # Set flags for LLVM Bitcode compilation.
-    set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}
+    set(bc_flags -S -x c++
+                 -target nvptx64
+                 -Xclang -emit-llvm-bc
+                 -Xclang -aux-triple -Xclang ${aux_triple}
+                 -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+                 -D__CUDACC__
                  -I${devicertl_base_directory}
                  -I${devicertl_nvptx_directory}/src)
 
     if(${LIBOMPTARGET_NVPTX_DEBUG})
-      set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1)
+      list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1)
     else()
-      set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0)
+      list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
     endif()
 
     # Create target to build all Bitcode libraries.
     add_custom_target(omptarget-nvptx-bc)
 
-    # Generate a Bitcode library for all the compute capabilities the user requested.
+    # This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
+    # The last element is the default case.
+    set(cuda_version_list 110 102 101 100 92 91 90 80)
+    set(ptx_feature_list 70 65 64 63 61 61 60 42)
+    # The following two lines of ugly code is not needed when the minimal CMake
+    # version requirement is 3.17+.
+    list(LENGTH cuda_version_list num_version_supported)
+    math(EXPR loop_range "${num_version_supported} - 1")
+
+    # Generate a Bitcode library for all the compute capabilities the user
+    # requested and all PTX version we know for now.
     foreach(sm ${nvptx_sm_list})
-      set(cuda_arch --cuda-gpu-arch=sm_${sm})
-
-      # Compile CUDA files to bitcode.
-      set(bc_files "")
-      foreach(src ${cuda_src_files})
-        get_filename_component(infile ${src} ABSOLUTE)
-        get_filename_component(outfile ${src} NAME)
-
-        add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
-          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION}
-            -c ${infile} -o ${outfile}-sm_${sm}.bc
-          DEPENDS ${infile}
-          IMPLICIT_DEPENDS CXX ${infile}
-          COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc"
-          VERBATIM
+      set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+
+      # Uncomment the following code and remove those ugly part if the feature
+      # is available.
+      # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
+      foreach(itr RANGE ${loop_range})
+        list(GET cuda_version_list ${itr} cuda_version)
+        list(GET ptx_feature_list ${itr} ptx_num)
+        set(cuda_flags ${sm_flags})
+        list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
+        list(APPEND cuda_flags "-DCUDA_VERSION=${cuda_version}00")
+
+        set(bc_files "")
+        foreach(src ${cuda_src_files})
+          get_filename_component(infile ${src} ABSOLUTE)
+          get_filename_component(outfile ${src} NAME)
+          set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
+
+          add_custom_command(OUTPUT ${outfile}
+            COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags}
+              ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
+            DEPENDS ${infile}
+            IMPLICIT_DEPENDS CXX ${infile}
+            COMMENT "Building LLVM bitcode ${outfile}"
+            VERBATIM
+          )
+          set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
+
+          list(APPEND bc_files ${outfile})
+        endforeach()
+
+        set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc")
+
+        # Link to a bitcode library.
+        add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+            COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
+              -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+            DEPENDS ${bc_files}
+            COMMENT "Linking LLVM bitcode ${bclib_name}"
         )
-        set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc)
+        set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
 
-        list(APPEND bc_files ${outfile}-sm_${sm}.bc)
-      endforeach()
+        set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc")
 
-      # Link to a bitcode library.
-      add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
-          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
-            -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files}
-          DEPENDS ${bc_files}
-          COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc"
-      )
-      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc)
-
-      add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc)
-      add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc)
-
-      # Copy library to destination.
-      add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD
-                         COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
-                         ${LIBOMPTARGET_LIBRARY_DIR})
-
-      # Install bitcode library under the lib destination folder.
-      install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+        add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
+        add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
+
+        # Copy library to destination.
+        add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+                          COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+                          ${LIBOMPTARGET_LIBRARY_DIR})
+
+        # Install bitcode library under the lib destination folder.
+        install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+      endforeach()
     endforeach()
   endif()
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
@@ -11,7 +11,8 @@
 
 #include <stdint.h>
 
-#define EXTERN extern "C" __device__
+#define EXTERN extern "C"
+
 typedef uint32_t __kmpc_impl_lanemask_t;
 typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -13,18 +13,16 @@
 #define _TARGET_IMPL_H_
 
 #include <assert.h>
-#include <cuda.h>
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 
 #include "nvptx_interface.h"
 
-#define DEVICE __device__
-#define INLINE __forceinline__ DEVICE
-#define NOINLINE __noinline__ DEVICE
-#define SHARED __shared__
-#define ALIGN(N) __align__(N)
+#define DEVICE
+#define INLINE inline __attribute__((always_inline))
+#define NOINLINE __attribute__((noinline))
+#define ALIGN(N) __attribute__((aligned(N)))
 
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options
@@ -96,10 +94,6 @@
 INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
 INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
 
-#ifndef CUDA_VERSION
-#error CUDA_VERSION macro is undefined, something wrong with cuda.
-#endif
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
 
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -14,8 +14,6 @@
 #include "target_impl.h"
 #include "common/debug.h"
 
-#include <cuda.h>
-
 DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
 }