diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -711,7 +711,6 @@
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
   clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
-  std::string CudaVersionStr;
 
   // New CUDA versions often introduce new instructions that are only supported
   // by new PTX version, so we need to raise PTX level to enable them in NVPTX
@@ -720,7 +719,6 @@
   switch (CudaInstallationVersion) {
 #define CASE_CUDA_VERSION(CUDA_VER, PTX_VER)                                   \
   case CudaVersion::CUDA_##CUDA_VER:                                           \
-    CudaVersionStr = #CUDA_VER;                                                \
     PtxFeature = "+ptx" #PTX_VER;                                              \
     break;
     CASE_CUDA_VERSION(112, 72);
@@ -734,9 +732,6 @@
     CASE_CUDA_VERSION(90, 60);
 #undef CASE_CUDA_VERSION
   default:
-    // If unknown CUDA version, we take it as CUDA 8.0. Same assumption is also
-    // made in libomptarget/deviceRTLs.
-    CudaVersionStr = "80";
     PtxFeature = "+ptx42";
   }
   CC1Args.append({"-target-feature", PtxFeature});
@@ -757,8 +752,7 @@
       return;
     }
 
-    std::string BitcodeSuffix =
-        "nvptx-cuda_" + CudaVersionStr + "-" + GpuArch.str();
+    std::string BitcodeSuffix = "nvptx-" + GpuArch.str();
     addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
                        getTriple());
   }
diff --git a/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc b/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_35.bc
rename from clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc
rename to clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_35.bc
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -164,7 +164,7 @@
 // RUN:   -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-BCLIB-USER %s
 
-// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-cuda_102-sm_35.bc
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-sm_35.bc
 // CHK-BCLIB-USER: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
 // CHK-BCLIB-NOT: {{error:|warning:}}
 
@@ -177,7 +177,7 @@
 // RUN:   -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-BCLIB-WARN %s
 
-// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-cuda_102-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
 
 /// ###########################################################################
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -137,6 +137,7 @@
              -Xclang -emit-llvm-bc
              -Xclang -aux-triple -Xclang ${aux_triple}
              -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+             -Xclang -target-feature -Xclang +ptx61
              -D__CUDACC__
              -I${devicertl_base_directory}
              -I${devicertl_nvptx_directory}/src)
@@ -150,81 +151,51 @@
 # Create target to build all Bitcode libraries.
 add_custom_target(omptarget-nvptx-bc)
 
-# This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
-# The last element is the default case.
-set(cuda_version_list 112 111 110 102 101 100 92 91 90 80)
-set(ptx_feature_list 71 71 70 65 64 63 61 61 60 42)
-# The following two lines of ugly code is not needed when the minimal CMake
-# version requirement is 3.17+.
-list(LENGTH cuda_version_list num_version_supported)
-math(EXPR loop_range "${num_version_supported} - 1")
-
-# Generate a Bitcode library for all the compute capabilities the user
-# requested and all PTX version we know for now.
+# Generate one bitcode library for each compute capability the user requested.
 foreach(sm ${nvptx_sm_list})
-  set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
-
-  # Uncomment the following code and remove those ugly part if the feature
-  # is available.
-  # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
-  foreach(itr RANGE ${loop_range})
-    list(GET cuda_version_list ${itr} cuda_version)
-    list(GET ptx_feature_list ${itr} ptx_num)
-    set(cuda_flags ${sm_flags})
-    list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
-    if("${cuda_version}" MATCHES "^([0-9]+)([0-9])$")
-      set(cuda_version_major ${CMAKE_MATCH_1})
-      set(cuda_version_minor ${CMAKE_MATCH_2})
-    else()
-      libomptarget_error_say(
-        "Unrecognized CUDA version format: ${cuda_version}")
-    endif()
-    list(APPEND cuda_flags
-      "-DCUDA_VERSION=${cuda_version_major}0${cuda_version_minor}0")
-
-    set(bc_files "")
-    foreach(src ${cuda_src_files})
-      get_filename_component(infile ${src} ABSOLUTE)
-      get_filename_component(outfile ${src} NAME)
-      set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
-
-      add_custom_command(OUTPUT ${outfile}
-        COMMAND ${cuda_compiler} ${bc_flags}
-          ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
-        DEPENDS ${infile}
-        IMPLICIT_DEPENDS CXX ${infile}
-        COMMENT "Building LLVM bitcode ${outfile}"
-        VERBATIM
-      )
-      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
-
-      list(APPEND bc_files ${outfile})
-    endforeach()
-
-    set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc")
-
-    # Link to a bitcode library.
-    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-        COMMAND ${bc_linker}
-          -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
-        DEPENDS ${bc_files}
-        COMMENT "Linking LLVM bitcode ${bclib_name}"
+  set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+  set(bc_files "")
+  foreach(src ${cuda_src_files})
+    get_filename_component(infile ${src} ABSOLUTE)
+    get_filename_component(outfile ${src} NAME)
+    set(outfile "${outfile}-sm_${sm}.bc")
+
+    add_custom_command(OUTPUT ${outfile}
+      COMMAND ${cuda_compiler} ${bc_flags}
+        ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
+      DEPENDS ${infile}
+      IMPLICIT_DEPENDS CXX ${infile}
+      COMMENT "Building LLVM bitcode ${outfile}"
+      VERBATIM
     )
-    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
+    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
 
-    set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc")
+    list(APPEND bc_files ${outfile})
+  endforeach()
 
-    add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
-    add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
+  set(bclib_name "libomptarget-nvptx-sm_${sm}.bc")
 
-    # Copy library to destination.
-    add_custom_command(TARGET ${bclib_target_name} POST_BUILD
-                      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-                      ${LIBOMPTARGET_LIBRARY_DIR})
+  # Link the per-source bitcode files into a single bitcode library.
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+      COMMAND ${bc_linker}
+        -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+      DEPENDS ${bc_files}
+      COMMENT "Linking LLVM bitcode ${bclib_name}"
+  )
+  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
 
-    # Install bitcode library under the lib destination folder.
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-  endforeach()
+  set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc")
+
+  add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
+  add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
+
+  # Copy the linked library into ${LIBOMPTARGET_LIBRARY_DIR} once the target is built.
+  add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+                    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+                    ${LIBOMPTARGET_LIBRARY_DIR})
+
+  # Install bitcode library under the lib destination folder.
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
 endforeach()
 
 # Test will be enabled if the building machine supports CUDA
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -53,46 +53,28 @@
   return (double)nsecs * __kmpc_impl_get_wtick();
 }
 
-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION < 9020
-  return __nvvm_vote_ballot(1);
-#else
   unsigned int Mask;
   asm volatile("activemask.b32 %0;" : "=r"(Mask));
   return Mask;
-#endif
 }
 
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                      int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-#else
-  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
-#endif // CUDA_VERSION
 }
 
 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                           int32_t Var, uint32_t Delta,
                                           int32_t Width) {
   int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-#else
-  return __nvvm_shfl_down_i32(Var, Delta, T);
-#endif // CUDA_VERSION
 }
 
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
-#if CUDA_VERSION >= 9000
   __nvvm_bar_warp_sync(Mask);
-#else
-  // In Cuda < 9.0 no need to sync threads in warps.
-#endif // CUDA_VERSION
 }
 
 // NVPTX specific kernel initialization