diff --git a/openmp/README.rst b/openmp/README.rst
--- a/openmp/README.rst
+++ b/openmp/README.rst
@@ -281,7 +281,7 @@
 
  **LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES** = ``35``
   List of CUDA compute capabilities that should be supported by the NVPTX
-  device RTL. E.g. for compute capabilities 6.0 and 7.0, the option "60,70"
+  device RTL. E.g. for compute capabilities 6.0 and 7.0, the option "60;70"
   should be used. Compute capability 3.5 is the minimum required.
 
  **LIBOMPTARGET_NVPTX_DEBUG** = ``OFF|ON``
diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
--- a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -12,7 +12,7 @@
 # components of libomptarget. These are the dependencies we have:
 #
 # libelf : required by some targets to handle the ELF files at runtime.
-# libffi : required to launch target kernels given function and argument 
+# libffi : required to launch target kernels given function and argument
 #          pointers.
 # CUDA : required to control offloading to NVIDIA GPUs.
 # VEOS : required to control offloading to NEC Aurora.
@@ -47,18 +47,18 @@
     /sw/lib
     ENV LIBRARY_PATH
     ENV LD_LIBRARY_PATH)
-    
+
 set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
 find_package_handle_standard_args(
-  LIBOMPTARGET_DEP_LIBELF 
+  LIBOMPTARGET_DEP_LIBELF
   DEFAULT_MSG
   LIBOMPTARGET_DEP_LIBELF_LIBRARIES
   LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS)
 
 mark_as_advanced(
-  LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS 
+  LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS
   LIBOMPTARGET_DEP_LIBELF_LIBRARIES)
-  
+
 ################################################################################
 # Looking for libffi...
 ################################################################################
@@ -100,15 +100,15 @@
 
 set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
 find_package_handle_standard_args(
-  LIBOMPTARGET_DEP_LIBFFI 
+  LIBOMPTARGET_DEP_LIBFFI
   DEFAULT_MSG
   LIBOMPTARGET_DEP_LIBFFI_LIBRARIES
   LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS)
 
 mark_as_advanced(
-  LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS 
+  LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS
   LIBOMPTARGET_DEP_LIBFFI_LIBRARIES)
-  
+
 ################################################################################
 # Looking for CUDA...
 ################################################################################
@@ -118,7 +118,9 @@
 find_package(CUDA QUIET)
 
 # Try to get the highest Nvidia GPU architecture the system supports
-if (CUDA_FOUND)
+set(LIBOMPTARGET_NVPTX_AUTODETECT_COMPUTE_CAPABILITY TRUE CACHE BOOL
+  "Auto detect CUDA Compute Capability if CUDA is detected.")
+if (CUDA_FOUND AND LIBOMPTARGET_NVPTX_AUTODETECT_COMPUTE_CAPABILITY)
   cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS)
   string(REGEX MATCH "sm_([0-9]+)" CUDA_ARCH_MATCH_OUTPUT ${CUDA_ARCH_FLAGS})
   if (NOT DEFINED CUDA_ARCH_MATCH_OUTPUT OR "${CMAKE_MATCH_1}" LESS 35)
@@ -133,7 +135,7 @@
 set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS})
 
 mark_as_advanced(
-  LIBOMPTARGET_DEP_CUDA_FOUND 
+  LIBOMPTARGET_DEP_CUDA_FOUND
   LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS)
 
 ################################################################################
diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
deleted file mode 100644
--- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
+++ /dev/null
@@ -1,111 +0,0 @@
-#
-#//===----------------------------------------------------------------------===//
-#//
-#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-#// See https://llvm.org/LICENSE.txt for license information.
-#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#//
-#//===----------------------------------------------------------------------===//
-#
-
-# We use the compiler and linker provided by the user, attempt to use the one
-# used to build libomptarget or just fail.
-set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE)
-
-if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
-  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
-elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
-  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER})
-else()
-  return()
-endif()
-
-# Get compiler directory to try to locate a suitable linker.
-get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY)
-set(llvm_link "${compiler_dir}/llvm-link")
-
-if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
-  set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER})
-elseif (EXISTS "${llvm_link}")
-  # Use llvm-link from the compiler directory.
-  set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}")
-else()
-  return()
-endif()
-
-function(try_compile_bitcode output source)
-  set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu)
-  file(WRITE ${srcfile} "${source}\n")
-  set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc)
-
-  # The remaining arguments are the flags to be tested.
-  # FIXME: Don't hardcode GPU version. This is currently required because
-  #        Clang refuses to compile its default of sm_20 with CUDA 9.
-  execute_process(
-    COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN}
-      --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile}
-    RESULT_VARIABLE result
-    OUTPUT_QUIET ERROR_QUIET)
-  if (result EQUAL 0)
-    set(${output} TRUE PARENT_SCOPE)
-  else()
-    set(${output} FALSE PARENT_SCOPE)
-  endif()
-endfunction()
-
-# Save for which compiler we are going to do the following checks so that we
-# can discard cached values if the user specifies a different value.
-set(discard_cached FALSE)
-if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND
-    NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}"))
-  set(discard_cached TRUE)
-endif()
-set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE)
-
-function(check_bitcode_compilation output source)
-  if (${discard_cached} OR NOT DEFINED ${output})
-    message(STATUS "Performing Test ${output}")
-    # Forward additional arguments which contain the flags.
-    try_compile_bitcode(result "${source}" ${ARGN})
-    set(${output} ${result} CACHE INTERNAL "" FORCE)
-    if(${result})
-      message(STATUS "Performing Test ${output} - Success")
-    else()
-      message(STATUS "Performing Test ${output} - Failed")
-    endif()
-  endif()
-endfunction()
-
-# These flags are required to emit LLVM Bitcode. We check them together because
-# if any of them are not supported, there is no point in finding out which are.
-set(compiler_flags_required -emit-llvm -O1 --cuda-device-only -std=c++14 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
-set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }")
-check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required})
-
-# It makes no sense to continue given that the compiler doesn't support
-# emitting basic LLVM Bitcode
-if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED)
-  return()
-endif()
-
-set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required})
-
-# Declaring external shared device variables might need an additional flag
-# since Clang 7.0 and was entirely unsupported since version 4.0.
-set(extern_device_shared_src "extern __device__ __shared__ int test;")
-
-check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS})
-if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED)
-  set(compiler_flag_fcuda_rdc -fcuda-rdc)
-  set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc})
-  check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full})
-
-  if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC)
-    return()
-  endif()
-
-  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}")
-endif()
-
-# We can compile LLVM Bitcode from CUDA source code!
-set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -10,6 +10,37 @@
 #
 ##===----------------------------------------------------------------------===##
 
+# Check if we can create an LLVM bitcode implementation of the runtime library
+# that could be inlined in the user application. For that we need to find
+# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
+# an LLVM linker.
+set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
+  "Location of a CUDA compiler capable of emitting LLVM bitcode.")
+set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
+  "Location of a linker capable of linking LLVM bitcode objects.")
+
+if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
+  set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
+elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
+  set(cuda_compiler ${CMAKE_C_COMPILER})
+else()
+  libomptarget_say("Not building NVPTX deviceRTL: clang not found")
+  return()
+endif()
+
+# Get compiler directory to try to locate a suitable linker.
+get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY)
+set(llvm_link "${compiler_dir}/llvm-link")
+
+if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
+  set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER})
+elseif (EXISTS ${llvm_link})
+  set(bc_linker ${llvm_link})
+else()
+  libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found")
+  return()
+endif()
+
 # TODO: This part needs to be refined when libomptarget is going to support
 # Windows!
 # TODO: This part can also be removed if we can change the clang driver to make
@@ -33,156 +64,147 @@
 set(devicertl_nvptx_directory
   ${devicertl_base_directory}/nvptx)
 
-if(LIBOMPTARGET_DEP_CUDA_FOUND)
-  # Build library support for the highest compute capability the system supports
-  # and always build support for sm_35 by default
-  if (${LIBOMPTARGET_DEP_CUDA_ARCH} EQUAL 35)
-    set(default_capabilities 35)
-  else()
-      set(default_capabilities "35,${LIBOMPTARGET_DEP_CUDA_ARCH}")
-  endif()
+if (DEFINED LIBOMPTARGET_DEP_CUDA_ARCH)
+  set(default_capabilities ${LIBOMPTARGET_DEP_CUDA_ARCH})
+else()
+  set(default_capabilities 35 37 50 52 53 60 61 62 70 72 75 80)
+endif()
 
-  if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY)
-    set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})
-    libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES")
-  endif()
-  set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING
-    "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
-  string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES})
+set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING
+  "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
 
-  foreach(sm ${nvptx_sm_list})
-    set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
-  endforeach()
+set(nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES})
+
+# If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the
+# build.
+if (NOT nvptx_sm_list)
+  libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list")
+  return()
+endif()
 
-  # Override default MAX_SM in src/target_impl.h if requested
-  if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
-    set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
+# Check all SM values
+foreach(sm ${nvptx_sm_list})
+  if (NOT ${sm} IN_LIST default_capabilities)
+    message(FATAL_ERROR "LIBOMPTARGET-NVPTX: compute capability ${sm} is not supported. Supported values: ${default_capabilities}")
   endif()
+endforeach()
 
-  # Activate RTL message dumps if requested by the user.
-  set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
-    "Activate NVPTX device RTL debug messages.")
-
-  # Check if we can create an LLVM bitcode implementation of the runtime library
-  # that could be inlined in the user application. For that we need to find
-  # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
-  # an LLVM linker.
-  set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
-    "Location of a CUDA compiler capable of emitting LLVM bitcode.")
-  set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
-    "Location of a linker capable of linking LLVM bitcode objects.")
-
-  include(LibomptargetNVPTXBitcodeLibrary)
-
-  if (LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED)
-    libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.")
-
-    set(cuda_src_files
-      ${devicertl_common_directory}/src/cancel.cu
-      ${devicertl_common_directory}/src/critical.cu
-      ${devicertl_common_directory}/src/data_sharing.cu
-      ${devicertl_common_directory}/src/libcall.cu
-      ${devicertl_common_directory}/src/loop.cu
-      ${devicertl_common_directory}/src/omp_data.cu
-      ${devicertl_common_directory}/src/omptarget.cu
-      ${devicertl_common_directory}/src/parallel.cu
-      ${devicertl_common_directory}/src/reduction.cu
-      ${devicertl_common_directory}/src/support.cu
-      ${devicertl_common_directory}/src/sync.cu
-      ${devicertl_common_directory}/src/task.cu
-      src/target_impl.cu
-    )
+# Override default MAX_SM in src/target_impl.h if requested
+if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
+  set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
+endif()
+
+# Activate RTL message dumps if requested by the user.
+set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
+  "Activate NVPTX device RTL debug messages.")
+
+libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.")
+
+set(cuda_src_files
+  ${devicertl_common_directory}/src/cancel.cu
+  ${devicertl_common_directory}/src/critical.cu
+  ${devicertl_common_directory}/src/data_sharing.cu
+  ${devicertl_common_directory}/src/libcall.cu
+  ${devicertl_common_directory}/src/loop.cu
+  ${devicertl_common_directory}/src/omp_data.cu
+  ${devicertl_common_directory}/src/omptarget.cu
+  ${devicertl_common_directory}/src/parallel.cu
+  ${devicertl_common_directory}/src/reduction.cu
+  ${devicertl_common_directory}/src/support.cu
+  ${devicertl_common_directory}/src/sync.cu
+  ${devicertl_common_directory}/src/task.cu
+  src/target_impl.cu
+)
+
+# Set flags for LLVM Bitcode compilation.
+set(bc_flags -S -x c++
+              -target nvptx64
+              -Xclang -emit-llvm-bc
+              -Xclang -aux-triple -Xclang ${aux_triple}
+              -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+              -D__CUDACC__
+              -I${devicertl_base_directory}
+              -I${devicertl_nvptx_directory}/src)
+
+if(${LIBOMPTARGET_NVPTX_DEBUG})
+  list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1)
+else()
+  list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
+endif()
 
-    # Set flags for LLVM Bitcode compilation.
-    set(bc_flags -S -x c++
-                 -target nvptx64
-                 -Xclang -emit-llvm-bc
-                 -Xclang -aux-triple -Xclang ${aux_triple}
-                 -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-                 -D__CUDACC__
-                 -I${devicertl_base_directory}
-                 -I${devicertl_nvptx_directory}/src)
-
-    if(${LIBOMPTARGET_NVPTX_DEBUG})
-      list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1)
-    else()
-      list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
-    endif()
-
-    # Create target to build all Bitcode libraries.
-    add_custom_target(omptarget-nvptx-bc)
-
-    # This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
-    # The last element is the default case.
-    set(cuda_version_list 110 102 101 100 92 91 90 80)
-    set(ptx_feature_list 70 65 64 63 61 61 60 42)
-    # The following two lines of ugly code is not needed when the minimal CMake
-    # version requirement is 3.17+.
-    list(LENGTH cuda_version_list num_version_supported)
-    math(EXPR loop_range "${num_version_supported} - 1")
-
-    # Generate a Bitcode library for all the compute capabilities the user
-    # requested and all PTX version we know for now.
-    foreach(sm ${nvptx_sm_list})
-      set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
-
-      # Uncomment the following code and remove those ugly part if the feature
-      # is available.
-      # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
-      foreach(itr RANGE ${loop_range})
-        list(GET cuda_version_list ${itr} cuda_version)
-        list(GET ptx_feature_list ${itr} ptx_num)
-        set(cuda_flags ${sm_flags})
-        list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
-        list(APPEND cuda_flags "-DCUDA_VERSION=${cuda_version}00")
-
-        set(bc_files "")
-        foreach(src ${cuda_src_files})
-          get_filename_component(infile ${src} ABSOLUTE)
-          get_filename_component(outfile ${src} NAME)
-          set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
-
-          add_custom_command(OUTPUT ${outfile}
-            COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags}
-              ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
-            DEPENDS ${infile}
-            IMPLICIT_DEPENDS CXX ${infile}
-            COMMENT "Building LLVM bitcode ${outfile}"
-            VERBATIM
-          )
-          set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
-
-          list(APPEND bc_files ${outfile})
-        endforeach()
-
-        set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc")
-
-        # Link to a bitcode library.
-        add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-            COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
-              -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
-            DEPENDS ${bc_files}
-            COMMENT "Linking LLVM bitcode ${bclib_name}"
-        )
-        set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
-
-        set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc")
-
-        add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
-        add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
-
-        # Copy library to destination.
-        add_custom_command(TARGET ${bclib_target_name} POST_BUILD
-                          COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-                          ${LIBOMPTARGET_LIBRARY_DIR})
-
-        # Install bitcode library under the lib destination folder.
-        install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-      endforeach()
+# Create target to build all Bitcode libraries.
+add_custom_target(omptarget-nvptx-bc)
+
+# This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
+# The last element is the default case.
+set(cuda_version_list 110 102 101 100 92 91 90 80)
+set(ptx_feature_list 70 65 64 63 61 61 60 42)
+# The following two lines of ugly code is not needed when the minimal CMake
+# version requirement is 3.17+.
+list(LENGTH cuda_version_list num_version_supported)
+math(EXPR loop_range "${num_version_supported} - 1")
+
+# Generate a Bitcode library for all the compute capabilities the user
+# requested and all PTX version we know for now.
+foreach(sm ${nvptx_sm_list})
+  set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+
+  # Uncomment the following code and remove those ugly part if the feature
+  # is available.
+  # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
+  foreach(itr RANGE ${loop_range})
+    list(GET cuda_version_list ${itr} cuda_version)
+    list(GET ptx_feature_list ${itr} ptx_num)
+    set(cuda_flags ${sm_flags})
+    list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
+    list(APPEND cuda_flags "-DCUDA_VERSION=${cuda_version}00")
+
+    set(bc_files "")
+    foreach(src ${cuda_src_files})
+      get_filename_component(infile ${src} ABSOLUTE)
+      get_filename_component(outfile ${src} NAME)
+      set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
+
+      add_custom_command(OUTPUT ${outfile}
+        COMMAND ${cuda_compiler} ${bc_flags}
+          ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
+        DEPENDS ${infile}
+        IMPLICIT_DEPENDS CXX ${infile}
+        COMMENT "Building LLVM bitcode ${outfile}"
+        VERBATIM
+      )
+      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
+
+      list(APPEND bc_files ${outfile})
     endforeach()
-  endif()
 
+    set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc")
+
+    # Link to a bitcode library.
+    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+        COMMAND ${bc_linker}
+          -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+        DEPENDS ${bc_files}
+        COMMENT "Linking LLVM bitcode ${bclib_name}"
+    )
+    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
+
+    set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc")
+
+    add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
+    add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
+
+    # Copy library to destination.
+    add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+                      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+                      ${LIBOMPTARGET_LIBRARY_DIR})
+
+    # Install bitcode library under the lib destination folder.
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+  endforeach()
+endforeach()
+
+# Test will be enabled if the building machine supports CUDA
+if (LIBOMPTARGET_DEP_CUDA_FOUND)
   add_subdirectory(test)
-else()
-  libomptarget_say("Not building CUDA offloading device RTL: tools to build bc lib not found in the system.")
 endif()