# Summary of this patch (derived only from the hunks below):
#  - clang/lib/Driver/ToolChains/Cuda.cpp: the OpenMP device RTL bitcode suffix
#    changes from "nvptx-cuda_<version>-<arch>" to "nvptx-<arch>", and the
#    CudaVersionStr local that fed the old suffix is removed. The PtxFeature
#    selection per CUDA version is left in place.
#  - clang/test/Driver: the test input is renamed to libomptarget-nvptx-sm_35.bc
#    and the CHK-BCLIB / CHK-BCLIB-WARN expectations follow the new name.
#  - openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt: adds
#    "-Xclang -target-feature -Xclang +ptx61" to the compile flag list and drops
#    the per-CUDA-version loop, so one libomptarget-nvptx-sm_<sm>.bc is built
#    per entry of nvptx_sm_list.
#  - openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu: removes the
#    CUDA_VERSION preprocessor branches, keeping the activemask.b32 asm, the
#    __nvvm_shfl_sync_* intrinsics and __nvvm_bar_warp_sync.
# NOTE(review): leading whitespace of hunk lines was reconstructed from a
# whitespace-collapsed copy; line counts match every @@ header, but confirm
# indentation against the target tree before applying.
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -711,7 +711,6 @@
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
   clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
-  std::string CudaVersionStr;
 
   // New CUDA versions often introduce new instructions that are only supported
   // by new PTX version, so we need to raise PTX level to enable them in NVPTX
@@ -720,7 +719,6 @@
   switch (CudaInstallationVersion) {
 #define CASE_CUDA_VERSION(CUDA_VER, PTX_VER) \
   case CudaVersion::CUDA_##CUDA_VER: \
-    CudaVersionStr = #CUDA_VER; \
     PtxFeature = "+ptx" #PTX_VER; \
     break;
     CASE_CUDA_VERSION(112, 72);
@@ -734,9 +732,6 @@
     CASE_CUDA_VERSION(90, 60);
 #undef CASE_CUDA_VERSION
   default:
-    // If unknown CUDA version, we take it as CUDA 8.0. Same assumption is also
-    // made in libomptarget/deviceRTLs.
-    CudaVersionStr = "80";
     PtxFeature = "+ptx42";
   }
   CC1Args.append({"-target-feature", PtxFeature});
@@ -757,8 +752,7 @@
     return;
   }
 
-  std::string BitcodeSuffix =
-      "nvptx-cuda_" + CudaVersionStr + "-" + GpuArch.str();
+  std::string BitcodeSuffix = "nvptx-" + GpuArch.str();
   addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
                      getTriple());
 }
diff --git a/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc b/clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_35.bc
rename from clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_102-sm_35.bc
rename to clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_35.bc
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -164,7 +164,7 @@
 // RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN: | FileCheck -check-prefix=CHK-BCLIB-USER %s
 
-// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-cuda_102-sm_35.bc
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-sm_35.bc
 // CHK-BCLIB-USER: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
 // CHK-BCLIB-NOT: {{error:|warning:}}
 
@@ -177,7 +177,7 @@
 // RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN: | FileCheck -check-prefix=CHK-BCLIB-WARN %s
 
-// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-cuda_102-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
 
 /// ###########################################################################
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -137,6 +137,7 @@
              -Xclang -emit-llvm-bc
              -Xclang -aux-triple -Xclang ${aux_triple}
              -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+             -Xclang -target-feature -Xclang +ptx61
              -D__CUDACC__
              -I${devicertl_base_directory}
              -I${devicertl_nvptx_directory}/src)
@@ -150,81 +151,51 @@
 # Create target to build all Bitcode libraries.
 add_custom_target(omptarget-nvptx-bc)
 
-# This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
-# The last element is the default case.
-set(cuda_version_list 112 111 110 102 101 100 92 91 90 80)
-set(ptx_feature_list 71 71 70 65 64 63 61 61 60 42)
-# The following two lines of ugly code is not needed when the minimal CMake
-# version requirement is 3.17+.
-list(LENGTH cuda_version_list num_version_supported)
-math(EXPR loop_range "${num_version_supported} - 1")
-
-# Generate a Bitcode library for all the compute capabilities the user
-# requested and all PTX version we know for now.
+# Generate a Bitcode library for all the compute capabilities the user requested
 foreach(sm ${nvptx_sm_list})
-  set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
-
-  # Uncomment the following code and remove those ugly part if the feature
-  # is available.
-  # foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
-  foreach(itr RANGE ${loop_range})
-    list(GET cuda_version_list ${itr} cuda_version)
-    list(GET ptx_feature_list ${itr} ptx_num)
-    set(cuda_flags ${sm_flags})
-    list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
-    if("${cuda_version}" MATCHES "^([0-9]+)([0-9])$")
-      set(cuda_version_major ${CMAKE_MATCH_1})
-      set(cuda_version_minor ${CMAKE_MATCH_2})
-    else()
-      libomptarget_error_say(
-        "Unrecognized CUDA version format: ${cuda_version}")
-    endif()
-    list(APPEND cuda_flags
-      "-DCUDA_VERSION=${cuda_version_major}0${cuda_version_minor}0")
-
-    set(bc_files "")
-    foreach(src ${cuda_src_files})
-      get_filename_component(infile ${src} ABSOLUTE)
-      get_filename_component(outfile ${src} NAME)
-      set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
-
-      add_custom_command(OUTPUT ${outfile}
-        COMMAND ${cuda_compiler} ${bc_flags}
-          ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
-        DEPENDS ${infile}
-        IMPLICIT_DEPENDS CXX ${infile}
-        COMMENT "Building LLVM bitcode ${outfile}"
-        VERBATIM
-      )
-      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
-
-      list(APPEND bc_files ${outfile})
-    endforeach()
-
-    set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc")
-
-    # Link to a bitcode library.
-    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-        COMMAND ${bc_linker}
-          -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
-        DEPENDS ${bc_files}
-        COMMENT "Linking LLVM bitcode ${bclib_name}"
+  set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+  set(bc_files "")
+  foreach(src ${cuda_src_files})
+    get_filename_component(infile ${src} ABSOLUTE)
+    get_filename_component(outfile ${src} NAME)
+    set(outfile "${outfile}-sm_${sm}.bc")
+
+    add_custom_command(OUTPUT ${outfile}
+      COMMAND ${cuda_compiler} ${bc_flags}
+        ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
+      DEPENDS ${infile}
+      IMPLICIT_DEPENDS CXX ${infile}
+      COMMENT "Building LLVM bitcode ${outfile}"
+      VERBATIM
     )
-    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
+    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
 
-    set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc")
+    list(APPEND bc_files ${outfile})
+  endforeach()
 
-    add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
-    add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
+  set(bclib_name "libomptarget-nvptx-sm_${sm}.bc")
 
-    # Copy library to destination.
-    add_custom_command(TARGET ${bclib_target_name} POST_BUILD
-                       COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
-                       ${LIBOMPTARGET_LIBRARY_DIR})
+  # Link to a bitcode library.
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+      COMMAND ${bc_linker}
+        -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+      DEPENDS ${bc_files}
+      COMMENT "Linking LLVM bitcode ${bclib_name}"
+  )
+  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
 
-    # Install bitcode library under the lib destination folder.
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-  endforeach()
+  set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc")
+
+  add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
+  add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
+
+  # Copy library to destination.
+  add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+                     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+                     ${LIBOMPTARGET_LIBRARY_DIR})
+
+  # Install bitcode library under the lib destination folder.
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
 endforeach()
 
 # Test will be enabled if the building machine supports CUDA
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -53,46 +53,28 @@
   return (double)nsecs * __kmpc_impl_get_wtick();
 }
 
-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION < 9020
-  return __nvvm_vote_ballot(1);
-#else
   unsigned int Mask;
   asm volatile("activemask.b32 %0;" : "=r"(Mask));
   return Mask;
-#endif
 }
 
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                      int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-#else
-  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
-#endif // CUDA_VERSION
 }
 
 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                           int32_t Var, uint32_t Delta,
                                           int32_t Width) {
   int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-#if CUDA_VERSION >= 9000
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-#else
-  return __nvvm_shfl_down_i32(Var, Delta, T);
-#endif // CUDA_VERSION
 }
 
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
-#if CUDA_VERSION >= 9000
   __nvvm_bar_warp_sync(Mask);
-#else
-  // In Cuda < 9.0 no need to sync threads in warps.
-#endif // CUDA_VERSION
 }
 
 // NVPTX specific kernel initialization