Index: README.rst =================================================================== --- README.rst +++ README.rst @@ -257,9 +257,11 @@ Options for ``NVPTX device RTL`` -------------------------------- -**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``OFF|ON`` +**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``ON|OFF`` Enable CUDA LLVM bitcode offloading device RTL. This is used for link time - optimization of the OMP runtime and application code. + optimization of the OMP runtime and application code. This option is enabled + by default if the build system determines that `CMAKE_C_COMPILER` is able to + compile and link the library. **LIBOMPTARGET_NVPTX_CUDA_COMPILER** = ``""`` Location of a CUDA compiler capable of emitting LLVM bitcode. Currently only Index: libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake =================================================================== --- /dev/null +++ libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -0,0 +1,112 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# We use the compiler and linker provided by the user, attempt to use the one +# used to build libomptarget or just fail. +set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE) + +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) +elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) +else() + return() +endif() + +# Get compiler directory to try to locate a suitable linker. +get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY) +set(llvm_link "${compiler_dir}/llvm-link") + +if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) +elseif (EXISTS "${llvm_link}") + # Use llvm-link from the compiler directory. + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}") +else() + return() +endif() + +function(try_compile_bitcode output source) + set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu) + file(WRITE ${srcfile} "${source}\n") + set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc) + + # The remaining arguments are the flags to be tested. + # FIXME: Don't hardcode GPU version. This is currently required because + # Clang refuses to compile its default of sm_20 with CUDA 9. + execute_process( + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN} + --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile} + RESULT_VARIABLE result + OUTPUT_QUIET ERROR_QUIET) + if (result EQUAL 0) + set(${output} TRUE PARENT_SCOPE) + else() + set(${output} FALSE PARENT_SCOPE) + endif() +endfunction() + +# Save for which compiler we are going to do the following checks so that we +# can discard cached values if the user specifies a different value. +set(discard_cached FALSE) +if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND + NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}")) + set(discard_cached TRUE) +endif() +set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE) + +function(check_bitcode_compilation output source) + if (${discard_cached} OR NOT DEFINED ${output}) + message(STATUS "Performing Test ${output}") + # Forward additional arguments which contain the flags. + try_compile_bitcode(result "${source}" ${ARGN}) + set(${output} ${result} CACHE INTERNAL "" FORCE) + if(${result}) + message(STATUS "Performing Test ${output} - Success") + else() + message(STATUS "Performing Test ${output} - Failed") + endif() + endif() +endfunction() + +# These flags are required to emit LLVM Bitcode. We check them together because +# if any of them are not supported, there is no point in finding out which are. +set(compiler_flags_required -emit-llvm -O1 --cuda-device-only) +set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }") +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required}) + +# It makes no sense to continue given that the compiler doesn't support +# emitting basic LLVM Bitcode +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED) + return() +endif() + +set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required}) + +# Declaring external shared device variables might need an additional flag +# since Clang 7.0 and was entirely unsupported since version 4.0. +set(extern_device_shared_src "extern __device__ __shared__ int test;") + +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}) +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED) + set(compiler_flag_fcuda_rdc -fcuda-rdc) + set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc}) + check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full}) + + if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC) + return() + endif() + + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}") +endif() + +# We can compile LLVM Bitcode from CUDA source code! +set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE) Index: libomptarget/deviceRTLs/nvptx/CMakeLists.txt =================================================================== --- libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -93,122 +93,87 @@ target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) + # Check if we can create an LLVM bitcode implementation of the runtime library - # that could be inlined in the user implementation. - set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB FALSE CACHE BOOL + # that could be inlined in the user application. For that we need to find + # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and + # an LLVM linker. + set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") + set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + + include(LibomptargetNVPTXBitcodeLibrary) + + set(bclib_default FALSE) + if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + set(bclib_default TRUE) + endif() + set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL "Enable CUDA LLVM bitcode offloading device RTL.") if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) - - # Find a clang compiler capable of compiling cuda files to LLVM bitcode and - # an LLVM linker. - # We use the one provided by the user, attempt to use the one used to build - # libomptarget or just fail. - - set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING - "Location of a CUDA compiler capable of emitting LLVM bitcode.") - set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING - "Location of a linker capable of linking LLVM bitcode objects.") - - if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) - elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) - else() - libomptarget_error_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.") - libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER") + if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!") endif() + libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") - # Get compiler directory to try to locate a suitable linker - get_filename_component(COMPILER_DIR ${CMAKE_C_COMPILER} DIRECTORY) - - if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") - set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) - elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND EXISTS "${COMPILER_DIR}/llvm-link") - # Use llvm-link from the directory containing clang - set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${COMPILER_DIR}/llvm-link) + # Set flags for LLVM Bitcode compilation. + set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} -DOMPTARGET_NVPTX_TEST=0) + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) else() - libomptarget_error_say("Cannot find a linker capable of linking LLVM bitcode objects.") - libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER") + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) endif() - if(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER AND LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER) - libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") - - # Decide which ptx version to use. Same choices as Clang. - if(CUDA_VERSION_MAJOR GREATER 9 OR CUDA_VERSION_MAJOR EQUAL 9) - set(CUDA_PTX_VERSION ptx60) - else() - set(CUDA_PTX_VERSION ptx42) - endif() - - set(BC_DEBUG -DOMPTARGET_NVPTX_DEBUG=0) - if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(BC_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1) - endif() - - # Set flags for Clang cuda compilation. Only Clang is supported because there is - # no other compiler capable of generating bitcode from cuda sources. - set(CUDA_FLAGS - -emit-llvm - -O1 - -Xclang -target-feature - -Xclang +${CUDA_PTX_VERSION} - --cuda-device-only - -DOMPTARGET_NVPTX_TEST=0 - ${BC_DEBUG} - ) + # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared + # to handle. Therefore, we use 'weak' instead. We are compiling only for the + # device, so it should be equivalent. + if(CUDA_VERSION_MAJOR GREATER 8) + set(bc_flags ${bc_flags} -Dnv_weak=weak) + endif() - # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared - # to handle. Therefore, we use 'weak' instead. We are compiling only for the - # device, so it should be equivalent. - if(CUDA_VERSION_MAJOR EQUAL 9) - set(CUDA_FLAGS ${CUDA_FLAGS} -Dnv_weak=weak) - endif() - - # Get the compute capability the user requested or use SM_35 by default. - set(CUDA_ARCH "") - foreach(sm ${nvptx_sm_list}) - set(CUDA_ARCH --cuda-gpu-arch=sm_${sm}) - - # Compile cuda files to bitcode. - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - - add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${CUDA_FLAGS} ${CUDA_ARCH} ${CUDA_INCLUDES} - -c ${infile} -o ${outfile}-sm_${sm}.bc - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" - VERBATIM - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) - - list(APPEND bc_files ${outfile}-sm_${sm}.bc) - endforeach() - - # Link to a bitcode library. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} - -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" + # Generate a Bitcode library for all the compute capabilities the user requested. + foreach(sm ${nvptx_sm_list}) + set(cuda_arch --cuda-gpu-arch=sm_${sm}) + + # Compile CUDA files to bitcode. + set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + + add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} + -c ${infile} -o ${outfile}-sm_${sm}.bc + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" + VERBATIM ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) - add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) + list(APPEND bc_files ${outfile}-sm_${sm}.bc) + endforeach() - # Copy library to destination. - add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - $) + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) - # Install device RTL under the lib destination folder. - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "lib") - endforeach() - endif() + add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) + + # Copy library to destination. + add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc + $) + + # Install device RTL under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "lib") + endforeach() endif() else()