diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -52,19 +52,167 @@
     endif()
   endif()
   if (LIBC_TARGET_ARCHITECTURE_IS_GPU)
-    list(APPEND compile_options "-fopenmp")
-    list(APPEND compile_options "-fopenmp-cuda-mode")
-    foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES})
-      list(APPEND compile_options "--offload-arch=${gpu_arch}")
-    endforeach()
     list(APPEND compile_options "-nogpulib")
-    list(APPEND compile_options "-nogpuinc")
     list(APPEND compile_options "-fvisibility=hidden")
-    list(APPEND compile_options "-foffload-lto")
   endif()
   set(${output_var} ${compile_options} PARENT_SCOPE)
 endfunction()
 
+# Builds the entrypoint target for the GPU.
+#
+# One LLVM-IR object library named '<target_name>.<arch>' is created for every
+# entry of 'all_gpu_architectures'. The resulting images are packaged into
+# '<target_name>.gpubin', which is embedded into an otherwise empty host stub
+# object library named '<target_name>'. The '<target_name>.__internal__'
+# object library is only created when LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU
+# or LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX is true.
+#
+# NOTE: NAME, CXX_STANDARD and FLAGS are parsed, but the stub and internal
+# targets are configured from the 'ADD_ENTRYPOINT_OBJ_*' variables. Those,
+# together with 'common_compile_options', 'full_deps_list', 'fq_deps_list',
+# 'ENTRYPOINT_OBJ_TARGET_TYPE' and (for the packaged targets) 'include_dirs',
+# are not set by this function and must be visible from the calling scope.
+#
+# Usage:
+#     _build_gpu_entrypoint_objects(
+#       <target_name>
+#       SRCS <list of .cpp files>
+#       HDRS <list of .h files>
+#       DEPENDS <list of dependencies>
+#       COMPILE_OPTIONS <list of compile options>
+#       FLAGS <list of flags>
+#     )
+function(_build_gpu_entrypoint_objects fq_target_name)
+  cmake_parse_arguments(
+    "ADD_GPU_ENTRYPOINT_OBJ"
+    "" # No optional arguments
+    "NAME;CXX_STANDARD" # Single value arguments
+    "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS" # Multi value arguments
+    ${ARGN}
+  )
+
+  # The packaged version will be built for every target GPU architecture. We do
+  # this so we can support multiple accelerators on the same machine.
+  # NOTE(review): this iterates 'all_gpu_architectures' and not the
+  # user-configurable LIBC_GPU_ARCHITECTURES cache variable; confirm that
+  # ignoring the user's selection is intended.
+  foreach(gpu_arch ${all_gpu_architectures})
+    set(gpu_target_name ${fq_target_name}.${gpu_arch})
+    set(compile_options ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS})
+    # Derive the triple from the specified architecture.
+    if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
+      set(gpu_target_triple "amdgcn-amd-amdhsa")
+      list(APPEND compile_options "-mcpu=${gpu_arch}")
+    elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
+      set(gpu_target_triple "nvptx64-nvidia-cuda")
+      list(APPEND compile_options "-march=${gpu_arch}")
+    else()
+      message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
+    endif()
+    list(APPEND compile_options "--target=${gpu_target_triple}")
+    list(APPEND compile_options "-emit-llvm")
+
+    # Build the library for this target architecture. We always emit LLVM-IR for
+    # packaged GPU binaries.
+    add_library(${gpu_target_name}
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_GPU_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_GPU_ENTRYPOINT_OBJ_HDRS}
+    )
+
+    target_compile_options(${gpu_target_name} PRIVATE ${compile_options})
+    target_include_directories(${gpu_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${gpu_target_name} ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS})
+    target_compile_definitions(${gpu_target_name} PRIVATE LLVM_LIBC_PUBLIC_PACKAGING)
+
+    # Append this target to a list of images to package into a single binary.
+    set(input_file $<TARGET_OBJECTS:${gpu_target_name}>)
+    list(APPEND packager_images
+         --image=file=${input_file},arch=${gpu_arch},triple=${gpu_target_triple})
+    list(APPEND gpu_target_names ${gpu_target_name})
+  endforeach()
+
+  # After building the target for the desired GPUs we must package the output
+  # into a fatbinary, see https://clang.llvm.org/docs/OffloadingDesign.html for
+  # more information.
+  set(packaged_target_name ${fq_target_name}.__gpu__)
+  set(packaged_output_name ${CMAKE_CURRENT_BINARY_DIR}/${fq_target_name}.gpubin)
+
+  add_custom_command(OUTPUT ${packaged_output_name}
+                     COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
+                             ${packager_images} -o ${packaged_output_name}
+                     DEPENDS ${gpu_target_names}
+                     COMMENT "Packaging LLVM offloading binary"
+                     VERBATIM)
+  add_custom_target(${packaged_target_name} DEPENDS ${packaged_output_name})
+
+  # We create an empty 'stub' file for the host to contain the embedded device
+  # code. This will be packaged into 'libcgpu.a'.
+  # TODO: In the future we will want to combine every architecture for a target
+  #       into a single bitcode file and use that. For now we simply build for
+  #       every single one and let the offloading linker handle it.
+  get_filename_component(stub_filename ${ADD_GPU_ENTRYPOINT_OBJ_SRCS} NAME)
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${stub_filename} "// Empty file.\n")
+  add_library(
+    ${fq_target_name}
+    # We want an object library as the objects will eventually get packaged into
+    # an archive (like libcgpu.a).
+    EXCLUDE_FROM_ALL
+    OBJECT
+    "${CMAKE_CURRENT_BINARY_DIR}/${stub_filename}"
+  )
+  target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options}
+                         -DLLVM_LIBC_PUBLIC_PACKAGING
+                         -nostdlib -Xclang -fembed-offload-object=${packaged_output_name})
+  target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
+  add_dependencies(${fq_target_name} ${full_deps_list} ${packaged_target_name})
+
+  set_target_properties(
+    ${fq_target_name}
+    PROPERTIES
+      ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
+      TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
+      OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
+      CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+      DEPS "${fq_deps_list}"
+      FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+  )
+
+  # We only build the internal target for a single supported architecture.
+  set(internal_target_name ${fq_target_name}.__internal__)
+  set(include_dirs ${LIBC_BUILD_DIR}/include ${LIBC_SOURCE_DIR} ${LIBC_BUILD_DIR})
+  if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU OR
+     LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+    add_library(
+      ${internal_target_name}
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${internal_target_name} BEFORE PRIVATE
+                           ${common_compile_options} --target=${LIBC_GPU_TARGET_TRIPLE})
+    if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+      target_compile_options(${internal_target_name} PRIVATE -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE})
+    elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+      target_compile_options(${internal_target_name} PRIVATE -march=${LIBC_GPU_TARGET_ARCHITECTURE})
+    endif()
+    target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${internal_target_name} ${full_deps_list})
+    set_target_properties(
+      ${internal_target_name}
+      PROPERTIES
+        CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+        FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+    )
+    set_target_properties(
+      ${fq_target_name}
+      PROPERTIES OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
+    )
+  endif()
+endfunction()
+
 # Rule which is essentially a wrapper over add_library to compile a set of
 # sources to object files.
 # Usage:
@@ -127,7 +275,6 @@
   if(NOT ADD_OBJECT_CXX_STANDARD)
     set(ADD_OBJECT_CXX_STANDARD ${CMAKE_CXX_STANDARD})
   endif()
-
   set_target_properties(
     ${fq_target_name}
     PROPERTIES
@@ -350,53 +497,67 @@
     endif()
   endif()
 
-  add_library(
-    ${internal_target_name}
-    # TODO: We don't need an object library for internal consumption.
-    # A future change should switch this to a normal static library.
-    EXCLUDE_FROM_ALL
-    OBJECT
-    ${ADD_ENTRYPOINT_OBJ_SRCS}
-    ${ADD_ENTRYPOINT_OBJ_HDRS}
-  )
-  target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
-  target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
-  add_dependencies(${internal_target_name} ${full_deps_list})
-  set_target_properties(
-    ${internal_target_name}
-    PROPERTIES
+  # GPU builds require special handling for the objects because we want to
+  # export several different targets at once, e.g. for both Nvidia and AMD.
+  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+    _build_gpu_entrypoint_objects(
+      ${fq_target_name}
+      SRCS ${ADD_ENTRYPOINT_OBJ_SRCS}
+      HDRS ${ADD_ENTRYPOINT_OBJ_HDRS}
+      COMPILE_OPTIONS ${common_compile_options}
+      DEPENDS ${full_deps_list}
       CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
       FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
-  )
+    )
+  else()
+    add_library(
+      ${internal_target_name}
+      # TODO: We don't need an object library for internal consumption.
+      # A future change should switch this to a normal static library.
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
+    target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${internal_target_name} ${full_deps_list})
+    set_target_properties(
+      ${internal_target_name}
+      PROPERTIES
+        CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+        FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+    )
 
-  add_library(
-    ${fq_target_name}
-    # We want an object library as the objects will eventually get packaged into
-    # an archive (like libc.a).
-    EXCLUDE_FROM_ALL
-    OBJECT
-    ${ADD_ENTRYPOINT_OBJ_SRCS}
-    ${ADD_ENTRYPOINT_OBJ_HDRS}
-  )
-  target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLLVM_LIBC_PUBLIC_PACKAGING)
-  target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
-  add_dependencies(${fq_target_name} ${full_deps_list})
+    add_library(
+      ${fq_target_name}
+      # We want an object library as the objects will eventually get packaged into
+      # an archive (like libc.a).
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLLVM_LIBC_PUBLIC_PACKAGING)
+    target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${fq_target_name} ${full_deps_list})
 
-  set_target_properties(
-    ${fq_target_name}
-    PROPERTIES
-      ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
-      TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
-      OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
-      # TODO: We don't need to list internal object files if the internal
-      # target is a normal static library.
-      OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
-      CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
-      DEPS "${fq_deps_list}"
-      FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
-  )
+    set_target_properties(
+      ${fq_target_name}
+      PROPERTIES
+        ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
+        TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
+        OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
+        # TODO: We don't need to list internal object files if the internal
+        # target is a normal static library.
+        OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
+        CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+        DEPS "${fq_deps_list}"
+        FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+    )
+  endif()
 
-  if(LLVM_LIBC_ENABLE_LINTING)
+  if(LLVM_LIBC_ENABLE_LINTING AND TARGET ${internal_target_name})
     if(NOT LLVM_LIBC_CLANG_TIDY)
       message(FATAL_ERROR "Something is wrong! LLVM_LIBC_ENABLE_LINTING is "
               "ON but LLVM_LIBC_CLANG_TIDY is not set.")
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -4,12 +4,14 @@
 endif()
 
 # Set up the target architectures to build the GPU libc for.
-set(all_gpu_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62;"
-                          "sm_70;sm_72;sm_75;sm_80;sm_86;gfx700;gfx701;gfx801;"
-                          "gfx803;gfx900;gfx902;gfx906;gfx908;gfx90a;gfx90c;"
-                          "gfx940;gfx1010;gfx1030;gfx1031;gfx1032;gfx1033;"
-                          "gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;"
-                          "gfx1103")
+set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
+                             "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
+                             "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
+                             "gfx1100;gfx1101;gfx1102;gfx1103")
+set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
+                            "sm_70;sm_72;sm_75;sm_80;sm_86")
+set(all_gpu_architectures
+    "${all_amdgpu_architectures};${all_nvptx_architectures}")
 set(LIBC_GPU_ARCHITECTURES ${all_gpu_architectures} CACHE STRING
     "List of GPU architectures to build the libc for.")
 if(LIBC_GPU_ARCHITECTURES STREQUAL "all")
@@ -29,6 +31,15 @@
           "GPU.")
 endif()
 
+# Identify the program used to package multiple images into a single binary.
+find_program(LIBC_CLANG_OFFLOAD_PACKAGER
+             NAMES clang-offload-packager
+             PATHS ${LLVM_BINARY_DIR}/bin)
+if(NOT LIBC_CLANG_OFFLOAD_PACKAGER)
+  message(FATAL_ERROR "Cannot find the 'clang-offload-packager' for the GPU "
+                      "build")
+endif()
+
 # Identify any locally installed AMD GPUs on the system to use for testing.
 find_program(LIBC_AMDGPU_ARCH
              NAMES amdgpu-arch
diff --git a/libc/src/__support/common.h b/libc/src/__support/common.h
--- a/libc/src/__support/common.h
+++ b/libc/src/__support/common.h
@@ -29,17 +29,15 @@
 #define LIBC_INLINE inline
 #endif
 
-// We use OpenMP to declare these functions on the device.
-#define STR(X) #X
-#define LLVM_LIBC_DECLARE_DEVICE(name) \
-  _Pragma(STR(omp declare target to(name) device_type(nohost)))
+#if defined(__AMDGPU__) || defined(__NVPTX__)
+#define PACKAGE_FOR_GPU
+#endif
 
-// GPU targets do not support aliasing and must be declared on the device.
-#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && defined(_OPENMP)
+// GPU targets do not support aliasing.
+#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && defined(PACKAGE_FOR_GPU)
 #define LLVM_LIBC_FUNCTION(type, name, arglist) \
   LLVM_LIBC_FUNCTION_ATTR decltype(__llvm_libc::name) \
       __##name##_impl__ __asm__(#name); \
-  LLVM_LIBC_DECLARE_DEVICE(__##name##_impl__) \
   type __##name##_impl__ arglist
 // MacOS needs to be excluded because it does not support aliasing.
 #elif defined(LLVM_LIBC_PUBLIC_PACKAGING) && (!defined(__APPLE__))