diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -497,12 +497,12 @@
 # The GPU build requires overriding the default CMake triple and architecture.
 if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
   target_compile_options(${fq_build_target_name} PRIVATE
-    -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
-    --target=${LIBC_GPU_TARGET_TRIPLE})
+    -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}
+    -flto --target=${LIBC_GPU_TARGET_TRIPLE})
 elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
   get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE})
   target_compile_options(${fq_build_target_name} PRIVATE
-    ${nvptx_options}
+    ${nvptx_options} -fno-use-cxa-atexit
     --target=${LIBC_GPU_TARGET_TRIPLE})
 endif()
diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -6,6 +6,8 @@
   DEPENDS
     libc.src.__support.RPC.rpc_client
     libc.src.__support.GPU.utils
+    libc.src.stdlib.exit
+    libc.src.stdlib.atexit
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -8,6 +8,8 @@
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
+#include "src/stdlib/atexit.h"
+#include "src/stdlib/exit.h"

 extern "C" int main(int argc, char **argv, char **envp);

@@ -15,21 +17,79 @@

 static cpp::Atomic<uint32_t> lock = 0;

-static cpp::Atomic<uint32_t> init = 0;
+static cpp::Atomic<uint32_t> count = 0;

-void init_rpc(void *in, void *out, void *buffer) {
-  // Only a single thread should update the RPC data.
+extern "C" {
+// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
+// to manually create them and update the globals in the loader implementation.
+uintptr_t *__init_array_start [[gnu::visibility("protected")]];
+uintptr_t *__init_array_end [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_start [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
+}
+
+using InitCallback = void(int, char **, char **);
+using FiniCallback = void(void);
+
+static uint64_t get_grid_size() {
+  return gpu::get_num_threads() * gpu::get_num_blocks();
+}
+
+static void call_init_array_callbacks(int argc, char **argv, char **env) {
+  size_t init_array_size = __init_array_end - __init_array_start;
+  for (size_t i = 0; i < init_array_size; ++i)
+    reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
+}
+
+static void call_fini_array_callbacks() {
+  size_t fini_array_size = __fini_array_end - __fini_array_start;
+  for (size_t i = 0; i < fini_array_size; ++i)
+    reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
+}
+
+// TODO: Put this in a separate kernel and call it with one thread.
+void initialize(int argc, char **argv, char **env, void *in, void *out,
+                void *buffer) {
+  // We need a single GPU thread to run the global constructors and initialize
+  // the global data. We simply mask off all but a single thread and execute.
+  count.fetch_add(1, cpp::MemoryOrder::RELAXED);
   if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // We need to set up the RPC client first in case any of the constructors
+    // require it.
     rpc::client.reset(&lock, in, out, buffer);
-    init.store(1, cpp::MemoryOrder::RELAXED);
+
+    // We want the fini array callbacks to run after the other atexit
+    // callbacks, so we register them before running the init array
+    // callbacks, as those can potentially register their own atexit
+    // callbacks.
+    // FIXME: The function pointer escaping this TU causes warnings.
+    __llvm_libc::atexit(&call_fini_array_callbacks);
+    call_init_array_callbacks(argc, argv, env);
   }
-  // Wait until the previous thread signals that the data has been written.
-  while (!init.load(cpp::MemoryOrder::RELAXED))
+  // We wait until every single thread launched on the GPU has seen the
+  // initialization code. This will get very, very slow for high thread
+  // counts, but for testing purposes it is unlikely to matter.
+  while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
     rpc::sleep_briefly();
+  gpu::sync_threads();
+}

-  // Wait for the threads in the block to converge and fence the write.
+// TODO: Put this in a separate kernel and call it with one thread.
+void finalize(int retval) {
+  // We wait until every single thread launched on the GPU has finished
+  // executing and reached the finalize region.
+  count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
+  while (count.load(cpp::MemoryOrder::RELAXED) != 0)
+    rpc::sleep_briefly();
   gpu::sync_threads();
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // Only a single thread should call 'exit' here; the rest should
+    // gracefully return from the kernel. This is so only one thread calls
+    // the destructors registered with 'atexit' above.
+    __llvm_libc::exit(retval);
+  }
 }

 } // namespace __llvm_libc
@@ -37,7 +97,9 @@
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::init_rpc(in, out, buffer);
+  __llvm_libc::initialize(argc, argv, envp, in, out, buffer);

   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+
+  __llvm_libc::finalize(*ret);
 }
diff --git a/libc/test/IntegrationTest/test.cpp b/libc/test/IntegrationTest/test.cpp
--- a/libc/test/IntegrationTest/test.cpp
+++ b/libc/test/IntegrationTest/test.cpp
@@ -22,6 +22,7 @@
 void *memcpy(void *__restrict, const void *__restrict, size_t);
 void *memmove(void *dst, const void *src, size_t count);
 void *memset(void *ptr, int value, size_t count);
+int atexit(void (*func)(void));

 } // namespace __llvm_libc

@@ -44,6 +45,9 @@
   return __llvm_libc::memset(ptr, value, count);
 }

+// This is needed if the test was compiled with '-fno-use-cxa-atexit'.
+int atexit(void (*func)(void)) { return __llvm_libc::atexit(func); }
+
 } // extern "C"

 // Integration tests cannot use the SCUDO standalone allocator as SCUDO pulls
diff --git a/libc/test/integration/startup/gpu/CMakeLists.txt b/libc/test/integration/startup/gpu/CMakeLists.txt
--- a/libc/test/integration/startup/gpu/CMakeLists.txt
+++ b/libc/test/integration/startup/gpu/CMakeLists.txt
@@ -26,12 +26,9 @@
     --threads 1
 )

-# Constructors are currently only supported on AMDGPU.
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
-  add_integration_test(
-    init_fini_array_test
-    SUITE libc-startup-tests
-    SRCS
-      init_fini_array_test.cpp
-  )
-endif()
+add_integration_test(
+  init_fini_array_test
+  SUITE libc-startup-tests
+  SRCS
+    init_fini_array_test.cpp
+)
diff --git a/libc/test/integration/startup/gpu/init_fini_array_test.cpp b/libc/test/integration/startup/gpu/init_fini_array_test.cpp
--- a/libc/test/integration/startup/gpu/init_fini_array_test.cpp
+++ b/libc/test/integration/startup/gpu/init_fini_array_test.cpp
@@ -53,7 +53,7 @@
   initval = 0;
 }

-TEST_MAIN() {
+TEST_MAIN(int argc, char **argv, char **env) {
   ASSERT_EQ(global.get(GLOBAL_INDEX), INITVAL_INITIALIZER);
   ASSERT_EQ(initval, INITVAL_INITIALIZER);
   return 0;
diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -12,7 +12,9 @@
 endif()

 find_package(CUDAToolkit QUIET)
-if(CUDAToolkit_FOUND)
+# The CUDA loader requires LLVM to traverse the ELF image for symbols.
+find_package(LLVM QUIET)
+if(CUDAToolkit_FOUND AND LLVM_FOUND)
   add_subdirectory(nvptx)
 else()
   message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
--- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -1,8 +1,14 @@
 add_executable(nvptx_loader Loader.cpp)
 add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
+if(NOT LLVM_ENABLE_RTTI)
+  target_compile_options(nvptx_loader PRIVATE -fno-rtti)
+endif()
+target_include_directories(nvptx_loader PRIVATE ${LLVM_INCLUDE_DIRS})
 target_link_libraries(nvptx_loader
   PRIVATE
   gpu_loader
   CUDA::cuda_driver
+  LLVMObject
+  LLVMSupport
 )
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -17,10 +17,18 @@
 #include "Server.h"

 #include "cuda.h"
+
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+
 #include <cstddef>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <vector>
+
+using namespace llvm;
+using namespace object;

 /// The arguments to the '_start' kernel.
 struct kernel_args_t {
@@ -51,11 +59,122 @@
   exit(EXIT_FAILURE);
 }

+// Gets the names of all the globals that contain functions to initialize or
+// deinitialize. We need to do this manually because the NVPTX toolchain does
+// not contain the necessary binary manipulation tools.
+template <typename Alloc>
+Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
+                                     Alloc allocator, CUmodule binary) {
+  auto mem_buffer = MemoryBuffer::getMemBuffer(
+      StringRef(reinterpret_cast<const char *>(image), size), "image",
+      /*RequiresNullTerminator=*/false);
+  Expected<ELF64LEObjectFile> elf_or_err =
+      ELF64LEObjectFile::create(*mem_buffer);
+  if (!elf_or_err)
+    handle_error(toString(elf_or_err.takeError()).c_str());
+
+  std::vector<std::pair<const char *, uint16_t>> ctors;
+  std::vector<std::pair<const char *, uint16_t>> dtors;
+  // CUDA has no way to iterate over all the symbols, so we need to inspect
+  // the ELF directly using the LLVM libraries.
+  for (const auto &symbol : elf_or_err->symbols()) {
+    auto name_or_err = symbol.getName();
+    if (!name_or_err)
+      handle_error(toString(name_or_err.takeError()).c_str());
+
+    // Search for all symbols that contain a constructor or destructor.
+    if (!name_or_err->starts_with("__init_array_object_") &&
+        !name_or_err->starts_with("__fini_array_object_"))
+      continue;
+
+    uint16_t priority;
+    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
+      handle_error("Invalid priority for constructor or destructor");
+
+    if (name_or_err->starts_with("__init"))
+      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
+    else
+      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
+  }
+  // Lower priority constructors are run before higher ones. The reverse is
+  // true for destructors, so we sort those in descending priority order.
+  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
+  llvm::sort(dtors, [](auto x, auto y) { return x.second > y.second; });
+
+  // Allocate host pinned memory to make these arrays visible to the GPU.
+  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
+      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
+  uint64_t global_size = 0;
+
+  // Look up each constructor global and store the address of the constructor
+  // function it contains in the constructor array.
+  CUdeviceptr *dev_ctors_start = dev_memory;
+  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
+  for (uint64_t i = 0; i < ctors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Look up each destructor global and store the address of the destructor
+  // function it contains in the destructor array.
+  CUdeviceptr *dev_dtors_start = dev_ctors_end;
+  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
+  for (uint64_t i = 0; i < dtors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Obtain the addresses of the pointers the startup implementation uses to
+  // iterate over the constructors and destructors.
+  CUdeviceptr init_start;
+  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
+                                       "__init_array_start"))
+    handle_error(err);
+  CUdeviceptr init_end;
+  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
+                                       "__init_array_end"))
+    handle_error(err);
+  CUdeviceptr fini_start;
+  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
+                                       "__fini_array_start"))
+    handle_error(err);
+  CUdeviceptr fini_end;
+  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
+                                       "__fini_array_end"))
+    handle_error(err);
+
+  // Copy the pointers to the newly written arrays into those symbols so the
+  // startup implementation can iterate over them.
+  if (CUresult err =
+          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err =
+          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
+    handle_error(err);
+
+  return dev_memory;
+}
+
 int load(int argc, char **argv, char **envp, void *image, size_t size,
          const LaunchParameters &params) {
+
   if (CUresult err = cuInit(0))
     handle_error(err);
-
   // Obtain the first device found on the system.
CUdevice device; if (CUresult err = cuDeviceGet(&device, 0)) @@ -91,6 +210,11 @@ handle_error(err); return dev_ptr; }; + + auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary); + if (!memory_or_err) + handle_error(toString(memory_or_err.takeError()).c_str()); + void *dev_argv = copy_argument_vector(argc, argv, allocator); if (!dev_argv) handle_error("Failed to allocate device argv"); @@ -153,6 +277,8 @@ handle_error(err); // Free the memory allocated for the device. + if (CUresult err = cuMemFreeHost(*memory_or_err)) + handle_error(err); if (CUresult err = cuMemFree(dev_ret)) handle_error(err); if (CUresult err = cuMemFreeHost(dev_argv))
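
Note on the startup synchronization added above: 'initialize' and 'finalize' rendezvous every launched thread on one atomic counter, so the one-time work (the RPC client reset, the init array callbacks, and the final 'exit') runs exactly once while every other thread spins. The following is a minimal host-side sketch of that handshake using standard C++ threads in place of the GPU runtime; the names and the grid size are illustrative, and the sketch checks in only after the one-time setup so that the release/acquire pair also publishes it (the patch instead relies on the grid-wide spin followed by gpu::sync_threads()).

  #include <atomic>
  #include <cstdint>
  #include <thread>
  #include <vector>

  static std::atomic<uint32_t> count{0};

  // Analogue of 'initialize': thread 0 does the one-time setup, then every
  // thread checks in and waits until the whole "grid" has arrived.
  static void initialize(uint32_t id, uint32_t grid_size) {
    if (id == 0) {
      // One-time setup (RPC client reset, init array callbacks) goes here.
    }
    count.fetch_add(1, std::memory_order_release);
    while (count.load(std::memory_order_acquire) != grid_size)
      std::this_thread::yield(); // Stands in for rpc::sleep_briefly().
  }

  // Analogue of 'finalize': threads check out, and a single thread performs
  // the teardown once the counter has drained back to zero.
  static void finalize(uint32_t id) {
    count.fetch_sub(1, std::memory_order_release);
    while (count.load(std::memory_order_acquire) != 0)
      std::this_thread::yield();
    if (id == 0) {
      // One-time teardown (the 'atexit' handlers run by exit) goes here.
    }
  }

  int main() {
    constexpr uint32_t grid_size = 8;
    std::vector<std::thread> threads;
    for (uint32_t id = 0; id < grid_size; ++id)
      threads.emplace_back([id, grid_size] {
        initialize(id, grid_size);
        finalize(id);
      });
    for (auto &t : threads)
      t.join();
  }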
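The loader side discovers the callbacks by name: every '__init_array_object_N' or '__fini_array_object_N' symbol encodes its priority as the trailing integer, constructors run in ascending priority order, and destructors in descending order. Below is a standalone sketch of that naming-and-ordering scheme using only the standard library in place of LLVM's StringRef and ELF readers; the sample symbol names are invented for the example.

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>
  #include <string>
  #include <utility>
  #include <vector>

  int main() {
    // Hypothetical symbols, as the ELF scan might report them.
    std::vector<std::string> symbols = {
        "__init_array_object_65535", "__init_array_object_101",
        "__fini_array_object_101", "__fini_array_object_65535"};

    std::vector<std::pair<std::string, uint16_t>> ctors, dtors;
    for (const std::string &name : symbols) {
      // Equivalent of StringRef::rsplit('_') plus getAsInteger(10, ...).
      uint16_t priority =
          static_cast<uint16_t>(std::stoul(name.substr(name.rfind('_') + 1)));
      if (name.rfind("__init", 0) == 0)
        ctors.emplace_back(name, priority);
      else
        dtors.emplace_back(name, priority);
    }

    // Constructors run lowest priority first; destructors in reverse.
    std::sort(ctors.begin(), ctors.end(),
              [](const auto &x, const auto &y) { return x.second < y.second; });
    std::sort(dtors.begin(), dtors.end(),
              [](const auto &x, const auto &y) { return x.second > y.second; });

    for (const auto &[name, priority] : ctors)
      std::printf("ctor %s (priority %u)\n", name.c_str(),
                  static_cast<unsigned>(priority));
    for (const auto &[name, priority] : dtors)
      std::printf("dtor %s (priority %u)\n", name.c_str(),
                  static_cast<unsigned>(priority));
  }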
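Finally, the re-enabled 'init_fini_array_test' exercises exactly this machinery: a global object's constructor must run from the init array before 'main', and its destructor afterwards via the 'atexit' registration in 'finalize'. A minimal sketch of that shape follows; this is not the test's actual code (the real test uses 'global', 'GLOBAL_INDEX', and 'INITVAL_INITIALIZER').

  static int initval = 0;

  // The constructor runs from the init array callbacks before main; the
  // destructor runs afterwards, from the fini array registered with atexit.
  struct Global {
    Global() { initval = 1; }
    ~Global() { initval = 0; }
  };
  static Global global;

  int main() { return initval == 1 ? 0 : 1; }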