diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -5,6 +5,8 @@
   DEPENDS
     libc.src.__support.RPC.rpc_client
     libc.src.__support.GPU.utils
+    libc.src.stdlib.exit
+    libc.src.stdlib.atexit
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -8,6 +8,8 @@
 
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
+#include "src/stdlib/atexit.h"
+#include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
 
@@ -15,21 +17,87 @@
 
 static cpp::Atomic<uint32_t> lock = 0;
 
-static cpp::Atomic<uint32_t> init = 0;
+// Barrier counter: incremented once per thread in `initialize` and decremented
+// once per thread in `finalize`.
+static cpp::Atomic<uint32_t> count = 0;
 
-void init_rpc(void *in, void *out, void *buffer) {
-  // Only a single thread should update the RPC data.
+// Bounds of the init / fini callback tables. They are only declared here; the
+// definitions are presumably supplied by the linker.
+extern "C" uintptr_t __init_array_start[];
+extern "C" uintptr_t __init_array_end[];
+extern "C" uintptr_t __fini_array_start[];
+extern "C" uintptr_t __fini_array_end[];
+
+using InitCallback = void(int, char **, char **);
+using FiniCallback = void(void);
+
+// Number of threads the barrier in `initialize` waits for.
+static uint64_t get_grid_size() {
+  return gpu::get_num_threads() * gpu::get_num_blocks();
+}
+
+// Invokes every entry of the init array, in order, with the program arguments.
+static void call_init_array_callbacks(int argc, char **argv, char **env) {
+  size_t init_array_size = __init_array_end - __init_array_start;
+  for (size_t i = 0; i < init_array_size; ++i)
+    reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
+}
+
+// Invokes every entry of the fini array, in order.
+static void call_fini_array_callbacks() {
+  size_t fini_array_size = __fini_array_end - __fini_array_start;
+  for (size_t i = 0; i < fini_array_size; ++i)
+    reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
+}
+
+// Runs the one-time startup work on a single thread and holds every thread of
+// the launch until it has completed.
+void initialize(int argc, char **argv, char **env, void *in, void *out,
+                void *buffer) {
+  // We need a single GPU thread to perform the initialization of the global
+  // constructors and data. We simply mask off all but a single thread and
+  // execute.
   if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // We need to set up the RPC client first in case any of the constructors
+    // require it.
     rpc::client.reset(&lock, in, out, buffer);
-    init.store(1, cpp::MemoryOrder::RELAXED);
+
+    // We want the fini array callbacks to be run after other atexit
+    // callbacks are run. So, we register them before running the init
+    // array callbacks as they can potentially register their own atexit
+    // callbacks.
+    atexit(&call_fini_array_callbacks);
+    call_init_array_callbacks(argc, argv, env);
   }
 
-  // Wait until the previous thread signals that the data has been written.
-  while (!init.load(cpp::MemoryOrder::RELAXED))
+  // Threads check in only after the branch above, so the counter cannot reach
+  // the grid size before the initializing thread is done. The release here
+  // pairs with the acquire load below to publish the initialized state.
+  count.fetch_add(1, cpp::MemoryOrder::RELEASE);
+
+  // We wait until every single thread launched on the GPU has seen the
+  // initialization code. This will get very, very slow for high thread counts,
+  // but for testing purposes it is unlikely to matter.
+  while (count.load(cpp::MemoryOrder::ACQUIRE) != get_grid_size())
     rpc::sleep_briefly();
+  gpu::sync_threads();
+}
 
-  // Wait for the threads in the block to converge and fence the write.
+// Waits for every thread to finish `main`, then has a single thread call
+// `exit` with `retval`.
+void finalize(int retval) {
+  // We wait until every single thread launched on the GPU has finished
+  // executing and reached the finalize region.
+  count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
+  while (count.load(cpp::MemoryOrder::RELAXED) != 0)
+    rpc::sleep_briefly();
   gpu::sync_threads();
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // Only a single thread should call `exit` here, the rest should gracefully
+    // return from the kernel. This is so only one thread calls the destructors
+    // registered with 'atexit' above.
+    __llvm_libc::exit(retval);
+  }
 }
 
 } // namespace __llvm_libc
@@ -37,7 +105,11 @@
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::init_rpc(in, out, buffer);
+  __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+
+  // NOTE(review): this value is sampled before the barrier in finalize, so it
+  // may not yet include the results OR-ed in by threads still running main.
+  __llvm_libc::finalize(__atomic_load_n(ret, __ATOMIC_RELAXED));
 }
diff --git a/libc/test/integration/startup/gpu/CMakeLists.txt b/libc/test/integration/startup/gpu/CMakeLists.txt
--- a/libc/test/integration/startup/gpu/CMakeLists.txt
+++ b/libc/test/integration/startup/gpu/CMakeLists.txt
@@ -25,3 +25,13 @@
     --blocks 16
     --threads 1
 )
+
+# Constructors are currently only supported on AMDGPU.
+if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+  add_integration_test(
+    init_fini_array_test
+    SUITE libc-startup-tests
+    SRCS
+      init_fini_array_test.cpp
+  )
+endif()
diff --git a/libc/test/integration/startup/gpu/init_fini_array_test.cpp b/libc/test/integration/startup/gpu/init_fini_array_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/integration/startup/gpu/init_fini_array_test.cpp
@@ -0,0 +1,64 @@
+//===-- Loader test to test init and fini array iteration -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/IntegrationTest/test.h"
+
+#include <stddef.h>
+
+// Set by A's destructor; checked by the priority-1 destructor below.
+bool global_destroyed = false;
+
+// Object with a non-trivial constructor and destructor, used as a global.
+class A {
+private:
+  int val[1024];
+
+public:
+  A(int i, int a) {
+    for (int k = 0; k < 1024; ++k)
+      val[k] = 0;
+    val[i] = a;
+  }
+
+  ~A() { global_destroyed = true; }
+
+  int get(int i) const { return val[i]; }
+};
+
+int GLOBAL_INDEX = 512;
+int INITVAL_INITIALIZER = 0x600D;
+int BEFORE_INITIALIZER = 0xFEED;
+
+A global(GLOBAL_INDEX, INITVAL_INITIALIZER);
+
+int initval = 0;
+int before = 0;
+
+__attribute__((constructor(101))) void run_before() {
+  before = BEFORE_INITIALIZER;
+}
+
+// Checks that run_before has already executed by the time this runs.
+__attribute__((constructor(65535))) void run_after() {
+  ASSERT_EQ(before, BEFORE_INITIALIZER);
+}
+
+__attribute__((constructor)) void set_initval() {
+  initval = INITVAL_INITIALIZER;
+}
+// Checks that the destructor of `global` has already run, then clears initval.
+__attribute__((destructor(1))) void reset_initval() {
+  ASSERT_TRUE(global_destroyed);
+  initval = 0;
+}
+
+TEST_MAIN() {
+  ASSERT_EQ(global.get(GLOBAL_INDEX), INITVAL_INITIALIZER);
+  ASSERT_EQ(initval, INITVAL_INITIALIZER);
+  return 0;
+}