diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -204,6 +204,7 @@
 add_subdirectory(FPUtil)
 add_subdirectory(OSUtil)
 add_subdirectory(StringUtil)
+add_subdirectory(RPC)
 
 # Thread support is used by other "File". So, we add the "threads"
 # before "File".
diff --git a/libc/src/__support/OSUtil/CMakeLists.txt b/libc/src/__support/OSUtil/CMakeLists.txt
--- a/libc/src/__support/OSUtil/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/CMakeLists.txt
@@ -8,12 +8,23 @@
   return()
 endif()
 
-add_header_library(
-  osutil
-  HDRS
-    io.h
-    quick_exit.h
-    syscall.h
-  DEPENDS
-    ${target_os_util}
-)
+# OSUtil is header-only on non-GPU targets but an object library in GPU mode.
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_header_library(
+    osutil
+    HDRS
+      io.h
+      quick_exit.h
+      syscall.h
+    DEPENDS
+      ${target_os_util}
+  )
+else()
+  add_object_library(
+    osutil
+    ALIAS
+      ${target_os_util}
+    DEPENDS
+      ${target_os_util}
+  )
+endif()
diff --git a/libc/src/__support/OSUtil/gpu/CMakeLists.txt b/libc/src/__support/OSUtil/gpu/CMakeLists.txt
--- a/libc/src/__support/OSUtil/gpu/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/gpu/CMakeLists.txt
@@ -2,8 +2,11 @@
   gpu_util
   SRCS
     quick_exit.cpp
+    io.cpp
   HDRS
     quick_exit.h
+    io.h
   DEPENDS
     libc.src.__support.common
+    libc.src.__support.RPC.rpc_client
 )
diff --git a/libc/src/__support/OSUtil/io.h b/libc/src/__support/OSUtil/gpu/io.h
copy from libc/src/__support/OSUtil/io.h
copy to libc/src/__support/OSUtil/gpu/io.h
--- a/libc/src/__support/OSUtil/io.h
+++ b/libc/src/__support/OSUtil/gpu/io.h
@@ -1,4 +1,4 @@
-//===---------------- Implementation of IO utils ----------------*- C++ -*-===//
+//===-------------- GPU implementation of IO utils --------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,11 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
-#define LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
+#ifndef LLVM_LIBC_SRC_SUPPORT_OSUTIL_GPU_IO_H
+#define LLVM_LIBC_SRC_SUPPORT_OSUTIL_GPU_IO_H
 
-#ifdef __unix__
-#include "linux/io.h"
-#endif
+namespace __llvm_libc {
 
-#endif // LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
+/// Forwards the null-terminated string \p msg to the RPC server, which writes
+/// it to stderr.
+void write_to_stderr(const char *msg);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_OSUTIL_GPU_IO_H
diff --git a/libc/src/__support/OSUtil/gpu/io.cpp b/libc/src/__support/OSUtil/gpu/io.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/OSUtil/gpu/io.cpp
@@ -0,0 +1,38 @@
+//===-------------- GPU implementation of IO utils --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "io.h"
+
+#include "src/__support/RPC/rpc_client.h"
+#include "src/string/string_utils.h"
+
+namespace __llvm_libc {
+
+/// Sends \p msg to the RPC server in chunks small enough to fit in the shared
+/// buffer. Each chunk is null-terminated because the server prints it as a C
+/// string.
+void write_to_stderr(const char *msg) {
+  const uint64_t length = internal::string_length(msg);
+  // One word of the buffer holds the opcode and one byte is kept for the
+  // terminator; the rest carries characters.
+  const uint64_t chunk_len = sizeof(rpc::Buffer) - sizeof(uint64_t) - 1;
+  for (uint64_t i = 0; i < length; i += chunk_len) {
+    const uint64_t remaining = length - i;
+    const uint64_t count = remaining < chunk_len ? remaining : chunk_len;
+    rpc::client.run(
+        [&](rpc::Buffer *buffer) {
+          buffer->data[0] = rpc::Opcode::PRINT_TO_STDERR;
+          char *payload = reinterpret_cast<char *>(&buffer->data[1]);
+          inline_memcpy(payload, &msg[i], count);
+          payload[count] = '\0';
+        },
+        [](rpc::Buffer *) {});
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/__support/OSUtil/gpu/quick_exit.cpp b/libc/src/__support/OSUtil/gpu/quick_exit.cpp
--- a/libc/src/__support/OSUtil/gpu/quick_exit.cpp
+++ b/libc/src/__support/OSUtil/gpu/quick_exit.cpp
@@ -11,11 +11,21 @@
 
 #include "quick_exit.h"
 
+#include "src/__support/RPC/rpc_client.h"
 #include "src/__support/macros/properties/architectures.h"
 
 namespace __llvm_libc {
 
 void quick_exit(int status) {
+  // TODO: Support asynchronous calls so we don't wait and exit from the GPU
+  // immediately.
+  rpc::client.run(
+      [&](rpc::Buffer *buffer) {
+        buffer->data[0] = rpc::Opcode::EXIT;
+        buffer->data[1] = status;
+      },
+      [](rpc::Buffer *) {});
+
 #if defined(LIBC_TARGET_ARCH_IS_NVPTX)
   asm("exit" ::: "memory");
 #elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
diff --git a/libc/src/__support/OSUtil/io.h b/libc/src/__support/OSUtil/io.h
--- a/libc/src/__support/OSUtil/io.h
+++ b/libc/src/__support/OSUtil/io.h
@@ -9,7 +9,11 @@
 #ifndef LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
 #define LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
 
-#ifdef __unix__
+#include "src/__support/macros/properties/architectures.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_GPU)
+#include "gpu/io.h"
+#elif defined(__unix__)
 #include "linux/io.h"
 #endif
 
diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_header_library(
+  rpc
+  HDRS
+    rpc.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.CPP.atomic
+)
+
+add_object_library(
+  rpc_client
+  SRCS
+    rpc_client.cpp
+  HDRS
+    rpc_client.h
+  DEPENDS
+    .rpc
+)
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/RPC/rpc.h
@@ -0,0 +1,140 @@
+//===-- Shared memory RPC client / server interface -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0
with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_H
+#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_H
+
+#include "src/__support/CPP/atomic.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc {
+namespace rpc {
+
+/// A list of opcodes that we use to invoke certain actions on the server. We
+/// reserve values up to and including LIBC_LAST for internal libc usage.
+enum Opcode : uint64_t {
+  NOOP = 0,
+  PRINT_TO_STDERR = 1,
+  EXIT = 2,
+  LIBC_LAST = (1UL << 8) - 1,
+};
+
+/// A fixed size channel used to communicate between the RPC client and server.
+struct Buffer {
+  uint64_t data[8];
+};
+
+/// A common process used to synchronize communication between a client and a
+/// server. The process contains an inbox and an outbox used for signaling
+/// ownership of the shared buffer.
+struct Process {
+  cpp::Atomic<uint32_t> *inbox;
+  cpp::Atomic<uint32_t> *outbox;
+  Buffer *buffer;
+
+  /// Initialize the communication channels.
+  void reset(void *inbox, void *outbox, void *buffer) {
+    *this = {
+        reinterpret_cast<cpp::Atomic<uint32_t> *>(inbox),
+        reinterpret_cast<cpp::Atomic<uint32_t> *>(outbox),
+        reinterpret_cast<Buffer *>(buffer),
+    };
+  }
+};
+
+/// The RPC client used to make requests to the server.
+struct Client : public Process {
+  template <typename F, typename U> void run(F fill, U use);
+};
+
+/// The RPC server used to respond to the client.
+struct Server : public Process {
+  template <typename W, typename C> bool run(W work, C clean);
+};
+
+/// Run the RPC client protocol to communicate with the server. We perform the
+/// following high level actions to complete a communication:
+///   - Apply \p fill to the shared buffer and write 1 to the outbox.
+///   - Wait until the inbox is 1.
+///   - Apply \p use to the shared buffer and write 0 to the outbox.
+///   - Wait until the inbox is 0.
+template <typename F, typename U> void Client::run(F fill, U use) {
+  bool in = inbox->load(cpp::MemoryOrder::RELAXED);
+  bool out = outbox->load(cpp::MemoryOrder::RELAXED);
+  atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  // Write to buffer then to the outbox.
+  if (!in & !out) {
+    fill(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(1, cpp::MemoryOrder::RELEASE);
+    out = 1;
+  }
+  // Wait for the result from the server.
+  if (!in & out) {
+    while (!in)
+      in = inbox->load(cpp::MemoryOrder::RELAXED);
+    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  }
+  // Read from the buffer and then write to outbox.
+  if (in & out) {
+    use(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(0, cpp::MemoryOrder::RELEASE);
+    out = 0;
+  }
+  // Wait for server to complete the communication.
+  if (in & !out) {
+    while (in)
+      in = inbox->load(cpp::MemoryOrder::RELAXED);
+    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  }
+}
+
+/// Run the RPC server protocol to communicate with the client. This is
+/// non-blocking and only checks the server a single time. We perform the
+/// following high level actions to complete a communication:
+///   - Query if the inbox is 1 and exit if there is no work to do.
+///   - Apply \p work to the shared buffer and write 1 to the outbox.
+///   - Wait until the inbox is 0.
+///   - Apply \p clean to the shared buffer and write 0 to the outbox.
+template <typename W, typename C> bool Server::run(W work, C clean) {
+  bool in = inbox->load(cpp::MemoryOrder::RELAXED);
+  bool out = outbox->load(cpp::MemoryOrder::RELAXED);
+  atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  // No work to do, exit.
+  if (!in & !out)
+    return false;
+  // Do work then write to the outbox.
+  if (in & !out) {
+    work(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(1, cpp::MemoryOrder::RELEASE);
+    out = 1;
+  }
+  // Wait for the client to read the result.
+  if (in & out) {
+    while (in)
+      in = inbox->load(cpp::MemoryOrder::RELAXED);
+    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  }
+  // Clean up the buffer and signal the client.
+  if (!in & out) {
+    clean(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(0, cpp::MemoryOrder::RELEASE);
+    out = 0;
+  }
+
+  return true;
+}
+
+} // namespace rpc
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_RPC_RPC_H
diff --git a/libc/src/__support/RPC/rpc_client.h b/libc/src/__support/RPC/rpc_client.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/RPC/rpc_client.h
@@ -0,0 +1,23 @@
+//===-- Shared memory RPC client instantiation ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_CLIENT_H
+#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_CLIENT_H
+
+#include "rpc.h"
+
+namespace __llvm_libc {
+namespace rpc {
+
+/// The libc client instance used to communicate with the server.
+extern Client client;
+
+} // namespace rpc
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_RPC_RPC_CLIENT_H
diff --git a/libc/src/__support/RPC/rpc_client.cpp b/libc/src/__support/RPC/rpc_client.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/RPC/rpc_client.cpp
@@ -0,0 +1,23 @@
+//===-- Shared memory RPC client instantiation ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "rpc_client.h"
+
+namespace __llvm_libc {
+namespace rpc {
+
+/// The libc client instance used to communicate with the server. This defines
+/// the object declared in rpc_client.h.
+Client client;
+
+/// Externally visible symbol to signify the usage of an RPC client to
+/// whomever needs to run the server.
+extern "C" [[gnu::visibility("protected")]] const bool __llvm_libc_rpc = false;
+
+} // namespace rpc
+} // namespace __llvm_libc
diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -2,11 +2,12 @@
   crt1
   SRC
     start.cpp
+  DEPENDS
+    libc.src.__support.RPC.rpc_client
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
     -nogpulib # Do not include any GPU vendor libraries.
-    -nostdinc
     -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}
     -emit-llvm # AMDGPU's intermediate object file format is bitcode.
     --target=${LIBC_GPU_TARGET_TRIPLE}
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -6,9 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/RPC/rpc_client.h"
+
 extern "C" int main(int argc, char **argv);
 
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
-_start(int argc, char **argv, int *ret) {
+_start(int argc, char **argv, int *ret, void *in, void *out, void *buffer) {
+  __llvm_libc::rpc::client.reset(in, out, buffer);
+
   __atomic_fetch_or(ret, main(argc, argv), __ATOMIC_RELAXED);
 }
diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
--- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
+++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_executable(amdhsa_loader Loader.cpp)
+add_dependencies(amdhsa_loader libc.src.__support.RPC.rpc)
+
+target_include_directories(amdhsa_loader PRIVATE ${LIBC_SOURCE_DIR})
 target_link_libraries(amdhsa_loader
   PRIVATE
   hsa-runtime64::hsa-runtime64
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -15,6 +15,8 @@
 
 #include "Loader.h"
 
+#include "src/__support/RPC/rpc.h"
+
 #include <hsa/hsa.h>
 #include <hsa/hsa_ext_amd.h>
 
@@ -31,8 +33,35 @@
   int argc;
   void *argv;
   void *ret;
+  void *inbox;
+  void *outbox;
+  void *buffer;
 };
 
+static __llvm_libc::rpc::Server server;
+
+/// Queries the RPC client at least once and performs server-side work if there
+/// are any active requests.
+static void handle_server() {
+  while (server.run(
+      [&](__llvm_libc::rpc::Buffer *buffer) {
+        switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) {
+        case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: {
+          fputs(reinterpret_cast<const char *>(&buffer->data[1]), stderr);
+          break;
+        }
+        case __llvm_libc::rpc::Opcode::EXIT: {
+          exit(buffer->data[1]);
+          break;
+        }
+        default:
+          return;
+        };
+      },
+      [](__llvm_libc::rpc::Buffer *buffer) {}))
+    ;
+}
+
 /// Print the error code and exit if \p code indicates an error.
 static void handle_error(hsa_status_t code) {
   if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
@@ -278,6 +307,26 @@
     handle_error(err);
   hsa_amd_memory_fill(dev_ret, 0, sizeof(int));
 
+  // Allocate finegrained memory for the RPC server and client to share.
+  void *server_inbox;
+  void *server_outbox;
+  void *buffer;
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+          finegrained_pool, sizeof(__llvm_libc::cpp::Atomic<uint32_t>),
+          /*flags=*/0, &server_inbox))
+    handle_error(err);
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+          finegrained_pool, sizeof(__llvm_libc::cpp::Atomic<uint32_t>),
+          /*flags=*/0, &server_outbox))
+    handle_error(err);
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+          finegrained_pool, sizeof(__llvm_libc::rpc::Buffer),
+          /*flags=*/0, &buffer))
+    handle_error(err);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_inbox);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer);
+
   // Initialie all the arguments (explicit and implicit) to zero, then set the
   // explicit arguments to the values created above.
   std::memset(args, 0, args_size);
@@ -285,6 +334,9 @@
   kernel_args->argc = argc;
   kernel_args->argv = dev_argv;
   kernel_args->ret = dev_ret;
+  kernel_args->inbox = server_outbox;
+  kernel_args->outbox = server_inbox;
+  kernel_args->buffer = buffer;
 
   // Obtain a packet from the queue.
   uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
@@ -316,6 +368,9 @@
           hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
     handle_error(err);
 
+  // Initialize the RPC server's buffer for host-device communication.
+  server.reset(server_inbox, server_outbox, buffer);
+
   // Initialize the packet header and set the doorbell signal to begin execution
   // by the HSA runtime.
   uint16_t header =
@@ -326,11 +381,12 @@
                    __ATOMIC_RELEASE);
   hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
 
-  // Wait until the kernel has completed execution on the device.
-  while (hsa_signal_wait_scacquire(packet->completion_signal,
-                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
-                                   HSA_WAIT_STATE_ACTIVE) != 0)
-    ;
+  // Wait until the kernel has completed execution on the device. Periodically
+  // check the RPC client for work to be performed on the server.
+  while (hsa_signal_wait_scacquire(
+             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
+             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
+    handle_server();
 
   // Create a memory signal and copy the return value back from the device into
   // a new buffer.