diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt --- a/libc/utils/gpu/CMakeLists.txt +++ b/libc/utils/gpu/CMakeLists.txt @@ -1 +1,2 @@ +add_subdirectory(server) add_subdirectory(loader) diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt --- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt +++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt @@ -1,9 +1,9 @@ add_executable(amdhsa_loader Loader.cpp) -add_dependencies(amdhsa_loader libc.src.__support.RPC.rpc) target_include_directories(amdhsa_loader PRIVATE ${LIBC_SOURCE_DIR}) target_link_libraries(amdhsa_loader PRIVATE hsa-runtime64::hsa-runtime64 + rpc_server gpu_loader ) diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -15,7 +15,7 @@ #include "Loader.h" -#include "src/__support/RPC/rpc.h" +#include "utils/gpu/server/rpc_server.h" #include #include @@ -39,30 +39,6 @@ void *buffer; }; -static __llvm_libc::rpc::Server server; - -/// Queries the RPC client at least once and performs server-side work if there -/// are any active requests. -void handle_server() { - while (server.handle( - [&](__llvm_libc::rpc::Buffer *buffer) { - switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) { - case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: { - fputs(reinterpret_cast(&buffer->data[1]), stderr); - break; - } - case __llvm_libc::rpc::Opcode::EXIT: { - exit(buffer->data[1]); - break; - } - default: - return; - }; - }, - [](__llvm_libc::rpc::Buffer *buffer) {})) - ; -} - /// Print the error code and exit if \p code indicates an error. static void handle_error(hsa_status_t code) { if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK) @@ -311,24 +287,7 @@ hsa_amd_memory_fill(dev_ret, 0, sizeof(int)); // Allocate finegrained memory for the RPC server and client to share. 
- void *server_inbox; - void *server_outbox; - void *buffer; - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, sizeof(__llvm_libc::cpp::Atomic), - /*flags=*/0, &server_inbox)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, sizeof(__llvm_libc::cpp::Atomic), - /*flags=*/0, &server_outbox)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, sizeof(__llvm_libc::rpc::Buffer), - /*flags=*/0, &buffer)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_inbox); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer); + rpc_init(allocator); // Initialie all the arguments (explicit and implicit) to zero, then set the // explicit arguments to the values created above. @@ -338,9 +297,9 @@ kernel_args->argv = dev_argv; kernel_args->envp = dev_envp; kernel_args->ret = dev_ret; - kernel_args->inbox = server_outbox; - kernel_args->outbox = server_inbox; - kernel_args->buffer = buffer; + kernel_args->inbox = server.outbox; + kernel_args->outbox = server.inbox; + kernel_args->buffer = server.buffer; // Obtain a packet from the queue. uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); @@ -372,9 +331,6 @@ hsa_signal_create(1, 0, nullptr, &packet->completion_signal)) handle_error(err); - // Initialize the RPC server's buffer for host-device communication. - server.reset(server_inbox, server_outbox, buffer); - // Initialize the packet header and set the doorbell signal to begin execution // by the HSA runtime. uint16_t header = @@ -390,7 +346,7 @@ while (hsa_signal_wait_scacquire( packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0, /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0) - handle_server(); + rpc_handle(); // Create a memory signal and copy the return value back from the device into // a new buffer. 
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt --- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt +++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt @@ -1,9 +1,9 @@ add_executable(nvptx_loader Loader.cpp) -add_dependencies(nvptx_loader libc.src.__support.RPC.rpc) target_include_directories(nvptx_loader PRIVATE ${LIBC_SOURCE_DIR}) target_link_libraries(nvptx_loader PRIVATE gpu_loader + rpc_server CUDA::cuda_driver ) diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -15,7 +15,7 @@ #include "Loader.h" -#include "src/__support/RPC/rpc.h" +#include "utils/gpu/server/rpc_server.h" #include "cuda.h" #include @@ -34,30 +34,6 @@ void *buffer; }; -static __llvm_libc::rpc::Server server; - -/// Queries the RPC client at least once and performs server-side work if there -/// are any active requests. -void handle_server() { - while (server.handle( - [&](__llvm_libc::rpc::Buffer *buffer) { - switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) { - case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: { - fputs(reinterpret_cast(&buffer->data[1]), stderr); - break; - } - case __llvm_libc::rpc::Opcode::EXIT: { - exit(buffer->data[1]); - break; - } - default: - return; - }; - }, - [](__llvm_libc::rpc::Buffer *buffer) {})) - ; -} - static void handle_error(CUresult err) { if (err == CUDA_SUCCESS) return; @@ -132,11 +108,8 @@ if (CUresult err = cuMemsetD32(dev_ret, 0, 1)) handle_error(err); - void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic)); - void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic)); - void *buffer = allocator(sizeof(__llvm_libc::rpc::Buffer)); - if (!server_inbox || !server_outbox || !buffer) - handle_error("Failed to allocate memory the RPC client / server."); + // Allocate finegrained memory for the RPC server and client to share. 
+ rpc_init(allocator); // Set up the arguments to the '_start' kernel on the GPU. uint64_t args_size = sizeof(kernel_args_t); @@ -146,16 +119,13 @@ args.argv = dev_argv; args.envp = dev_envp; args.ret = reinterpret_cast(dev_ret); - args.inbox = server_outbox; - args.outbox = server_inbox; - args.buffer = buffer; + args.inbox = server.outbox; + args.outbox = server.inbox; + args.buffer = server.buffer; void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args, CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, CU_LAUNCH_PARAM_END}; - // Initialize the RPC server's buffer for host-device communication. - server.reset(server_inbox, server_outbox, buffer); - // Call the kernel with the given arguments. if (CUresult err = cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1, @@ -166,7 +136,7 @@ // Wait until the kernel has completed execution on the device. Periodically // check the RPC client for work to be performed on the server. while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY) - handle_server(); + rpc_handle(); // Copy the return value back from the kernel and wait. int host_ret = 0; diff --git a/libc/utils/gpu/server/CMakeLists.txt b/libc/utils/gpu/server/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/server/CMakeLists.txt @@ -0,0 +1,8 @@ +add_library(rpc_server STATIC rpc_server.h rpc_server.cpp) + +# Include the RPC implementation from libc. +add_dependencies(rpc_server libc.src.__support.RPC.rpc) +target_include_directories(rpc_server + PRIVATE + ${LIBC_SOURCE_DIR} +) diff --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/rpc_server.h new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/server/rpc_server.h @@ -0,0 +1,35 @@ +//===-- Shared memory RPC server instantiation ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_UTILS_GPU_SERVER_RPC_SERVER_H +#define LLVM_LIBC_UTILS_GPU_SERVER_RPC_SERVER_H + +#include "src/__support/RPC/rpc.h" + +extern __llvm_libc::rpc::Server server; + +/// Initialize the server with unified memory to communicate with the client. +template void rpc_init(Allocator alloc) { + void *inbox = alloc(sizeof(__llvm_libc::cpp::Atomic)); + void *outbox = alloc(sizeof(__llvm_libc::cpp::Atomic)); + void *buffer = alloc(sizeof(__llvm_libc::rpc::Buffer)); + server.reset(inbox, outbox, buffer); +} + +/// Deallocate the memory associated with the server. +template void rpc_deinit(Deallocator dealloc) { + dealloc(server.inbox); + dealloc(server.outbox); + dealloc(server.buffer); +} + +/// Queries the RPC client at least once and performs server-side work if there +/// are any active requests. +void rpc_handle(); + +#endif diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -0,0 +1,34 @@ +//===-- Shared memory RPC server instantiation ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "rpc_server.h" +#include +#include + +/// The server instance used to communicate with the libc client. 
+__llvm_libc::rpc::Server server; + +void rpc_handle() { + while (server.handle( + [&](__llvm_libc::rpc::Buffer *buffer) { + switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) { + case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: { + fputs(reinterpret_cast(&buffer->data[1]), stderr); + break; + } + case __llvm_libc::rpc::Opcode::EXIT: { + exit(buffer->data[1]); + break; + } + default: + return; + }; + }, + [](__llvm_libc::rpc::Buffer *buffer) {})) + ; +}