diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt --- a/libc/utils/gpu/CMakeLists.txt +++ b/libc/utils/gpu/CMakeLists.txt @@ -1 +1,2 @@ +add_subdirectory(server) add_subdirectory(loader) diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt --- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt +++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt @@ -1,9 +1,8 @@ add_executable(amdhsa_loader Loader.cpp) -add_dependencies(amdhsa_loader libc.src.__support.RPC.rpc) -target_include_directories(amdhsa_loader PRIVATE ${LIBC_SOURCE_DIR}) target_link_libraries(amdhsa_loader PRIVATE hsa-runtime64::hsa-runtime64 + rpc_server gpu_loader ) diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -15,7 +15,7 @@ #include "Loader.h" -#include "src/__support/RPC/rpc.h" +#include "rpc_server.h" #include #include @@ -23,6 +23,7 @@ #include #include #include +#include <tuple> #include /// The name of the kernel we will launch. All AMDHSA kernels end with '.kd'. @@ -39,30 +40,6 @@ void *buffer; }; -static __llvm_libc::rpc::Server server; - -/// Queries the RPC client at least once and performs server-side work if there -/// are any active requests. -void handle_server() { - while (server.handle( - [&](__llvm_libc::rpc::Buffer *buffer) { - switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) { - case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: { - fputs(reinterpret_cast(&buffer->data[1]), stderr); - break; - } - case __llvm_libc::rpc::Opcode::EXIT: { - exit(buffer->data[1]); - break; - } - default: - return; - }; - }, - [](__llvm_libc::rpc::Buffer *buffer) {})) - ; -} - /// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) { if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK) @@ -311,24 +288,18 @@ hsa_amd_memory_fill(dev_ret, 0, sizeof(int)); // Allocate finegrained memory for the RPC server and client to share. - void *server_inbox; - void *server_outbox; - void *buffer; - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, sizeof(__llvm_libc::cpp::Atomic), - /*flags=*/0, &server_inbox)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, sizeof(__llvm_libc::cpp::Atomic), - /*flags=*/0, &server_outbox)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, sizeof(__llvm_libc::rpc::Buffer), - /*flags=*/0, &buffer)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_inbox); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer); + auto rpc_data = std::make_tuple(finegrained_pool, dev_agent); + auto rpc_allocator = [](uint64_t size, void *data) -> void * { + auto &[finegrained_pool, dev_agent] = + *static_cast<decltype(rpc_data) *>(data); + void *dev_ptr = nullptr; + if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size, + /*flags=*/0, &dev_ptr)) + handle_error(err); + handle_error(hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr)); + return dev_ptr; + }; + rpc_init(rpc_allocator, &rpc_data); // Initialie all the arguments (explicit and implicit) to zero, then set the // explicit arguments to the values created above. @@ -338,9 +309,9 @@ kernel_args->argv = dev_argv; kernel_args->envp = dev_envp; kernel_args->ret = dev_ret; - kernel_args->inbox = server_outbox; - kernel_args->outbox = server_inbox; - kernel_args->buffer = buffer; + kernel_args->inbox = rpc_get_outbox(); + kernel_args->outbox = rpc_get_inbox(); + kernel_args->buffer = rpc_get_buffer(); // Obtain a packet from the queue.
uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); @@ -372,9 +343,6 @@ hsa_signal_create(1, 0, nullptr, &packet->completion_signal)) handle_error(err); - // Initialize the RPC server's buffer for host-device communication. - server.reset(server_inbox, server_outbox, buffer); - // Initialize the packet header and set the doorbell signal to begin execution // by the HSA runtime. uint16_t header = @@ -390,7 +358,7 @@ while (hsa_signal_wait_scacquire( packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0, /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0) - handle_server(); + rpc_handle(); // Create a memory signal and copy the return value back from the device into // a new buffer. diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt --- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt +++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt @@ -1,9 +1,8 @@ add_executable(nvptx_loader Loader.cpp) -add_dependencies(nvptx_loader libc.src.__support.RPC.rpc) -target_include_directories(nvptx_loader PRIVATE ${LIBC_SOURCE_DIR}) target_link_libraries(nvptx_loader PRIVATE gpu_loader + rpc_server CUDA::cuda_driver ) diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -15,7 +15,7 @@ #include "Loader.h" -#include "src/__support/RPC/rpc.h" +#include "rpc_server.h" #include "cuda.h" #include @@ -34,30 +34,6 @@ void *buffer; }; -static __llvm_libc::rpc::Server server; - -/// Queries the RPC client at least once and performs server-side work if there -/// are any active requests. 
-void handle_server() { - while (server.handle( - [&](__llvm_libc::rpc::Buffer *buffer) { - switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) { - case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: { - fputs(reinterpret_cast(&buffer->data[1]), stderr); - break; - } - case __llvm_libc::rpc::Opcode::EXIT: { - exit(buffer->data[1]); - break; - } - default: - return; - }; - }, - [](__llvm_libc::rpc::Buffer *buffer) {})) - ; -} - static void handle_error(CUresult err) { if (err == CUDA_SUCCESS) return; @@ -132,11 +108,14 @@ if (CUresult err = cuMemsetD32(dev_ret, 0, 1)) handle_error(err); - void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic)); - void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic)); - void *buffer = allocator(sizeof(__llvm_libc::rpc::Buffer)); - if (!server_inbox || !server_outbox || !buffer) - handle_error("Failed to allocate memory the RPC client / server."); + // Allocate page-locked host memory for the RPC server and client to share. + auto rpc_allocator = [](uint64_t size, void *) -> void * { + void *dev_ptr = nullptr; + if (CUresult err = cuMemAllocHost(&dev_ptr, size)) + handle_error(err); + return dev_ptr; + }; + rpc_init(rpc_allocator, nullptr); // Set up the arguments to the '_start' kernel on the GPU. uint64_t args_size = sizeof(kernel_args_t); @@ -146,16 +125,13 @@ args.argv = dev_argv; args.envp = dev_envp; args.ret = reinterpret_cast(dev_ret); - args.inbox = server_outbox; - args.outbox = server_inbox; - args.buffer = buffer; + args.inbox = rpc_get_outbox(); + args.outbox = rpc_get_inbox(); + args.buffer = rpc_get_buffer(); void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args, CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, CU_LAUNCH_PARAM_END}; - // Initialize the RPC server's buffer for host-device communication. - server.reset(server_inbox, server_outbox, buffer); - // Call the kernel with the given arguments.
if (CUresult err = cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1, @@ -166,7 +142,7 @@ // Wait until the kernel has completed execution on the device. Periodically // check the RPC client for work to be performed on the server. while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY) - handle_server(); + rpc_handle(); // Copy the return value back from the kernel and wait. int host_ret = 0; diff --git a/libc/utils/gpu/server/CMakeLists.txt b/libc/utils/gpu/server/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/server/CMakeLists.txt @@ -0,0 +1,6 @@ +add_library(rpc_server STATIC rpc_server.h rpc_server.cpp) + +# Include the RPC implementation from libc. +add_dependencies(rpc_server libc.src.__support.RPC.rpc) +target_include_directories(rpc_server PRIVATE ${LIBC_SOURCE_DIR}) +target_include_directories(rpc_server PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/rpc_server.h new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/server/rpc_server.h @@ -0,0 +1,46 @@ +//===-- Shared memory RPC server instantiation ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_UTILS_GPU_SERVER_RPC_SERVER_H +#define LLVM_LIBC_UTILS_GPU_SERVER_RPC_SERVER_H + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void *(rpc_alloc_ty)(uint64_t size, void *data); + +typedef void(rpc_dealloc_ty)(void *ptr, void *data); + +/// Initialize the server with unified memory to communicate with the client. +void rpc_init(rpc_alloc_ty alloc, void *data); + +/// Deallocate the memory associated with the server.
+void rpc_deinit(rpc_dealloc_ty dealloc, void *data); + +/// Queries the RPC client at least once and performs server-side work if there +/// are any active requests. +void rpc_handle(void); + +/// Get the pointer to the data inbox. +/// TODO: We should try to compress this into a single buffer. +void *rpc_get_inbox(void); + +/// Get the pointer to the data outbox. +void *rpc_get_outbox(void); + +/// Get the pointer to the data buffer. +void *rpc_get_buffer(void); + +#ifdef __cplusplus +} +#endif + +#endif // LLVM_LIBC_UTILS_GPU_SERVER_RPC_SERVER_H diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -0,0 +1,56 @@ +//===-- Shared memory RPC server instantiation ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "rpc_server.h" + +#include "src/__support/RPC/rpc.h" + +#include <cstdio> +#include <cstdlib> + +/// The server instance used to communicate with the libc client.
+static __llvm_libc::rpc::Server server; + +void rpc_init(rpc_alloc_ty alloc, void *data) { + void *inbox = alloc(sizeof(__llvm_libc::cpp::Atomic), data); + void *outbox = alloc(sizeof(__llvm_libc::cpp::Atomic), data); + void *buffer = alloc(sizeof(__llvm_libc::rpc::Buffer), data); + server.reset(inbox, outbox, buffer); +} + +void rpc_deinit(rpc_dealloc_ty dealloc, void *data) { + dealloc(server.inbox, data); + dealloc(server.outbox, data); + dealloc(server.buffer, data); +} + +void rpc_handle() { + while (server.handle( + [&](__llvm_libc::rpc::Buffer *buffer) { + switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) { + case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: { + fputs(reinterpret_cast<const char *>(&buffer->data[1]), stderr); + break; + } + case __llvm_libc::rpc::Opcode::EXIT: { + exit(static_cast<int>(buffer->data[1])); + break; + } + default: + return; + } + }, + [](__llvm_libc::rpc::Buffer *buffer) {})) + ; +} + +void *rpc_get_inbox() { return server.inbox; } + +void *rpc_get_outbox() { return server.outbox; } + +void *rpc_get_buffer() { return server.buffer; }