diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h --- a/libc/src/__support/RPC/rpc_util.h +++ b/libc/src/__support/RPC/rpc_util.h @@ -17,9 +17,6 @@ namespace __llvm_libc { namespace rpc { -/// Maximum amount of data a single lane can use. -constexpr uint64_t MAX_LANE_SIZE = 64; - /// Suspend the thread briefly to assist the thread scheduler during busy loops. LIBC_INLINE void sleep_briefly() { #if defined(LIBC_TARGET_ARCH_IS_NVPTX) && __CUDA_ARCH__ >= 700 diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h --- a/libc/utils/gpu/loader/Loader.h +++ b/libc/utils/gpu/loader/Loader.h @@ -107,7 +107,9 @@ handle_error("Failure in the RPC server\n"); } +template inline void register_rpc_callbacks(uint32_t device_id) { + static_assert(lane_size == 32 || lane_size == 64, "Invalid Lane size"); // Register the ping test for the `libc` tests. rpc_register_callback( device_id, static_cast(RPC_TEST_INCREMENT), @@ -207,14 +209,14 @@ rpc_register_callback( device_id, static_cast(RPC_TEST_STREAM), [](rpc_port_t port, void *data) { - uint64_t sizes[RPC_MAXIMUM_LANE_SIZE] = {0}; - void *dst[RPC_MAXIMUM_LANE_SIZE] = {nullptr}; + uint64_t sizes[lane_size] = {0}; + void *dst[lane_size] = {nullptr}; rpc_recv_n( port, dst, sizes, [](uint64_t size, void *) -> void * { return new char[size]; }, nullptr); rpc_send_n(port, dst, sizes); - for (uint64_t i = 0; i < RPC_MAXIMUM_LANE_SIZE; ++i) { + for (uint64_t i = 0; i < lane_size; ++i) { if (dst[i]) delete[] reinterpret_cast(dst[i]); } diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -150,8 +150,6 @@ // Register RPC callbacks for the malloc and free functions on HSA. uint32_t device_id = 0; - register_rpc_callbacks(device_id); - auto tuple = std::make_tuple(dev_agent, coarsegrained_pool); rpc_register_callback( device_id, RPC_MALLOC, @@ -424,6 +422,14 @@ wavefront_size, rpc_alloc, &tuple)) handle_error(err); + // Register callbacks for the RPC unit tests. + if (wavefront_size == 32) + register_rpc_callbacks<32>(device_id); + else if (wavefront_size == 64) + register_rpc_callbacks<64>(device_id); + else + handle_error("Invalid wavefront size"); + // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. // If the clock_freq symbol is missing, no work to do. hsa_executable_symbol_t freq_sym; diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -177,7 +177,7 @@ // Register RPC callbacks for the malloc and free functions on HSA. uint32_t device_id = 0; - register_rpc_callbacks(device_id); + register_rpc_callbacks<32>(device_id); rpc_register_callback( device_id, RPC_MALLOC, diff --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/rpc_server.h --- a/libc/utils/gpu/server/rpc_server.h +++ b/libc/utils/gpu/server/rpc_server.h @@ -20,9 +20,6 @@ /// The maxium number of ports that can be opened for any server. const uint64_t RPC_MAXIMUM_PORT_COUNT = 512; -/// The maximum number of parallel lanes that we can support. -const uint64_t RPC_MAXIMUM_LANE_SIZE = 64; - /// The symbol name associated with the client for use with the LLVM C library /// implementation. const char *const rpc_client_symbol_name = "__llvm_libc_rpc_client"; diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp --- a/libc/utils/gpu/server/rpc_server.cpp +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -26,9 +26,6 @@ static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT, "Incorrect maximum port count"); -static_assert(RPC_MAXIMUM_LANE_SIZE == rpc::MAX_LANE_SIZE, - "Incorrect maximum port count"); - // The client needs to support different lane sizes for the SIMT model. Because // of this we need to select between the possible sizes that the client can use. struct Server { @@ -80,9 +77,9 @@ case RPC_WRITE_TO_STREAM: case RPC_WRITE_TO_STDERR: case RPC_WRITE_TO_STDOUT: { - uint64_t sizes[rpc::MAX_LANE_SIZE] = {0}; - void *strs[rpc::MAX_LANE_SIZE] = {nullptr}; - FILE *files[rpc::MAX_LANE_SIZE] = {nullptr}; + uint64_t sizes[lane_size] = {0}; + void *strs[lane_size] = {nullptr}; + FILE *files[lane_size] = {nullptr}; if (port->get_opcode() == RPC_WRITE_TO_STREAM) port->recv([&](rpc::Buffer *buffer, uint32_t id) { files[id] = reinterpret_cast(buffer->data[0]); @@ -96,18 +93,15 @@ : files[id]); uint64_t ret = fwrite(strs[id], 1, sizes[id], file); std::memcpy(buffer->data, &ret, sizeof(uint64_t)); + delete[] reinterpret_cast(strs[id]); }); - for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) { - if (strs[i]) - delete[] reinterpret_cast(strs[i]); - } break; } case RPC_READ_FROM_STREAM: case RPC_READ_FROM_STDIN: { - uint64_t sizes[rpc::MAX_LANE_SIZE] = {0}; - void *data[rpc::MAX_LANE_SIZE] = {nullptr}; - uint64_t rets[rpc::MAX_LANE_SIZE] = {0}; + uint64_t sizes[lane_size] = {0}; + void *data[lane_size] = {nullptr}; + uint64_t rets[lane_size] = {0}; port->recv([&](rpc::Buffer *buffer, uint32_t id) { sizes[id] = buffer->data[0]; data[id] = new char[sizes[id]]; @@ -124,8 +118,8 @@ break; } case RPC_OPEN_FILE: { - uint64_t sizes[rpc::MAX_LANE_SIZE] = {0}; - void *paths[rpc::MAX_LANE_SIZE] = {nullptr}; + uint64_t sizes[lane_size] = {0}; + void *paths[lane_size] = {nullptr}; port->recv_n(paths, sizes, [&](uint64_t size) { return new char[size]; }); port->recv_and_send([&](rpc::Buffer *buffer, uint32_t id) { FILE *file = fopen(reinterpret_cast(paths[id]), @@ -152,8 +146,8 @@ break; } case RPC_HOST_CALL: { - uint64_t sizes[rpc::MAX_LANE_SIZE] = {0}; - void *args[rpc::MAX_LANE_SIZE] = {nullptr}; + uint64_t sizes[lane_size] = {0}; + void *args[lane_size] = {nullptr}; port->recv_n(args, sizes, [&](uint64_t size) { return new char[size]; }); port->recv([&](rpc::Buffer *buffer, uint32_t id) { reinterpret_cast(buffer->data[0])(args[id]);