Patch summary (derived only from the hunks below):
Process::reset() in rpc.h now receives one `state` pointer instead of three
separate inbox/outbox/packet pointers. It locates the two mailboxes and the
packet buffer inside that region with the new memory_offset_* helpers, and
InvertInbox selects which mailbox acts as the inbox. allocation_size()
reports how many bytes the whole region needs. align_up() in rpc_util.h
becomes constexpr so the offset helpers can be constexpr as well. The
amdgpu and nvptx `_begin` kernels and begin_args_t in Loader.h are reduced
to a single `shared_state` pointer to match.

NOTE(review): the text this patch was recovered from had every template
argument list stripped. `cpp::Atomic<uint32_t>`, `reinterpret_cast<Packet *>`,
`template <size_t N>` and `template <typename V, typename A>` are restored
from the surrounding usage; context-line indentation is likewise
reconstructed. Confirm both against the target tree before applying.

diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -113,13 +113,30 @@
   cpp::Atomic<uint32_t> lock[default_port_count] = {0};
 
   /// Initialize the communication channels.
-  LIBC_INLINE void reset(uint64_t port_count, uint32_t lane_size, void *inbox,
-                         void *outbox, void *packet) {
+  LIBC_INLINE void reset(uint64_t port_count, uint32_t lane_size, void *state) {
+    uint64_t p = memory_offset_primary_mailbox(port_count);
+    uint64_t s = memory_offset_secondary_mailbox(port_count);
     this->port_count = port_count;
     this->lane_size = lane_size;
-    this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(inbox);
-    this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(outbox);
-    this->packet = reinterpret_cast<Packet *>(packet);
+    this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
+        (char *)state + (InvertInbox ? s : p));
+    this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
+        (char *)state + (InvertInbox ? p : s));
+    this->packet = reinterpret_cast<Packet *>((char *)state +
+                                              memory_offset_buffer(port_count));
+  }
+
+  /// Allocate a single block of memory for use by client and server
+  /// template <size_t N> // N is generally a runtime value
+  /// struct equivalent {
+  ///   atomic<uint32_t> primary[N];
+  ///   atomic<uint32_t> secondary[N];
+  ///   Packet buffer[N];
+  /// };
+  LIBC_INLINE static constexpr uint64_t allocation_size(uint64_t port_count,
+                                                        uint32_t lane_size) {
+    return memory_offset_buffer(port_count) +
+           memory_allocated_buffer(port_count, lane_size);
   }
 
   /// The length of the packet is flexible because the server needs to look up
@@ -244,6 +261,36 @@
           fn(&packet.payload.slot[i], i);
     }
   }
+
+  /// Number of bytes allocated for mailbox or buffer
+  LIBC_INLINE static constexpr uint64_t
+  memory_allocated_mailbox(uint64_t port_count) {
+    return port_count * sizeof(cpp::Atomic<uint32_t>);
+  }
+
+  LIBC_INLINE static constexpr uint64_t
+  memory_allocated_buffer(uint64_t port_count, uint32_t lane_size) {
+#if defined(LIBC_TARGET_ARCH_IS_GPU)
+    (void)lane_size;
+    return port_count * sizeof(Packet);
+#else
+    return port_count * (sizeof(Packet) + sizeof(Buffer) * lane_size);
+#endif
+  }
+
+  /// Offset of mailbox/buffer in single allocation
+  LIBC_INLINE static constexpr uint64_t
+  memory_offset_primary_mailbox(uint64_t /*port_count*/) {
+    return 0;
+  }
+  LIBC_INLINE static constexpr uint64_t
+  memory_offset_secondary_mailbox(uint64_t port_count) {
+    return memory_allocated_mailbox(port_count);
+  }
+  LIBC_INLINE static constexpr uint64_t
+  memory_offset_buffer(uint64_t port_count) {
+    return align_up(2 * memory_allocated_mailbox(port_count), alignof(Packet));
+  }
 };
 
 /// The port provides the interface to communicate between the multiple
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -50,7 +50,8 @@
 }
 
 /// Return \p val aligned "upwards" according to \p align.
-template <typename V, typename A> LIBC_INLINE V align_up(V val, A align) {
+template <typename V, typename A>
+LIBC_INLINE constexpr V align_up(V val, A align) {
   return ((val + V(align) - 1) / V(align)) * V(align);
 }
 
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -38,12 +38,12 @@
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
-_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+_begin(int argc, char **argv, char **env, void *shared_state) {
   // We need to set up the RPC client first in case any of the constructors
   // require it.
   __llvm_libc::rpc::client.reset(__llvm_libc::rpc::default_port_count,
-                                 __llvm_libc::gpu::get_lane_size(), in, out,
-                                 buffer);
+                                 __llvm_libc::gpu::get_lane_size(),
+                                 shared_state);
 
   // We want the fini array callbacks to be run after other atexit
   // callbacks are run. So, we register them before running the init
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -42,12 +42,12 @@
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+_begin(int argc, char **argv, char **env, void *shared_state) {
   // We need to set up the RPC client first in case any of the constructors
   // require it.
   __llvm_libc::rpc::client.reset(__llvm_libc::rpc::default_port_count,
-                                 __llvm_libc::gpu::get_lane_size(), in, out,
-                                 buffer);
+                                 __llvm_libc::gpu::get_lane_size(),
+                                 shared_state);
 
   // We want the fini array callbacks to be run after other atexit
   // callbacks are run. So, we register them before running the init
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -28,9 +28,7 @@
   int argc;
   void *argv;
   void *envp;
-  void *inbox;
-  void *outbox;
-  void *buffer;
+  void *shared_state;
 };
 
 /// The arguments to the '_start' kernel.
Patch section summary (derived only from the hunk below): the AMDGPU loader
stops making three fine-grained pool allocations (server inbox, server
outbox, packet buffer) and instead makes one allocation whose size comes
from rpc::Server::allocation_size(port_size, wavefront_size). That single
region is made accessible to the device agent and handed to server.reset().

NOTE(review): the removed lines originally read `cpp::Atomic` with its
template argument stripped; `<int>` is restored here because it is the only
common spelling that keeps each of those lines within 80 columns as a
single diff line. Confirm against the target tree before applying.

diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -335,31 +335,18 @@
   if (hsa_status_t err = hsa_agent_get_info(
           dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
     handle_error(err);
-  void *server_inbox;
-  void *server_outbox;
-  void *buffer;
-  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
-          finegrained_pool, port_size * sizeof(__llvm_libc::cpp::Atomic<int>),
-          /*flags=*/0, &server_inbox))
-    handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
-          finegrained_pool, port_size * sizeof(__llvm_libc::cpp::Atomic<int>),
-          /*flags=*/0, &server_outbox))
-    handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
-          finegrained_pool,
-          port_size *
-              align_up(sizeof(__llvm_libc::rpc::Header) +
-                           (wavefront_size * sizeof(__llvm_libc::rpc::Buffer)),
-                       alignof(__llvm_libc::rpc::Packet)),
-          /*flags=*/0, &buffer))
-    handle_error(err);
-  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_inbox);
-  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox);
-  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer);
+
+  uint64_t shared_state_size =
+      __llvm_libc::rpc::Server::allocation_size(port_size, wavefront_size);
+  void *shared_state;
+  if (hsa_status_t err =
+          hsa_amd_memory_pool_allocate(finegrained_pool, shared_state_size,
+                                       /*flags=*/0, &shared_state))
+    handle_error(err);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, shared_state);
 
   // Initialize the RPC server's buffer for host-device communication.
-  server.reset(port_size, wavefront_size, server_inbox, server_outbox, buffer);
+  server.reset(port_size, wavefront_size, shared_state);
 
   // Obtain a queue with the minimum (power of two) size, used to send commands
   // to the HSA runtime and launch execution on the device.
@@ -374,8 +361,7 @@
     handle_error(err);
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
-  begin_args_t init_args = {argc, dev_argv, dev_envp,
-                            server_outbox, server_inbox, buffer};
+  begin_args_t init_args = {argc, dev_argv, dev_envp, shared_state};
   if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
                                        queue, single_threaded_params,
                                        "_begin.kd", init_args))
@@ -422,11 +408,7 @@
     handle_error(err);
   if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
     handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_free(server_inbox))
-    handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_free(server_outbox))
-    handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_free(buffer))
+  if (hsa_status_t err = hsa_amd_memory_pool_free(shared_state))
     handle_error(err);
   if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
     handle_error(err);

Patch section summary (derived only from the hunks below): the NVPTX loader
replaces its three allocator() calls with one allocation sized by
rpc::Server::allocation_size(port_size, warp_size), passes that single
pointer to server.reset() and to the `_begin` kernel arguments, and frees it
with one cuMemFreeHost() call.

NOTE(review): `cpp::Atomic<int>` on the removed lines is restored from
stripped text, and the leading blank context line of the first hunk is
inferred from the hunk's line count. Confirm against the target tree before
applying.

diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -248,24 +248,20 @@
 
   uint64_t port_size = __llvm_libc::rpc::default_port_count;
   uint32_t warp_size = 32;
-  void *server_inbox =
-      allocator(port_size * sizeof(__llvm_libc::cpp::Atomic<int>));
-  void *server_outbox =
-      allocator(port_size * sizeof(__llvm_libc::cpp::Atomic<int>));
-  void *buffer = allocator(
-      port_size * align_up(sizeof(__llvm_libc::rpc::Header) +
-                               (warp_size * sizeof(__llvm_libc::rpc::Buffer)),
-                           alignof(__llvm_libc::rpc::Packet)));
-  if (!server_inbox || !server_outbox || !buffer)
+
+  uint64_t shared_state_size =
+      __llvm_libc::rpc::Server::allocation_size(port_size, warp_size);
+  void *shared_state = allocator(shared_state_size);
+
+  if (!shared_state)
     handle_error("Failed to allocate memory the RPC client / server.");
 
   // Initialize the RPC server's buffer for host-device communication.
-  server.reset(port_size, warp_size, server_inbox, server_outbox, buffer);
+  server.reset(port_size, warp_size, shared_state);
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
   // Call the kernel to
-  begin_args_t init_args = {argc, dev_argv, dev_envp,
-                            server_outbox, server_inbox, buffer};
+  begin_args_t init_args = {argc, dev_argv, dev_envp, shared_state};
   if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
                                    "_begin", init_args))
     handle_error(err);
@@ -295,11 +291,7 @@
     handle_error(err);
   if (CUresult err = cuMemFreeHost(dev_argv))
     handle_error(err);
-  if (CUresult err = cuMemFreeHost(server_inbox))
-    handle_error(err);
-  if (CUresult err = cuMemFreeHost(server_outbox))
-    handle_error(err);
-  if (CUresult err = cuMemFreeHost(buffer))
+  if (CUresult err = cuMemFreeHost(shared_state))
     handle_error(err);
 
   // Destroy the context and the loaded binary.