diff --git a/libc/src/__support/OSUtil/gpu/io.cpp b/libc/src/__support/OSUtil/gpu/io.cpp --- a/libc/src/__support/OSUtil/gpu/io.cpp +++ b/libc/src/__support/OSUtil/gpu/io.cpp @@ -17,6 +17,7 @@ void write_to_stderr(cpp::string_view msg) { rpc::Client::Port port = rpc::client.open(); port.send_n(msg.data(), msg.size()); + port.recv([](rpc::Buffer *) { /* void */ }); port.close(); } diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -417,44 +417,44 @@ send([](Buffer *) { /* no-op */ }); } +/// Helper routine to simplify the interface when sending from the GPU using +/// thread private pointers to the underlying value. +template +LIBC_INLINE void Port::send_n(const void *src, uint64_t size) { + static_assert(is_process_gpu(), "Only valid when running on the GPU"); + const void **src_ptr = &src; + uint64_t *size_ptr = &size; + send_n(src_ptr, size_ptr); +} + /// Sends an arbitrarily sized data buffer \p src across the shared channel in /// multiples of the packet length. template LIBC_INLINE void Port::send_n(const void *const *src, uint64_t *size) { - // TODO: We could send the first bytes in this call and potentially save an - // extra send operation. uint64_t num_sends = 0; send([&](Buffer *buffer, uint32_t id) { reinterpret_cast(buffer->data)[0] = lane_value(size, id); num_sends = is_process_gpu() ? lane_value(size, id) : max(lane_value(size, id), num_sends); + uint64_t len = + lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t) + ? 
sizeof(Buffer::data) - sizeof(uint64_t) + : lane_value(size, id); + inline_memcpy(&buffer->data[1], lane_value(src, id), len); }); - uint64_t idx = 0; - uint64_t mask = process.get_packet(index).header.mask; - while (gpu::ballot(mask, idx < num_sends)) { + uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t); + while (gpu::ballot(process.get_packet(index).header.mask, idx < num_sends)) { send([=](Buffer *buffer, uint32_t id) { - const uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data) - ? sizeof(Buffer::data) - : lane_value(size, id) - idx; + uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data) + ? sizeof(Buffer::data) + : lane_value(size, id) - idx; if (idx < lane_value(size, id)) - inline_memcpy( - buffer->data, - reinterpret_cast(lane_value(src, id)) + idx, len); + inline_memcpy(buffer->data, advance(lane_value(src, id), idx), len); }); idx += sizeof(Buffer::data); } } -/// Helper routine to simplify the interface when sending from the GPU using -/// thread private pointers to the underlying value. -template -LIBC_INLINE void Port::send_n(const void *src, uint64_t size) { - static_assert(is_process_gpu(), "Only valid when running on the GPU"); - const void **src_ptr = &src; - uint64_t *size_ptr = &size; - send_n(src_ptr, size_ptr); -} - /// Receives an arbitrarily sized data buffer across the shared channel in /// multiples of the packet length. The \p alloc function is called with the /// size of the data so that we can initialize the size of the \p dst buffer. @@ -468,8 +468,13 @@ reinterpret_cast(alloc(lane_value(size, id))); num_recvs = is_process_gpu() ? lane_value(size, id) : max(lane_value(size, id), num_recvs); + uint64_t len = + lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t) + ? 
sizeof(Buffer::data) - sizeof(uint64_t) + : lane_value(size, id); + inline_memcpy(lane_value(dst, id), &buffer->data[1], len); }); - uint64_t idx = 0; + uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t); uint64_t mask = process.get_packet(index).header.mask; while (gpu::ballot(mask, idx < num_recvs)) { recv([=](Buffer *buffer, uint32_t id) { @@ -477,8 +482,7 @@ ? sizeof(Buffer::data) : lane_value(size, id) - idx; if (idx < lane_value(size, id)) - inline_memcpy(reinterpret_cast(lane_value(dst, id)) + idx, - buffer->data, len); + inline_memcpy(advance(lane_value(dst, id), idx), buffer->data, len); }); idx += sizeof(Buffer::data); } diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h --- a/libc/src/__support/RPC/rpc_util.h +++ b/libc/src/__support/RPC/rpc_util.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H #define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H +#include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/properties/architectures.h" @@ -69,9 +70,13 @@ return x < y ? y : x; } -/// Advance the \p ptr by \p bytes. -template LIBC_INLINE T *advance(T ptr, U bytes) { - return reinterpret_cast(reinterpret_cast(ptr) + bytes); +/// Advance the \p ptr by \p bytes. 
+template LIBC_INLINE T *advance(T *ptr, U bytes) { + if constexpr (cpp::is_const_v) + return reinterpret_cast(reinterpret_cast(ptr) + + bytes); + else + return reinterpret_cast(reinterpret_cast(ptr) + bytes); } } // namespace rpc diff --git a/libc/utils/gpu/loader/Server.h b/libc/utils/gpu/loader/Server.h --- a/libc/utils/gpu/loader/Server.h +++ b/libc/utils/gpu/loader/Server.h @@ -35,6 +35,7 @@ uint64_t sizes[rpc::MAX_LANE_SIZE] = {0}; void *strs[rpc::MAX_LANE_SIZE] = {nullptr}; port->recv_n(strs, sizes, [&](uint64_t size) { return new char[size]; }); + port->send([](rpc::Buffer *) { /* void */ }); for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) { if (strs[i]) { fwrite(strs[i], sizes[i], 1, stderr);