diff --git a/libc/src/__support/OSUtil/gpu/io.cpp b/libc/src/__support/OSUtil/gpu/io.cpp
--- a/libc/src/__support/OSUtil/gpu/io.cpp
+++ b/libc/src/__support/OSUtil/gpu/io.cpp
@@ -17,6 +17,7 @@
 void write_to_stderr(cpp::string_view msg) {
   rpc::Client::Port port = rpc::client.open<rpc::PRINT_TO_STDERR>();
   port.send_n(msg.data(), msg.size());
+  port.recv([](rpc::Buffer *) { /* void */ });
   port.close();
 }
 
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -417,44 +417,46 @@
   send([](Buffer *) { /* no-op */ });
 }
 
+/// Helper routine to simplify the interface when sending from the GPU using
+/// thread private pointers to the underlying value.
+template <bool T>
+LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
+  static_assert(is_process_gpu(), "Only valid when running on the GPU");
+  const void **src_ptr = &src;
+  uint64_t *size_ptr = &size;
+  send_n(src_ptr, size_ptr);
+}
+
 /// Sends an arbitrarily sized data buffer \p src across the shared channel in
 /// multiples of the packet length.
 template <bool T>
 LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
-  // TODO: We could send the first bytes in this call and potentially save an
-  // extra send operation.
   uint64_t num_sends = 0;
   send([&](Buffer *buffer, uint32_t id) {
     reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
     num_sends = is_process_gpu() ? lane_value(size, id)
                                  : max(lane_value(size, id), num_sends);
+    uint64_t len =
+        lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
+            ? sizeof(Buffer::data) - sizeof(uint64_t)
+            : lane_value(size, id);
+    inline_memcpy(&buffer->data[1], lane_value(src, id), len);
   });
-  uint64_t idx = 0;
+  uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
+  // Read the lane mask once; every ballot in the loop below reuses it.
   uint64_t mask = process.get_packet(index).header.mask;
   while (gpu::ballot(mask, idx < num_sends)) {
     send([=](Buffer *buffer, uint32_t id) {
-      const uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
-                               ? sizeof(Buffer::data)
-                               : lane_value(size, id) - idx;
+      uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
+                         ? sizeof(Buffer::data)
+                         : lane_value(size, id) - idx;
       if (idx < lane_value(size, id))
-        inline_memcpy(
-            buffer->data,
-            reinterpret_cast<const uint8_t *>(lane_value(src, id)) + idx, len);
+        inline_memcpy(buffer->data, advance(lane_value(src, id), idx), len);
     });
     idx += sizeof(Buffer::data);
   }
 }
 
-/// Helper routine to simplify the interface when sending from the GPU using
-/// thread private pointers to the underlying value.
-template <bool T>
-LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
-  static_assert(is_process_gpu(), "Only valid when running on the GPU");
-  const void **src_ptr = &src;
-  uint64_t *size_ptr = &size;
-  send_n(src_ptr, size_ptr);
-}
-
 /// Receives an arbitrarily sized data buffer across the shared channel in
 /// multiples of the packet length. The \p alloc function is called with the
 /// size of the data so that we can initialize the size of the \p dst buffer.
@@ -468,8 +470,13 @@
         reinterpret_cast<uint8_t *>(alloc(lane_value(size, id)));
     num_recvs = is_process_gpu() ? lane_value(size, id)
                                  : max(lane_value(size, id), num_recvs);
+    uint64_t len =
+        lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
+            ? sizeof(Buffer::data) - sizeof(uint64_t)
+            : lane_value(size, id);
+    inline_memcpy(lane_value(dst, id), &buffer->data[1], len);
   });
-  uint64_t idx = 0;
+  uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
   uint64_t mask = process.get_packet(index).header.mask;
   while (gpu::ballot(mask, idx < num_recvs)) {
     recv([=](Buffer *buffer, uint32_t id) {
@@ -477,8 +484,7 @@
                          ? sizeof(Buffer::data)
                          : lane_value(size, id) - idx;
       if (idx < lane_value(size, id))
-        inline_memcpy(reinterpret_cast<uint8_t *>(lane_value(dst, id)) + idx,
-                      buffer->data, len);
+        inline_memcpy(advance(lane_value(dst, id), idx), buffer->data, len);
     });
     idx += sizeof(Buffer::data);
   }
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H
 #define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H
 
+#include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/properties/architectures.h"
@@ -69,9 +70,13 @@
   return x < y ? y : x;
 }
 
-/// Advance the \p ptr by \p bytes.
-template <typename T, typename U> LIBC_INLINE T *advance(T ptr, U bytes) {
-  return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes);
+/// Advance the pointer \p ptr by \p bytes, preserving its constness.
+template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
+  if constexpr (cpp::is_const_v<T>)
+    return reinterpret_cast<T *>(reinterpret_cast<const uint8_t *>(ptr) +
+                                 bytes);
+  else
+    return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes);
 }
 
 } // namespace rpc
diff --git a/libc/utils/gpu/loader/Server.h b/libc/utils/gpu/loader/Server.h
--- a/libc/utils/gpu/loader/Server.h
+++ b/libc/utils/gpu/loader/Server.h
@@ -35,6 +35,7 @@
       uint64_t sizes[rpc::MAX_LANE_SIZE] = {0};
       void *strs[rpc::MAX_LANE_SIZE] = {nullptr};
       port->recv_n(strs, sizes, [&](uint64_t size) { return new char[size]; });
+      port->send([](rpc::Buffer *) { /* void */ });
       for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {
         if (strs[i]) {
           fwrite(strs[i], sizes[i], 1, stderr);