diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -122,6 +122,7 @@
   return __builtin_amdgcn_readfirstlane(x);
 }
 
+/// Returns a bitmask of the threads in \p lane_mask for which \p x is true.
 [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
   // the lane_mask & gives the nvptx semantics when lane_mask is a subset of
   // the active threads
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -118,6 +118,7 @@
 #endif
 }
 
+/// Returns a bitmask of the threads in \p lane_mask for which \p x is true.
 [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
 #if __CUDA_ARCH__ >= 600
   return __nvvm_vote_ballot_sync(lane_mask, x);
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -430,7 +430,8 @@
     num_sends = is_process_gpu() ? lane_value(size, id)
                                  : max(lane_value(size, id), num_sends);
   });
-  for (uint64_t idx = 0; idx < num_sends; idx += sizeof(Buffer::data)) {
+  uint64_t idx = 0;
+  while (gpu::ballot(process.get_packet(index).header.mask, idx < num_sends)) {
     send([=](Buffer *buffer, uint32_t id) {
       const uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
                                ? sizeof(Buffer::data)
@@ -440,8 +441,8 @@
             buffer->data,
             reinterpret_cast<const uint8_t *>(lane_value(src, id)) + idx, len);
     });
+    idx += sizeof(Buffer::data);
   }
-  gpu::sync_lane(process.get_packet(index).header.mask);
 }
 
 /// Helper routine to simplify the interface when sending from the GPU using
@@ -468,7 +469,8 @@
     num_recvs = is_process_gpu() ? lane_value(size, id)
                                  : max(lane_value(size, id), num_recvs);
   });
-  for (uint64_t idx = 0; idx < num_recvs; idx += sizeof(Buffer::data)) {
+  uint64_t idx = 0;
+  while (gpu::ballot(process.get_packet(index).header.mask, idx < num_recvs)) {
     recv([=](Buffer *buffer, uint32_t id) {
       uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
                          ? sizeof(Buffer::data)
@@ -477,8 +479,8 @@
         inline_memcpy(reinterpret_cast<uint8_t *>(lane_value(dst, id)) + idx,
                       buffer->data, len);
     });
+    idx += sizeof(Buffer::data);
   }
-  return;
 }
 
 /// Attempts to open a port to use as the client. The client can only open a
diff --git a/libc/test/integration/startup/gpu/CMakeLists.txt b/libc/test/integration/startup/gpu/CMakeLists.txt
--- a/libc/test/integration/startup/gpu/CMakeLists.txt
+++ b/libc/test/integration/startup/gpu/CMakeLists.txt
@@ -50,5 +50,6 @@
   SRCS
    rpc_stream_test.cpp
   LOADER_ARGS
-   --threads-x 32
+   --threads 32
+   --blocks 8
 )
diff --git a/libc/test/integration/startup/gpu/rpc_stream_test.cpp b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
--- a/libc/test/integration/startup/gpu/rpc_stream_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
@@ -20,11 +20,12 @@
 using namespace __llvm_libc;
 
 static void test_stream() {
-  const char str[] = "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
-                     "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
-                     "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
-                     "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
-                     "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy";
+  static const char str[] =
+      "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
+      "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
+      "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
+      "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
+      "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy";
   uint64_t send_size = sizeof(str);
   void *send_ptr = malloc(send_size);
   void *recv_ptr;
@@ -44,8 +45,57 @@
   free(recv_ptr);
 }
 
+/// Exercises send_n / recv_n when every thread streams a different number of
+/// bytes: each thread sends `offset` bytes starting at data[offset] and then
+/// asserts that the received size and bytes match what was sent.
+static void test_divergent() {
+  static const uint8_t data[] = {
+      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,
+      15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
+      30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
+      45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+      60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
+      75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
+      90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
+      105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+      120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+      135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+      150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+      165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+      180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+      195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+      210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+      225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+      240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+      255,
+  };
+
+  uint8_t buffer[128] = {0};
+  // Per-thread transfer size in [0, 128), derived from the flat thread index.
+  uint64_t offset =
+      (gpu::get_thread_id() + gpu::get_num_threads() * gpu::get_block_id()) %
+      128;
+  void *recv_ptr;
+  uint64_t recv_size;
+  inline_memcpy(buffer, &data[offset], offset);
+  ASSERT_TRUE(inline_memcmp(buffer, &data[offset], offset) == 0 &&
+              "Data mismatch");
+  rpc::Client::Port port = rpc::client.open<rpc::TEST_STREAM>();
+  port.send_n(buffer, offset);
+  // Clear the staging buffer so stale bytes cannot satisfy the check below.
+  inline_memset(buffer, 0, offset);
+  port.recv_n(&recv_ptr, &recv_size, [&](uint64_t) { return buffer; });
+  port.close();
+
+  // Validate the size before using it as the comparison length.
+  ASSERT_TRUE(recv_size == offset && "Data size mismatch");
+  ASSERT_TRUE(inline_memcmp(recv_ptr, &data[offset], recv_size) == 0 &&
+              "Data mismatch");
+}
+
 TEST_MAIN(int argc, char **argv, char **envp) {
   test_stream();
+  test_divergent();
 
   return 0;
 }