diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h --- a/libc/src/__support/GPU/amdgpu/utils.h +++ b/libc/src/__support/GPU/amdgpu/utils.h @@ -125,7 +125,8 @@ } /// Copies the value from the first active thread in the wavefront to the rest. -[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) { +[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t, + uint32_t x) { return __builtin_amdgcn_readfirstlane(x); } diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h --- a/libc/src/__support/GPU/generic/utils.h +++ b/libc/src/__support/GPU/generic/utils.h @@ -61,12 +61,9 @@ LIBC_INLINE uint64_t get_lane_mask() { return 1; } -LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; } +LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; } -LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { - (void)lane_mask; - return x; -} +LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; } LIBC_INLINE void sync_threads() {} diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h --- a/libc/src/__support/GPU/nvptx/utils.h +++ b/libc/src/__support/GPU/nvptx/utils.h @@ -111,14 +111,12 @@ } /// Copies the value from the first active thread in the warp to the rest. -[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) { - // NOTE: This is not sufficient in all cases on Volta hardware or later. The - // lane mask returned here is not always the true lane mask used by the - // intrinsics in cases of incedental or enforced divergence by the user. 
- uint32_t lane_mask = static_cast<uint32_t>(get_lane_mask()); - uint32_t id = __builtin_ffs(lane_mask) - 1; +[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, + uint32_t x) { + uint32_t mask = static_cast<uint32_t>(lane_mask); + uint32_t id = __builtin_ffs(mask) - 1; #if __CUDA_ARCH__ >= 600 - return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1); + return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1); #else return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1); #endif @@ -126,10 +124,11 @@ /// Returns a bitmask of threads in the current lane for which \p x is true. [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { + uint32_t mask = static_cast<uint32_t>(lane_mask); #if __CUDA_ARCH__ >= 600 - return __nvvm_vote_ballot_sync(static_cast<uint32_t>(lane_mask), x); + return __nvvm_vote_ballot_sync(mask, x); #else - return static_cast<uint32_t>(lane_mask) & __nvvm_vote_ballot(x); + return mask & __nvvm_vote_ballot(x); #endif } /// Waits for all the threads in the block to converge and issues a fence. diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -116,13 +116,15 @@ } /// Retrieve the inbox state from memory shared between processes. - LIBC_INLINE uint32_t load_inbox(uint32_t index) { - return inbox[index].load(cpp::MemoryOrder::RELAXED); + LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) { + return gpu::broadcast_value(lane_mask, + inbox[index].load(cpp::MemoryOrder::RELAXED)); } /// Retrieve the outbox state from memory shared between processes. - LIBC_INLINE uint32_t load_outbox(uint32_t index) { - return outbox[index].load(cpp::MemoryOrder::RELAXED); + LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) { + return gpu::broadcast_value(lane_mask, + outbox[index].load(cpp::MemoryOrder::RELAXED)); } /// Signal to the other process that this one is finished with the buffer. 
@@ -138,11 +140,11 @@ // Given the current outbox and inbox values, wait until the inbox changes // to indicate that this thread owns the buffer element. - LIBC_INLINE void wait_for_ownership(uint32_t index, uint32_t outbox, - uint32_t in) { + LIBC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index, + uint32_t outbox, uint32_t in) { while (buffer_unavailable(in, outbox)) { sleep_briefly(); - in = load_inbox(index); + in = load_inbox(lane_mask, index); } atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); } @@ -393,10 +395,10 @@ template <bool T> template <typename F> LIBC_INLINE void Port<T>::send(F fill) { - uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index); + uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index); // We need to wait until we own the buffer before sending. - process.wait_for_ownership(index, out, in); + process.wait_for_ownership(lane_mask, index, out, in); // Apply the \p fill function to initialize the buffer and release the memory. invoke_rpc(fill, process.packet[index]); @@ -416,10 +418,10 @@ owns_buffer = false; } - uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index); + uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index); // We need to wait until we own the buffer before receiving. - process.wait_for_ownership(index, out, in); + process.wait_for_ownership(lane_mask, index, out, in); // Apply the \p use function to read the memory out of the buffer. invoke_rpc(use, process.packet[index]); @@ -534,8 +536,8 @@ if (!process.try_lock(lane_mask, index)) continue; - uint32_t in = process.load_inbox(index); - uint32_t out = process.load_outbox(index); + uint32_t in = process.load_inbox(lane_mask, index); + uint32_t out = process.load_outbox(lane_mask, index); // Once we acquire the index we need to check if we are in a valid sending // state. @@ -561,8 +563,9 @@ Server::try_open() { // Perform a naive linear scan for a port that has a pending request. 
for (uint32_t index = 0; index < process.port_count; ++index) { - uint32_t in = process.load_inbox(index); - uint32_t out = process.load_outbox(index); + uint64_t lane_mask = gpu::get_lane_mask(); + uint32_t in = process.load_inbox(lane_mask, index); + uint32_t out = process.load_outbox(lane_mask, index); // The server is passive, if there is no work pending don't bother // opening a port. @@ -570,12 +573,11 @@ continue; // Attempt to acquire the lock on this index. - uint64_t lane_mask = gpu::get_lane_mask(); if (!process.try_lock(lane_mask, index)) continue; - in = process.load_inbox(index); - out = process.load_outbox(index); + in = process.load_inbox(lane_mask, index); + out = process.load_outbox(lane_mask, index); if (process.buffer_unavailable(in, out)) { process.unlock(lane_mask, index); diff --git a/libc/test/src/__support/RPC/rpc_smoke_test.cpp b/libc/test/src/__support/RPC/rpc_smoke_test.cpp --- a/libc/test/src/__support/RPC/rpc_smoke_test.cpp +++ b/libc/test/src/__support/RPC/rpc_smoke_test.cpp @@ -49,36 +49,37 @@ EXPECT_TRUE(ProcB.try_lock(lane_mask, index)); // All zero to begin with - EXPECT_EQ(ProcA.load_inbox(index), 0u); - EXPECT_EQ(ProcB.load_inbox(index), 0u); - EXPECT_EQ(ProcA.load_outbox(index), 0u); - EXPECT_EQ(ProcB.load_outbox(index), 0u); + EXPECT_EQ(ProcA.load_inbox(lane_mask, index), 0u); + EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 0u); + EXPECT_EQ(ProcA.load_outbox(lane_mask, index), 0u); + EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u); // Available for ProcA and not for ProcB - EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(index), - ProcA.load_outbox(index))); - EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(index), - ProcB.load_outbox(index))); + EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index), + ProcA.load_outbox(lane_mask, index))); + EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index), + ProcB.load_outbox(lane_mask, index))); // ProcA write to outbox - uint32_t 
ProcAOutbox = ProcA.load_outbox(index); + uint32_t ProcAOutbox = ProcA.load_outbox(lane_mask, index); EXPECT_EQ(ProcAOutbox, 0u); ProcAOutbox = ProcA.invert_outbox(index, ProcAOutbox); EXPECT_EQ(ProcAOutbox, 1u); // No longer available for ProcA - EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(index), ProcAOutbox)); + EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index), + ProcAOutbox)); // Outbox is still zero, hasn't been written to - EXPECT_EQ(ProcB.load_outbox(index), 0u); + EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u); // Wait for ownership will terminate because load_inbox returns 1 - EXPECT_EQ(ProcB.load_inbox(index), 1u); - ProcB.wait_for_ownership(index, 0u, 0u); + EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 1u); + ProcB.wait_for_ownership(lane_mask, index, 0u, 0u); // and B now has the buffer available - EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(index), - ProcB.load_outbox(index))); + EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index), + ProcB.load_outbox(lane_mask, index))); // Enough checks for one test, close the locks ProcA.unlock(lane_mask, index);