diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h --- a/libc/src/__support/GPU/amdgpu/utils.h +++ b/libc/src/__support/GPU/amdgpu/utils.h @@ -125,7 +125,8 @@ } /// Copies the value from the first active thread in the wavefront to the rest. -[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) { +[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t, + uint32_t x) { return __builtin_amdgcn_readfirstlane(x); } diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h --- a/libc/src/__support/GPU/generic/utils.h +++ b/libc/src/__support/GPU/generic/utils.h @@ -61,12 +61,9 @@ LIBC_INLINE uint64_t get_lane_mask() { return 1; } -LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; } +LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; } -LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { - (void)lane_mask; - return x; -} +LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; } LIBC_INLINE void sync_threads() {} diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h --- a/libc/src/__support/GPU/nvptx/utils.h +++ b/libc/src/__support/GPU/nvptx/utils.h @@ -111,14 +111,12 @@ } /// Copies the value from the first active thread in the warp to the rest. -[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) { - // NOTE: This is not sufficient in all cases on Volta hardware or later. The - // lane mask returned here is not always the true lane mask used by the - // intrinsics in cases of incedental or enforced divergence by the user. 
- uint32_t lane_mask = static_cast<uint32_t>(get_lane_mask()); - uint32_t id = __builtin_ffs(lane_mask) - 1; +[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, + uint32_t x) { + uint32_t mask = static_cast<uint32_t>(lane_mask); + uint32_t id = __builtin_ffs(mask) - 1; #if __CUDA_ARCH__ >= 600 - return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1); + return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1); #else return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1); #endif @@ -126,10 +124,11 @@ /// Returns a bitmask of threads in the current lane for which \p x is true. [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { + uint32_t mask = static_cast<uint32_t>(lane_mask); #if __CUDA_ARCH__ >= 600 - return __nvvm_vote_ballot_sync(static_cast<uint32_t>(lane_mask), x); + return __nvvm_vote_ballot_sync(mask, x); #else - return static_cast<uint32_t>(lane_mask) & __nvvm_vote_ballot(x); + return mask & __nvvm_vote_ballot(x); #endif } /// Waits for all the threads in the block to converge and issues a fence. diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -116,13 +116,15 @@ } /// Retrieve the inbox state from memory shared between processes. - LIBC_INLINE uint32_t load_inbox(uint32_t index) { - return inbox[index].load(cpp::MemoryOrder::RELAXED); + LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) { + return gpu::broadcast_value(lane_mask, + inbox[index].load(cpp::MemoryOrder::RELAXED)); } /// Retrieve the outbox state from memory shared between processes. - LIBC_INLINE uint32_t load_outbox(uint32_t index) { - return outbox[index].load(cpp::MemoryOrder::RELAXED); + LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) { + return gpu::broadcast_value(lane_mask, + outbox[index].load(cpp::MemoryOrder::RELAXED)); } /// Signal to the other process that this one is finished with the buffer. 
@@ -138,11 +140,11 @@ // Given the current outbox and inbox values, wait until the inbox changes // to indicate that this thread owns the buffer element. - LIBC_INLINE void wait_for_ownership(uint32_t index, uint32_t outbox, - uint32_t in) { + LIBC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index, + uint32_t outbox, uint32_t in) { while (buffer_unavailable(in, outbox)) { sleep_briefly(); - in = load_inbox(index); + in = load_inbox(lane_mask, index); } atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); } @@ -393,10 +395,10 @@ template <bool T> template <typename F> LIBC_INLINE void Port<T>::send(F fill) { - uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index); + uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index); // We need to wait until we own the buffer before sending. - process.wait_for_ownership(index, out, in); + process.wait_for_ownership(lane_mask, index, out, in); // Apply the \p fill function to initialize the buffer and release the memory. invoke_rpc(fill, process.packet[index]); @@ -416,10 +418,10 @@ owns_buffer = false; } - uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index); + uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index); // We need to wait until we own the buffer before receiving. - process.wait_for_ownership(index, out, in); + process.wait_for_ownership(lane_mask, index, out, in); // Apply the \p use function to read the memory out of the buffer. invoke_rpc(use, process.packet[index]); @@ -534,8 +536,8 @@ if (!process.try_lock(lane_mask, index)) continue; - uint32_t in = process.load_inbox(index); - uint32_t out = process.load_outbox(index); + uint32_t in = process.load_inbox(lane_mask, index); + uint32_t out = process.load_outbox(lane_mask, index); // Once we acquire the index we need to check if we are in a valid sending // state. @@ -561,8 +563,9 @@ Server::try_open() { // Perform a naive linear scan for a port that has a pending request. 
for (uint32_t index = 0; index < process.port_count; ++index) { - uint32_t in = process.load_inbox(index); - uint32_t out = process.load_outbox(index); + uint64_t lane_mask = gpu::get_lane_mask(); + uint32_t in = process.load_inbox(lane_mask, index); + uint32_t out = process.load_outbox(lane_mask, index); // The server is passive, if there is no work pending don't bother // opening a port. @@ -570,12 +573,11 @@ continue; // Attempt to acquire the lock on this index. - uint64_t lane_mask = gpu::get_lane_mask(); if (!process.try_lock(lane_mask, index)) continue; - in = process.load_inbox(index); - out = process.load_outbox(index); + in = process.load_inbox(lane_mask, index); + out = process.load_outbox(lane_mask, index); if (process.buffer_unavailable(in, out)) { process.unlock(lane_mask, index); diff --git a/libc/test/src/__support/RPC/rpc_smoke_test.cpp b/libc/test/src/__support/RPC/rpc_smoke_test.cpp --- a/libc/test/src/__support/RPC/rpc_smoke_test.cpp +++ b/libc/test/src/__support/RPC/rpc_smoke_test.cpp @@ -49,36 +49,37 @@ EXPECT_TRUE(ProcB.try_lock(lane_mask, index)); // All zero to begin with - EXPECT_EQ(ProcA.load_inbox(index), 0u); - EXPECT_EQ(ProcB.load_inbox(index), 0u); - EXPECT_EQ(ProcA.load_outbox(index), 0u); - EXPECT_EQ(ProcB.load_outbox(index), 0u); + EXPECT_EQ(ProcA.load_inbox(lane_mask, index), 0u); + EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 0u); + EXPECT_EQ(ProcA.load_outbox(lane_mask, index), 0u); + EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u); // Available for ProcA and not for ProcB - EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(index), - ProcA.load_outbox(index))); - EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(index), - ProcB.load_outbox(index))); + EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index), + ProcA.load_outbox(lane_mask, index))); + EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index), + ProcB.load_outbox(lane_mask, index))); // ProcA write to outbox - uint32_t 
ProcAOutbox = ProcA.load_outbox(index); + uint32_t ProcAOutbox = ProcA.load_outbox(lane_mask, index); EXPECT_EQ(ProcAOutbox, 0u); ProcAOutbox = ProcA.invert_outbox(index, ProcAOutbox); EXPECT_EQ(ProcAOutbox, 1u); // No longer available for ProcA - EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(index), ProcAOutbox)); + EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index), + ProcAOutbox)); // Outbox is still zero, hasn't been written to - EXPECT_EQ(ProcB.load_outbox(index), 0u); + EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u); // Wait for ownership will terminate because load_inbox returns 1 - EXPECT_EQ(ProcB.load_inbox(index), 1u); - ProcB.wait_for_ownership(index, 0u, 0u); + EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 1u); + ProcB.wait_for_ownership(lane_mask, index, 0u, 0u); // and B now has the buffer available - EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(index), - ProcB.load_outbox(index))); + EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index), + ProcB.load_outbox(lane_mask, index))); // Enough checks for one test, close the locks ProcA.unlock(lane_mask, index);