diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -184,13 +184,24 @@
     //
     // mask != packed implies at least one of the threads got the lock
     // atomic semantics of fetch_or mean at most one of the threads for the lock
-    return lane_mask != packed;
+
+    // If holding the lock then the caller can load values knowing said loads
+    // won't move past the lock. No such guarantee is needed if the lock acquire
+    // failed. This conditional branch is expected to fold in the caller after
+    // inlining the current function.
+    bool holding_lock = lane_mask != packed;
+    if (holding_lock)
+      atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    return holding_lock;
   }
 
   /// Unlock the lock at index. We need a lane sync to keep this function
   /// convergent, otherwise the compiler will sink the store and deadlock.
   [[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
                                                 uint64_t index) {
+    // Do not move any writes past the unlock
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+
     // Wait for other threads in the warp to finish using the lock
     gpu::sync_lane(lane_mask);
 
@@ -479,9 +490,6 @@
     if (!this->try_lock(lane_mask, index))
       continue;
 
-    // The mailbox state must be read with the lock held.
-    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-
     uint32_t in = this->load_inbox(index);
     uint32_t out = this->load_outbox(index);
 
@@ -528,13 +536,9 @@
 
     // Attempt to acquire the lock on this index.
     uint64_t lane_mask = gpu::get_lane_mask();
-    // Attempt to acquire the lock on this index.
     if (!this->try_lock(lane_mask, index))
      continue;
 
-    // The mailbox state must be read with the lock held.
-    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-
     in = this->load_inbox(index);
     out = this->load_outbox(index);
 
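
For reference, below is a minimal standalone sketch (not part of the patch, and not the libc RPC types) of the fence placement this change folds into try_lock/unlock: a relaxed fetch_or followed by an ACQUIRE fence only on the successful path, and a RELEASE fence ahead of the operation that drops the lock. It is written against plain std::atomic rather than cpp::Atomic/cpp::MemoryOrder, and the names (BitLock, word) are illustrative.

#include <atomic>
#include <cstdint>

// One-bit spin lock; bit 0 of `word` set means the lock is held.
struct BitLock {
  std::atomic<uint32_t> word{0};

  // Returns true if the caller now holds the lock.
  bool try_lock() {
    // Relaxed RMW: at most one caller observes the 0 -> 1 transition.
    bool holding_lock =
        (word.fetch_or(1u, std::memory_order_relaxed) & 1u) == 0;
    // Only a successful acquire needs to keep later loads from moving
    // above the lock; the failure path takes no fence.
    if (holding_lock)
      std::atomic_thread_fence(std::memory_order_acquire);
    return holding_lock;
  }

  void unlock() {
    // Keep earlier writes from moving past the point where the lock is
    // released, then clear the bit with a relaxed RMW.
    std::atomic_thread_fence(std::memory_order_release);
    word.fetch_and(~1u, std::memory_order_relaxed);
  }
};

The fence-on-success branch mirrors the patch's comment: it is expected to fold away once try_lock is inlined into a caller that already branches on the result, so the fast path that fails to take the lock pays for no fence.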