This is an archive of the discontinued LLVM Phabricator instance.

[libc][rpc] Update locking to work on volta
Closed · Public

Authored by JonChesterfield on May 4 2023, 1:14 PM.

Download Raw Diff

Details

Reviewers

jhuber6
jdoerfert

Commits

rG75b7b9f292f7: [libc][rpc] Update locking to work on volta
rGb1323738649e: [libc][rpc] Update locking to work on volta

Summary

Carefully work around not knowing the thread mask that nvptx intrinsic
functions require.

If the warp is converged when calling try_lock, a single rpc call will handle
all lanes within it. Otherwise, more than one rpc call will occur, with thread
masks that together compose the unknown one.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

JonChesterfield created this revision. · May 4 2023, 1:14 PM

Herald added projects: Restricted Project, Restricted Project. · View Herald Transcript · May 4 2023, 1:14 PM

Herald added subscribers: libc-commits, kosarev, mattd and 5 others. · View Herald Transcript

JonChesterfield requested review of this revision. · May 4 2023, 1:14 PM

Harbormaster completed remote builds in B230078: Diff 519626. · May 4 2023, 1:14 PM

We might end up with memory fences in lock/unlock but I'm hopeful the ones associated with send/recv will be sufficient.

Unlock is straightforward - fetch_and is doing the same thing as store zero, except making sure that at most one write of zero occurs in the warp.

Lock took a while to derive. The problem is that fetch_or can be run by threads which are not in lane_mask, or by a subset of lane_mask, and the threads outside of lane_mask may be the ones that win the fetch_or to set the bit. It might be clearer to use is_first_lane in lock instead of having all threads try to take the lock.

Both lock and unlock avoid branching on is_first_lane, partly on performance grounds and partly because simpler control flow seems to compile more reliably. This might allow removing the broadcast functions in util.

Fix missing header

Harbormaster completed remote builds in B230092: Diff 519649. · May 4 2023, 2:11 PM

Logic is harder to understand, but is probably more correct.

libc/src/__support/RPC/rpc.h
Line 158: We might need another lane sync here. I'll test when I put the parallelism back in.

This revision is now accepted and ready to land. · May 4 2023, 2:11 PM

update comment

Harbormaster completed remote builds in B230097: Diff 519655. · May 4 2023, 2:26 PM

Closed by commit rGb1323738649e: [libc][rpc] Update locking to work on volta (authored by JonChesterfield). · Explain Why · May 4 2023, 2:31 PM

This revision was automatically updated to reflect the committed changes.

JonChesterfield added a commit: rGb1323738649e: [libc][rpc] Update locking to work on volta.

JonChesterfield added a reverting change: rG8aaaa1c3b987: Revert "[libc][rpc] Update locking to work on volta". · May 4 2023, 2:44 PM

JonChesterfield added a commit: rG75b7b9f292f7: [libc][rpc] Update locking to work on volta. · May 4 2023, 2:58 PM

Revision Contents

Path

Size

libc/

src/

__support/

CPP/

atomic.h

4 lines

GPU/

amdgpu/

utils.h

12 lines

generic/

utils.h

5 lines

nvptx/

utils.h

7 lines

RPC/

rpc.h

47 lines

rpc_util.h

1 line

Diff 519661

libc/src/__support/CPP/atomic.h

Show First 20 Lines • Show All 84 Lines • ▼ Show 20 Lines	public:
T fetch_add(T increment, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {		T fetch_add(T increment, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
return __atomic_fetch_add(&val, increment, int(mem_ord));		return __atomic_fetch_add(&val, increment, int(mem_ord));
}		}

T fetch_or(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {		T fetch_or(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
return __atomic_fetch_or(&val, mask, int(mem_ord));		return __atomic_fetch_or(&val, mask, int(mem_ord));
}		}

		T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
		return __atomic_fetch_and(&val, mask, int(mem_ord));
		}

T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {		T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
return __atomic_fetch_sub(&val, decrement, int(mem_ord));		return __atomic_fetch_sub(&val, decrement, int(mem_ord));
}		}

// Set the value without using an atomic operation. This is useful		// Set the value without using an atomic operation. This is useful
// in initializing atomic values without a constructor.		// in initializing atomic values without a constructor.
void set(T rhs) { val = rhs; }		void set(T rhs) { val = rhs; }
};		};
Show All 17 Lines

libc/src/__support/GPU/amdgpu/utils.h

Show First 20 Lines • Show All 100 Lines • ▼ Show 20 Lines	return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
get_num_threads_x() * get_num_threads_y() * get_thread_id_z();		get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
}		}

/// Returns the size of an AMD wavefront. Either 32 or 64 depending on hardware.		/// Returns the size of an AMD wavefront. Either 32 or 64 depending on hardware.
LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }		LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

/// Returns the id of the thread inside of an AMD wavefront executing together.		/// Returns the id of the thread inside of an AMD wavefront executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {		[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
if (LANE_SIZE == 64)		if constexpr (LANE_SIZE == 64)
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));		return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
else		else
return __builtin_amdgcn_mbcnt_lo(~0u, 0u);		return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
}		}

/// Returns the bit-mask of active threads in the current wavefront.		/// Returns the bit-mask of active threads in the current wavefront.
[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {		[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
return __builtin_amdgcn_read_exec();		return __builtin_amdgcn_read_exec();
}		}

/// Copies the value from the first active thread in the wavefront to the rest.		/// Copies the value from the first active thread in the wavefront to the rest.
[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {		[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
return __builtin_amdgcn_readfirstlane(x);		return __builtin_amdgcn_readfirstlane(x);
}		}

		[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
		// the lane_mask & gives the nvptx semantics when lane_mask is a subset of
		// the active threads
		if constexpr (LANE_SIZE == 64) {
		return lane_mask & __builtin_amdgcn_ballot_w64(x);
		} else {
		return lane_mask & __builtin_amdgcn_ballot_w32(x);
		}
		}

/// Waits for all the threads in the block to converge and issues a fence.		/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() {		[[clang::convergent]] LIBC_INLINE void sync_threads() {
__builtin_amdgcn_s_barrier();		__builtin_amdgcn_s_barrier();
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");		__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
}		}

/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.		/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {		[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {
__builtin_amdgcn_wave_barrier();		__builtin_amdgcn_wave_barrier();
}		}

} // namespace gpu		} // namespace gpu
} // namespace __llvm_libc		} // namespace __llvm_libc

#endif		#endif

libc/src/__support/GPU/generic/utils.h

	Show First 20 Lines • Show All 52 Lines • ▼ Show 20 Lines
	LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }			LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

	LIBC_INLINE uint32_t get_lane_id() { return 0; }			LIBC_INLINE uint32_t get_lane_id() { return 0; }

	LIBC_INLINE uint64_t get_lane_mask() { return 1; }			LIBC_INLINE uint64_t get_lane_mask() { return 1; }

	LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }			LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }

				LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
				(void)lane_mask;
				return x;
				}

	LIBC_INLINE void sync_threads() {}			LIBC_INLINE void sync_threads() {}

	LIBC_INLINE void sync_lane(uint64_t) {}			LIBC_INLINE void sync_lane(uint64_t) {}

	} // namespace gpu			} // namespace gpu
	} // namespace __llvm_libc			} // namespace __llvm_libc

	#endif			#endif

libc/src/__support/GPU/nvptx/utils.h

Show First 20 Lines • Show All 112 Lines • ▼ Show 20 Lines	[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
uint64_t id = __builtin_ffsl(lane_mask) - 1;		uint64_t id = __builtin_ffsl(lane_mask) - 1;
#if __CUDA_ARCH__ >= 600		#if __CUDA_ARCH__ >= 600
return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);		return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
#else		#else
return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);		return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
#endif		#endif
}		}

		[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
		#if __CUDA_ARCH__ >= 600
		return __nvvm_vote_ballot_sync(lane_mask, x);
		#else
		return lane_mask & __nvvm_vote_ballot(x);
		#endif
		}
/// Waits for all the threads in the block to converge and issues a fence.		/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }		[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }

/// Waits for all threads in the warp to reconverge for independent scheduling.		/// Waits for all threads in the warp to reconverge for independent scheduling.
[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {		[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
#if __CUDA_ARCH__ >= 700		#if __CUDA_ARCH__ >= 700
__nvvm_bar_warp_sync(mask);		__nvvm_bar_warp_sync(mask);
#else		#else
(void)mask;		(void)mask;
#endif		#endif
}		}

} // namespace gpu		} // namespace gpu
} // namespace __llvm_libc		} // namespace __llvm_libc

#endif		#endif

libc/src/__support/RPC/rpc.h

Show First 20 Lines • Show All 101 Lines • ▼ Show 20 Lines	template <bool InvertInbox> struct Process {
}		}

/// Determines if this process needs to wait for ownership of the buffer.		/// Determines if this process needs to wait for ownership of the buffer.
LIBC_INLINE static bool buffer_unavailable(uint32_t in, uint32_t out) {		LIBC_INLINE static bool buffer_unavailable(uint32_t in, uint32_t out) {
return in != out;		return in != out;
}		}

/// Attempt to claim the lock at index. Return true on lock taken.		/// Attempt to claim the lock at index. Return true on lock taken.
		/// lane_mask is a bitmap of the threads in the warp that would hold the
		/// single lock on success, e.g. the result of gpu::get_lane_mask()
/// The lock is held when the zeroth bit of the uint32_t at lock[index]		/// The lock is held when the zeroth bit of the uint32_t at lock[index]
/// is set, and available when that bit is clear. Bits [1, 32) are zero.		/// is set, and available when that bit is clear. Bits [1, 32) are zero.
/// Or with one is a no-op when the lock is already held.		/// Or with one is a no-op when the lock is already held.
LIBC_INLINE bool try_lock(uint64_t, uint64_t index) {		[[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
return lock[index].fetch_or(1, cpp::MemoryOrder::RELAXED) == 0;		uint64_t index) {
		// On amdgpu, test and set to lock[index] and a sync_lane would suffice
		// On volta, need to handle differences between the threads running and
		// the threads that were detected in the previous call to get_lane_mask()
		//
		// All threads in lane_mask try to claim the lock. At most one can succeed.
		// There may be threads active which are not in lane mask which must not
		// succeed in taking the lock, as otherwise it will leak. This is handled
		// by making threads which are not in lane_mask fetch_or with 0, a no-op.
		uint32_t id = gpu::get_lane_id();
		bool id_in_lane_mask = lane_mask & (1ul << id);

		// All threads in the warp call fetch_or. Possibly at the same time.
		bool before =
		lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED);
		uint64_t packed = gpu::ballot(lane_mask, before);

		// If every bit set in lane_mask is also set in packed, every single thread
		// in the warp failed to get the lock. Ballot returns unset for threads not
		// in the lane mask.
		//
		// Cases, per thread:
		// mask==0 -> unspecified before, discarded by ballot -> 0
		// mask==1 and before==0 (success), set zero by ballot -> 0
		// mask==1 and before==1 (failure), set one by ballot -> 1
		//
		// mask != packed implies at least one of the threads got the lock
		// atomic semantics of fetch_or mean at most one of the threads got the lock
		return lane_mask != packed;
}		}

// Unlock the lock at index.		// Unlock the lock at index.
LIBC_INLINE void unlock(uint64_t, uint64_t index) {		[[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
lock[index].store(0, cpp::MemoryOrder::RELAXED);		uint64_t index) {
		// Wait for other threads in the warp to finish using the lock
		gpu::sync_lane(lane_mask);

		// Use exactly one thread to clear the bit at position 0 in lock[index]
		// Must restrict to a single thread to avoid one thread dropping the lock,
		// then an unrelated warp claiming the lock, then a second thread in this
		// warp dropping the lock again.
		uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0);
		lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED);
		jhuber6 (inline comment, unsubmitted, not done): We might need another lane sync here. I'll test when I put the parallelism back in.
}		}
};		};

/// The port provides the interface to communicate between the multiple		/// The port provides the interface to communicate between the multiple
/// processes. A port is conceptually an index into the memory provided by the		/// processes. A port is conceptually an index into the memory provided by the
/// underlying process that is guarded by a lock bit.		/// underlying process that is guarded by a lock bit.
template <bool T> struct Port {		template <bool T> struct Port {
// TODO: This should be move-only.		// TODO: This should be move-only.
▲ Show 20 Lines • Show All 221 Lines • Show Last 20 Lines

libc/src/__support/RPC/rpc_util.h

	//===-- Shared memory RPC client / server utilities -------------- C++ --===//			//===-- Shared memory RPC client / server utilities -------------- C++ --===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H			#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H
	#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H			#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H

				#include "src/__support/GPU/utils.h"
	#include "src/__support/macros/attributes.h"			#include "src/__support/macros/attributes.h"
	#include "src/__support/macros/properties/architectures.h"			#include "src/__support/macros/properties/architectures.h"

	namespace __llvm_libc {			namespace __llvm_libc {
	namespace rpc {			namespace rpc {

	/// Suspend the thread briefly to assist the thread scheduler during busy loops.			/// Suspend the thread briefly to assist the thread scheduler during busy loops.
	LIBC_INLINE void sleep_briefly() {			LIBC_INLINE void sleep_briefly() {
	Show All 13 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[libc][rpc] Update locking to work on volta · Closed · Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 519661

libc/src/__support/CPP/atomic.h

libc/src/__support/GPU/amdgpu/utils.h

libc/src/__support/GPU/generic/utils.h

libc/src/__support/GPU/nvptx/utils.h

libc/src/__support/RPC/rpc.h

libc/src/__support/RPC/rpc_util.h

[libc][rpc] Update locking to work on volta
Closed · Public