Diff 555019

libc/src/__support/GPU/amdgpu/utils.h

Show First 20 Lines • Show All 119 Lines • ▼ Show 20 Lines

}

/// Returns the bit-mask of active threads in the current wavefront.

[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {

return __builtin_amdgcn_read_exec();

}

/// Copies the value from the first active thread in the wavefront to the rest.

[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {

[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,

uint32_t x) {

return __builtin_amdgcn_readfirstlane(x);

jhuber6Unsubmitted

Not Done

/// Copies the value from the first active thread in the wavefront to the rest.

- [[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,

+ [[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,

uint32_t x) {

- (void)lane_mask;

return __builtin_amdgcn_readfirstlane(x);

You can just leave out the argument, same below.

jhuber6: You can just leave out the argument, same below.

}

/// Returns a bitmask of threads in the current wavefront for which \p x is true.

[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {

// Masking with lane_mask gives the nvptx semantics when lane_mask is a subset of

// the active threads.

if constexpr (LANE_SIZE == 64) {

return lane_mask & __builtin_amdgcn_ballot_w64(x);

▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

libc/src/__support/GPU/generic/utils.h

	Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines
	LIBC_INLINE uint64_t get_thread_id() { return 0; }			LIBC_INLINE uint64_t get_thread_id() { return 0; }

	LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }			LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

	LIBC_INLINE uint32_t get_lane_id() { return 0; }			LIBC_INLINE uint32_t get_lane_id() { return 0; }

	LIBC_INLINE uint64_t get_lane_mask() { return 1; }			LIBC_INLINE uint64_t get_lane_mask() { return 1; }

	LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }			LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; }

	LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {			LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; }
	(void)lane_mask;
	return x;
	}

	LIBC_INLINE void sync_threads() {}			LIBC_INLINE void sync_threads() {}

	LIBC_INLINE void sync_lane(uint64_t) {}			LIBC_INLINE void sync_lane(uint64_t) {}

	LIBC_INLINE uint64_t processor_clock() { return 0; }			LIBC_INLINE uint64_t processor_clock() { return 0; }

	LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }			LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }

	} // namespace gpu			} // namespace gpu
	} // namespace __llvm_libc			} // namespace __llvm_libc

	#endif			#endif

libc/src/__support/GPU/nvptx/utils.h

	Show First 20 Lines • Show All 105 Lines • ▼ Show 20 Lines
	/// Returns the bit-mask of active threads in the current warp.			/// Returns the bit-mask of active threads in the current warp.
	[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {			[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
	uint32_t mask;			uint32_t mask;
	LIBC_INLINE_ASM("activemask.b32 %0;" : "=r"(mask));			LIBC_INLINE_ASM("activemask.b32 %0;" : "=r"(mask));
	return mask;			return mask;
	}			}

	/// Copies the value from the first active thread in the warp to the rest.			/// Copies the value from the first active thread in the warp to the rest.
	[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {			[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
	// NOTE: This is not sufficient in all cases on Volta hardware or later. The			uint32_t x) {
	// lane mask returned here is not always the true lane mask used by the			uint32_t mask = static_cast<uint32_t>(lane_mask);
	// intrinsics in cases of incidental or enforced divergence by the user.			uint32_t id = __builtin_ffs(mask) - 1;
	uint32_t lane_mask = static_cast<uint32_t>(get_lane_mask());
	uint32_t id = __builtin_ffs(lane_mask) - 1;
	#if __CUDA_ARCH__ >= 600			#if __CUDA_ARCH__ >= 600
	return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);			return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
	#else			#else
	return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);			return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
	#endif			#endif
	}			}

	/// Returns a bitmask of threads in the current warp for which \p x is true.			/// Returns a bitmask of threads in the current warp for which \p x is true.
	[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {			[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
				uint32_t mask = static_cast<uint32_t>(lane_mask);
				jhuber6Unsubmitted Not Done Reply Inline Actions Unrelated? jhuber6: Unrelated?
	#if __CUDA_ARCH__ >= 600			#if __CUDA_ARCH__ >= 600
	return __nvvm_vote_ballot_sync(static_cast<uint32_t>(lane_mask), x);			return __nvvm_vote_ballot_sync(mask, x);
	#else			#else
	return static_cast<uint32_t>(lane_mask) & __nvvm_vote_ballot(x);			return mask & __nvvm_vote_ballot(x);
	#endif			#endif
	}			}
	/// Waits for all the threads in the block to converge and issues a fence.			/// Waits for all the threads in the block to converge and issues a fence.
	[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }			[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }

	/// Waits for all threads in the warp to reconverge for independent scheduling.			/// Waits for all threads in the warp to reconverge for independent scheduling.
	[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {			[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
	__nvvm_bar_warp_sync(static_cast<uint32_t>(mask));			__nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
	Show All 20 Lines

libc/src/__support/RPC/rpc.h

Show First 20 Lines • Show All 110 Lines • ▼ Show 20 Lines	template <bool Invert, typename Packet> struct Process {
/// Atomic<uint32_t> secondary[port_count];		/// Atomic<uint32_t> secondary[port_count];
/// Packet buffer[port_count];		/// Packet buffer[port_count];
/// };		/// };
LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count) {		LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count) {
return buffer_offset(port_count) + buffer_bytes(port_count);		return buffer_offset(port_count) + buffer_bytes(port_count);
}		}

/// Retrieve the inbox state from memory shared between processes.		/// Retrieve the inbox state from memory shared between processes.
LIBC_INLINE uint32_t load_inbox(uint32_t index) {		LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) {
return inbox[index].load(cpp::MemoryOrder::RELAXED);		return gpu::broadcast_value(lane_mask,
		inbox[index].load(cpp::MemoryOrder::RELAXED));
}		}

/// Retrieve the outbox state from memory shared between processes.		/// Retrieve the outbox state from memory shared between processes.
LIBC_INLINE uint32_t load_outbox(uint32_t index) {		LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) {
return outbox[index].load(cpp::MemoryOrder::RELAXED);		return gpu::broadcast_value(lane_mask,
		outbox[index].load(cpp::MemoryOrder::RELAXED));
}		}

/// Signal to the other process that this one is finished with the buffer.		/// Signal to the other process that this one is finished with the buffer.
/// Equivalent to loading outbox followed by store of the inverted value		/// Equivalent to loading outbox followed by store of the inverted value
/// The outbox is written only by this warp and tracking the value locally is		/// The outbox is written only by this warp and tracking the value locally is
/// cheaper than calling load_outbox to get the value to store.		/// cheaper than calling load_outbox to get the value to store.
LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {		LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
uint32_t inverted_outbox = !current_outbox;		uint32_t inverted_outbox = !current_outbox;
atomic_thread_fence(cpp::MemoryOrder::RELEASE);		atomic_thread_fence(cpp::MemoryOrder::RELEASE);
outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED);		outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED);
return inverted_outbox;		return inverted_outbox;
}		}

// Given the current outbox and inbox values, wait until the inbox changes		// Given the current outbox and inbox values, wait until the inbox changes
// to indicate that this thread owns the buffer element.		// to indicate that this thread owns the buffer element.
LIBC_INLINE void wait_for_ownership(uint32_t index, uint32_t outbox,		LIBC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index,
uint32_t in) {		uint32_t outbox, uint32_t in) {
while (buffer_unavailable(in, outbox)) {		while (buffer_unavailable(in, outbox)) {
sleep_briefly();		sleep_briefly();
in = load_inbox(index);		in = load_inbox(lane_mask, index);
}		}
atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);		atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
}		}

/// Determines if this process needs to wait for ownership of the buffer. We		/// Determines if this process needs to wait for ownership of the buffer. We
/// invert the condition on one of the processes to indicate that if one		/// invert the condition on one of the processes to indicate that if one
/// process owns the buffer then the other does not.		/// process owns the buffer then the other does not.
LIBC_INLINE static bool buffer_unavailable(uint32_t in, uint32_t out) {		LIBC_INLINE static bool buffer_unavailable(uint32_t in, uint32_t out) {
▲ Show 20 Lines • Show All 173 Lines • ▼ Show 20 Lines	LIBC_INLINE void close() {
// The server is passive, if it owns the buffer when it closes we need to		// The server is passive, if it owns the buffer when it closes we need to
// give ownership back to the client.		// give ownership back to the client.
if (owns_buffer && T)		if (owns_buffer && T)
out = process.invert_outbox(index, out);		out = process.invert_outbox(index, out);
process.unlock(lane_mask, index);		process.unlock(lane_mask, index);
}		}

private:		private:
Process<T, S> &process;		Process<T, S> &process;
		JonChesterfieldAuthorUnsubmitted Done Reply Inline Actions These values may be a problem. I think Port instances usually end up on the stack in the hope that SROA will disassemble the pieces. lane_mask and index should be invariant over the lifetime of the Port. I'm not sure what the out variable is - my recollection is that it's a cache of the outbox state, but if that's true it should be a bool. I'd guess out/receive/owns_buffer are mutable and toggle back and forth as the machine executes. Mutable or not, I think these should all be warp-invariant, and whether that invariant is known at compile time or not is probably an emergent feature of the optimiser at the moment. JonChesterfield: These values may be a problem. I think Port instances usually end up on the stack in the hope…
uint64_t lane_mask;		uint64_t lane_mask;
uint32_t index;		uint32_t index;
uint32_t out;		uint32_t out;
bool receive;		bool receive;
bool owns_buffer;		bool owns_buffer;
};		};

/// The RPC client used to make requests to the server.		/// The RPC client used to make requests to the server.
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines
private:		private:
Process<true, Packet<lane_size>> process;		Process<true, Packet<lane_size>> process;
};		};

/// Applies \p fill to the shared buffer and initiates a send operation.		/// Applies \p fill to the shared buffer and initiates a send operation.
template <bool T, typename S>		template <bool T, typename S>
template <typename F>		template <typename F>
LIBC_INLINE void Port<T, S>::send(F fill) {		LIBC_INLINE void Port<T, S>::send(F fill) {
uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index);		uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);

// We need to wait until we own the buffer before sending.		// We need to wait until we own the buffer before sending.
process.wait_for_ownership(index, out, in);		process.wait_for_ownership(lane_mask, index, out, in);

// Apply the \p fill function to initialize the buffer and release the memory.		// Apply the \p fill function to initialize the buffer and release the memory.
invoke_rpc(fill, process.packet[index]);		invoke_rpc(fill, process.packet[index]);
out = process.invert_outbox(index, out);		out = process.invert_outbox(index, out);
owns_buffer = false;		owns_buffer = false;
receive = false;		receive = false;
}		}

/// Applies \p use to the shared buffer and acknowledges the send.		/// Applies \p use to the shared buffer and acknowledges the send.
template <bool T, typename S>		template <bool T, typename S>
template <typename U>		template <typename U>
LIBC_INLINE void Port<T, S>::recv(U use) {		LIBC_INLINE void Port<T, S>::recv(U use) {
// We only exchange ownership of the buffer during a receive if we are waiting		// We only exchange ownership of the buffer during a receive if we are waiting
// for a previous receive to finish.		// for a previous receive to finish.
if (receive) {		if (receive) {
out = process.invert_outbox(index, out);		out = process.invert_outbox(index, out);
owns_buffer = false;		owns_buffer = false;
}		}

uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index);		uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);

// We need to wait until we own the buffer before receiving.		// We need to wait until we own the buffer before receiving.
process.wait_for_ownership(index, out, in);		process.wait_for_ownership(lane_mask, index, out, in);

// Apply the \p use function to read the memory out of the buffer.		// Apply the \p use function to read the memory out of the buffer.
invoke_rpc(use, process.packet[index]);		invoke_rpc(use, process.packet[index]);
receive = true;		receive = true;
owns_buffer = true;		owns_buffer = true;
}		}

/// Combines a send and receive into a single function.		/// Combines a send and receive into a single function.
▲ Show 20 Lines • Show All 98 Lines • ▼ Show 20 Lines	for (uint32_t index = 0;; ++index) {
if (index >= process.port_count)		if (index >= process.port_count)
index = 0;		index = 0;

// Attempt to acquire the lock on this index.		// Attempt to acquire the lock on this index.
uint64_t lane_mask = gpu::get_lane_mask();		uint64_t lane_mask = gpu::get_lane_mask();
if (!process.try_lock(lane_mask, index))		if (!process.try_lock(lane_mask, index))
continue;		continue;

uint32_t in = process.load_inbox(index);		uint32_t in = process.load_inbox(lane_mask, index);
uint32_t out = process.load_outbox(index);		uint32_t out = process.load_outbox(lane_mask, index);

// Once we acquire the index we need to check if we are in a valid sending		// Once we acquire the index we need to check if we are in a valid sending
// state.		// state.
if (process.buffer_unavailable(in, out)) {		if (process.buffer_unavailable(in, out)) {
process.unlock(lane_mask, index);		process.unlock(lane_mask, index);
continue;		continue;
}		}

Show All 9 Lines
/// Attempts to open a port to use as the server. The server can only open a		/// Attempts to open a port to use as the server. The server can only open a
/// port if it has a pending receive operation		/// port if it has a pending receive operation
template <uint32_t lane_size>		template <uint32_t lane_size>
[[clang::convergent]] LIBC_INLINE		[[clang::convergent]] LIBC_INLINE
cpp::optional<typename Server<lane_size>::Port>		cpp::optional<typename Server<lane_size>::Port>
Server<lane_size>::try_open() {		Server<lane_size>::try_open() {
// Perform a naive linear scan for a port that has a pending request.		// Perform a naive linear scan for a port that has a pending request.
for (uint32_t index = 0; index < process.port_count; ++index) {		for (uint32_t index = 0; index < process.port_count; ++index) {
uint32_t in = process.load_inbox(index);		uint64_t lane_mask = gpu::get_lane_mask();
uint32_t out = process.load_outbox(index);		uint32_t in = process.load_inbox(lane_mask, index);
		uint32_t out = process.load_outbox(lane_mask, index);

// The server is passive, if there is no work pending don't bother		// The server is passive, if there is no work pending don't bother
// opening a port.		// opening a port.
if (process.buffer_unavailable(in, out))		if (process.buffer_unavailable(in, out))
continue;		continue;

// Attempt to acquire the lock on this index.		// Attempt to acquire the lock on this index.
uint64_t lane_mask = gpu::get_lane_mask();
if (!process.try_lock(lane_mask, index))		if (!process.try_lock(lane_mask, index))
continue;		continue;

in = process.load_inbox(index);		in = process.load_inbox(lane_mask, index);
out = process.load_outbox(index);		out = process.load_outbox(lane_mask, index);

if (process.buffer_unavailable(in, out)) {		if (process.buffer_unavailable(in, out)) {
process.unlock(lane_mask, index);		process.unlock(lane_mask, index);
continue;		continue;
}		}

return Port(process, lane_mask, index, out);		return Port(process, lane_mask, index, out);
}		}
Show All 16 Lines

libc/test/src/__support/RPC/rpc_smoke_test.cpp

Show First 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	TEST(LlvmLibcRPCSmoke, SanityCheck) {
uint64_t index = 0; // any < port_count		uint64_t index = 0; // any < port_count
uint64_t lane_mask = 1;		uint64_t lane_mask = 1;

// Each process has its own local lock for index		// Each process has its own local lock for index
EXPECT_TRUE(ProcA.try_lock(lane_mask, index));		EXPECT_TRUE(ProcA.try_lock(lane_mask, index));
EXPECT_TRUE(ProcB.try_lock(lane_mask, index));		EXPECT_TRUE(ProcB.try_lock(lane_mask, index));

// All zero to begin with		// All zero to begin with
EXPECT_EQ(ProcA.load_inbox(index), 0u);		EXPECT_EQ(ProcA.load_inbox(lane_mask, index), 0u);
EXPECT_EQ(ProcB.load_inbox(index), 0u);		EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 0u);
EXPECT_EQ(ProcA.load_outbox(index), 0u);		EXPECT_EQ(ProcA.load_outbox(lane_mask, index), 0u);
EXPECT_EQ(ProcB.load_outbox(index), 0u);		EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u);

// Available for ProcA and not for ProcB		// Available for ProcA and not for ProcB
EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(index),		EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index),
ProcA.load_outbox(index)));		ProcA.load_outbox(lane_mask, index)));
EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(index),		EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index),
ProcB.load_outbox(index)));		ProcB.load_outbox(lane_mask, index)));

// ProcA write to outbox		// ProcA write to outbox
uint32_t ProcAOutbox = ProcA.load_outbox(index);		uint32_t ProcAOutbox = ProcA.load_outbox(lane_mask, index);
EXPECT_EQ(ProcAOutbox, 0u);		EXPECT_EQ(ProcAOutbox, 0u);
ProcAOutbox = ProcA.invert_outbox(index, ProcAOutbox);		ProcAOutbox = ProcA.invert_outbox(index, ProcAOutbox);
EXPECT_EQ(ProcAOutbox, 1u);		EXPECT_EQ(ProcAOutbox, 1u);

// No longer available for ProcA		// No longer available for ProcA
EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(index), ProcAOutbox));		EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index),
		ProcAOutbox));

// Outbox is still zero, hasn't been written to		// Outbox is still zero, hasn't been written to
EXPECT_EQ(ProcB.load_outbox(index), 0u);		EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u);

// Wait for ownership will terminate because load_inbox returns 1		// Wait for ownership will terminate because load_inbox returns 1
EXPECT_EQ(ProcB.load_inbox(index), 1u);		EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 1u);
ProcB.wait_for_ownership(index, 0u, 0u);		ProcB.wait_for_ownership(lane_mask, index, 0u, 0u);

// and B now has the buffer available		// and B now has the buffer available
EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(index),		EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index),
ProcB.load_outbox(index)));		ProcB.load_outbox(lane_mask, index)));

// Enough checks for one test, close the locks		// Enough checks for one test, close the locks
ProcA.unlock(lane_mask, index);		ProcA.unlock(lane_mask, index);
ProcB.unlock(lane_mask, index);		ProcB.unlock(lane_mask, index);
}		}

This is an archive of the discontinued LLVM Phabricator instance.

[libc][gpu] Thread divergence fix on volta
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 555019

libc/src/__support/GPU/amdgpu/utils.h

libc/src/__support/GPU/generic/utils.h

libc/src/__support/GPU/nvptx/utils.h

libc/src/__support/RPC/rpc.h

libc/test/src/__support/RPC/rpc_smoke_test.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[libc][gpu] Thread divergence fix on volta
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 555019

libc/src/__support/GPU/amdgpu/utils.h

libc/src/__support/GPU/generic/utils.h

libc/src/__support/GPU/nvptx/utils.h

libc/src/__support/RPC/rpc.h

libc/test/src/__support/RPC/rpc_smoke_test.cpp

[libc][gpu] Thread divergence fix on volta
ClosedPublic