This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
libc/
-
src/__support/
-
__support/
-
GPU/
-
amdgpu/
-
utils.h
-
nvptx/
-
utils.h
-
RPC/
-
rpc.h
-
test/integration/startup/gpu/
-
integration/
-
startup/
-
gpu/
-
CMakeLists.txt
-
rpc_stream_test.cpp

Differential D150992

[libc] Fix the `send_n` and `recv_n` utilities under divergent lanes
ClosedPublic

Authored by jhuber6 on May 19 2023, 1:06 PM.

Download Raw Diff

Details

Reviewers

jdoerfert
JonChesterfield
sivachandra
michaelrj
lntue

Commits

rG29d3da3b86cc: [libc] Fix the `send_n` and `recv_n` utilities under divergent lanes

Summary

We provide the send_n and recv_n utilities as a generic way to
stream data between both sides of the process. This was previously
tested and performed as expected when using a string of constant size.
However, when the size was allowed to diverge between the threads in the
warp or wavefront this could deadlock. This did not occur on NVPTX
because of the use of the explicit warp sync. However, on AMD one of the
work items in the wavefront could continue executing and hit the next
recv call before the other threads; we would then deadlock because this
violated the RPC invariants.

This patch replaces the for loop with a thread ballot. This will cause
every thread in the warp or wavefront to continue executing the loop
until all of them can exit. This acts as a more explicit wavefront sync.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

jhuber6 created this revision. May 19 2023, 1:06 PM

Herald added projects: Restricted Project, Restricted Project. · View Herald Transcript · May 19 2023, 1:06 PM

Herald added subscribers: libc-commits, mattd, asavonic and 4 others. · View Herald Transcript

jhuber6 requested review of this revision. May 19 2023, 1:06 PM

Accidentally removed the old test.

Harbormaster completed remote builds in B233295: Diff 523932. May 19 2023, 3:57 PM

jhuber6 added a child revision: D151041: [libc] More efficiently send bytes via `send_n` and `recv_n`. May 20 2023, 6:29 PM

Does `process.get_packet(index).header.mask` hoist out of the loop? If not, it might be worth doing that manually. Also interested in whether `index` is in an SGPR at this point.

Those are codegen effectiveness questions though, and the high-level approach of "keep going while any lane thinks there's work to do" seems reasonable.

This revision is now accepted and ready to land. May 23 2023, 8:35 AM

Closed by commit rG29d3da3b86cc: [libc] Fix the `send_n` and `recv_n` utilities under divergent lanes (authored by jhuber6). · Explain Why · May 23 2023, 9:00 AM

This revision was automatically updated to reflect the committed changes.

jhuber6 added a commit: rG29d3da3b86cc: [libc] Fix the `send_n` and `recv_n` utilities under divergent lanes.

Revision Contents

Path

Size

libc/

src/

__support/

GPU/

amdgpu/

utils.h

1 line

nvptx/

utils.h

1 line

RPC/

rpc.h

13 lines

test/

integration/

startup/

gpu/

CMakeLists.txt

3 lines

rpc_stream_test.cpp

54 lines

Diff 524749

libc/src/__support/GPU/amdgpu/utils.h

Show First 20 Lines • Show All 116 Lines • ▼ Show 20 Lines	[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
return __builtin_amdgcn_read_exec();		return __builtin_amdgcn_read_exec();
}		}

/// Copies the value from the first active thread in the wavefront to the rest.		/// Copies the value from the first active thread in the wavefront to the rest.
[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {		[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
return __builtin_amdgcn_readfirstlane(x);		return __builtin_amdgcn_readfirstlane(x);
}		}

		/// Returns a bitmask of threads in the current lane for which \p x is true.
[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {		[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
// the lane_mask & gives the nvptx semantics when lane_mask is a subset of		// the lane_mask & gives the nvptx semantics when lane_mask is a subset of
// the active threads		// the active threads
if constexpr (LANE_SIZE == 64) {		if constexpr (LANE_SIZE == 64) {
return lane_mask & __builtin_amdgcn_ballot_w64(x);		return lane_mask & __builtin_amdgcn_ballot_w64(x);
} else {		} else {
return lane_mask & __builtin_amdgcn_ballot_w32(x);		return lane_mask & __builtin_amdgcn_ballot_w32(x);
}		}
Show All 17 Lines

libc/src/__support/GPU/nvptx/utils.h

Show First 20 Lines • Show All 112 Lines • ▼ Show 20 Lines	[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
uint64_t id = __builtin_ffsl(lane_mask) - 1;		uint64_t id = __builtin_ffsl(lane_mask) - 1;
#if __CUDA_ARCH__ >= 600		#if __CUDA_ARCH__ >= 600
return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);		return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
#else		#else
return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);		return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
#endif		#endif
}		}

		/// Returns a bitmask of threads in the current lane for which \p x is true.
[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {		[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
#if __CUDA_ARCH__ >= 600		#if __CUDA_ARCH__ >= 600
return __nvvm_vote_ballot_sync(lane_mask, x);		return __nvvm_vote_ballot_sync(lane_mask, x);
#else		#else
return lane_mask & __nvvm_vote_ballot(x);		return lane_mask & __nvvm_vote_ballot(x);
#endif		#endif
}		}
/// Waits for all the threads in the block to converge and issues a fence.		/// Waits for all the threads in the block to converge and issues a fence.
Show All 11 Lines

libc/src/__support/RPC/rpc.h

Show First 20 Lines • Show All 417 Lines • ▼ Show 20 Lines
}		}

/// Sends an arbitrarily sized data buffer \p src across the shared channel in		/// Sends an arbitrarily sized data buffer \p src across the shared channel in
/// multiples of the packet length.		/// multiples of the packet length.
template <bool T>		template <bool T>
LIBC_INLINE void Port<T>::send_n(const void const src, uint64_t *size) {		LIBC_INLINE void Port<T>::send_n(const void const src, uint64_t *size) {
// TODO: We could send the first bytes in this call and potentially save an		// TODO: We could send the first bytes in this call and potentially save an
// extra send operation.		// extra send operation.
// TODO: We may need a way for the CPU to send different strings per thread.
uint64_t num_sends = 0;		uint64_t num_sends = 0;
send([&](Buffer *buffer, uint32_t id) {		send([&](Buffer *buffer, uint32_t id) {
reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);		reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
num_sends = is_process_gpu() ? lane_value(size, id)		num_sends = is_process_gpu() ? lane_value(size, id)
: max(lane_value(size, id), num_sends);		: max(lane_value(size, id), num_sends);
});		});
for (uint64_t idx = 0; idx < num_sends; idx += sizeof(Buffer::data)) {		uint64_t idx = 0;
		uint64_t mask = process.get_packet(index).header.mask;
		while (gpu::ballot(mask, idx < num_sends)) {
send([=](Buffer *buffer, uint32_t id) {		send([=](Buffer *buffer, uint32_t id) {
const uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)		const uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
? sizeof(Buffer::data)		? sizeof(Buffer::data)
: lane_value(size, id) - idx;		: lane_value(size, id) - idx;
if (idx < lane_value(size, id))		if (idx < lane_value(size, id))
inline_memcpy(		inline_memcpy(
buffer->data,		buffer->data,
reinterpret_cast<const uint8_t *>(lane_value(src, id)) + idx, len);		reinterpret_cast<const uint8_t *>(lane_value(src, id)) + idx, len);
});		});
		idx += sizeof(Buffer::data);
}		}
gpu::sync_lane(process.get_packet(index).header.mask);
}		}

/// Helper routine to simplify the interface when sending from the GPU using		/// Helper routine to simplify the interface when sending from the GPU using
/// thread private pointers to the underlying value.		/// thread private pointers to the underlying value.
template <bool T>		template <bool T>
LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {		LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
static_assert(is_process_gpu(), "Only valid when running on the GPU");		static_assert(is_process_gpu(), "Only valid when running on the GPU");
const void **src_ptr = &src;		const void **src_ptr = &src;
Show All 10 Lines	LIBC_INLINE void Port<T>::recv_n(void *dst, uint64_t size, A &&alloc) {
uint64_t num_recvs = 0;		uint64_t num_recvs = 0;
recv([&](Buffer *buffer, uint32_t id) {		recv([&](Buffer *buffer, uint32_t id) {
lane_value(size, id) = reinterpret_cast<uint64_t *>(buffer->data)[0];		lane_value(size, id) = reinterpret_cast<uint64_t *>(buffer->data)[0];
lane_value(dst, id) =		lane_value(dst, id) =
reinterpret_cast<uint8_t *>(alloc(lane_value(size, id)));		reinterpret_cast<uint8_t *>(alloc(lane_value(size, id)));
num_recvs = is_process_gpu() ? lane_value(size, id)		num_recvs = is_process_gpu() ? lane_value(size, id)
: max(lane_value(size, id), num_recvs);		: max(lane_value(size, id), num_recvs);
});		});
for (uint64_t idx = 0; idx < num_recvs; idx += sizeof(Buffer::data)) {		uint64_t idx = 0;
		uint64_t mask = process.get_packet(index).header.mask;
		while (gpu::ballot(mask, idx < num_recvs)) {
recv([=](Buffer *buffer, uint32_t id) {		recv([=](Buffer *buffer, uint32_t id) {
uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)		uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
? sizeof(Buffer::data)		? sizeof(Buffer::data)
: lane_value(size, id) - idx;		: lane_value(size, id) - idx;
if (idx < lane_value(size, id))		if (idx < lane_value(size, id))
inline_memcpy(reinterpret_cast<uint8_t *>(lane_value(dst, id)) + idx,		inline_memcpy(reinterpret_cast<uint8_t *>(lane_value(dst, id)) + idx,
buffer->data, len);		buffer->data, len);
});		});
		idx += sizeof(Buffer::data);
}		}
return;
}		}

/// Attempts to open a port to use as the client. The client can only open a		/// Attempts to open a port to use as the client. The client can only open a
/// port if we find an index that is in a valid sending state. That is, there		/// port if we find an index that is in a valid sending state. That is, there
/// are send operations pending that haven't been serviced on this port. Each		/// are send operations pending that haven't been serviced on this port. Each
/// port instance uses an associated \p opcode to tell the server what to do.		/// port instance uses an associated \p opcode to tell the server what to do.
template <uint16_t opcode>		template <uint16_t opcode>
[[clang::convergent]] LIBC_INLINE cpp::optional<Client::Port>		[[clang::convergent]] LIBC_INLINE cpp::optional<Client::Port>
▲ Show 20 Lines • Show All 87 Lines • Show Last 20 Lines

libc/test/integration/startup/gpu/CMakeLists.txt

	Show First 20 Lines • Show All 44 Lines • ▼ Show 20 Lines
	)			)

	add_integration_test(			add_integration_test(
	startup_rpc_stream_test			startup_rpc_stream_test
	SUITE libc-startup-tests			SUITE libc-startup-tests
	SRCS			SRCS
	rpc_stream_test.cpp			rpc_stream_test.cpp
	LOADER_ARGS			LOADER_ARGS
	--threads-x 32			--threads 32
				--blocks 8
	)			)

libc/test/integration/startup/gpu/rpc_stream_test.cpp

	Show All 14 Lines
	#include "test/IntegrationTest/test.h"			#include "test/IntegrationTest/test.h"

	extern "C" void *malloc(uint64_t);			extern "C" void *malloc(uint64_t);
	extern "C" void free(void *);			extern "C" void free(void *);

	using namespace __llvm_libc;			using namespace __llvm_libc;

	static void test_stream() {			static void test_stream() {
	const char str[] = "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"			static const char str[] =
				"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
	"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"			"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
	"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"			"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
	"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"			"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
	"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy";			"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy";
	uint64_t send_size = sizeof(str);			uint64_t send_size = sizeof(str);
	void *send_ptr = malloc(send_size);			void *send_ptr = malloc(send_size);
	void *recv_ptr;			void *recv_ptr;
	uint64_t recv_size;			uint64_t recv_size;

	inline_memcpy(send_ptr, str, send_size);			inline_memcpy(send_ptr, str, send_size);
	ASSERT_TRUE(inline_memcmp(send_ptr, str, send_size) == 0 && "Data mismatch");			ASSERT_TRUE(inline_memcmp(send_ptr, str, send_size) == 0 && "Data mismatch");
	rpc::Client::Port port = rpc::client.open<rpc::TEST_STREAM>();			rpc::Client::Port port = rpc::client.open<rpc::TEST_STREAM>();
	port.send_n(send_ptr, send_size);			port.send_n(send_ptr, send_size);
	port.recv_n(&recv_ptr, &recv_size,			port.recv_n(&recv_ptr, &recv_size,
	[](uint64_t size) { return malloc(size); });			[](uint64_t size) { return malloc(size); });
	port.close();			port.close();
	ASSERT_TRUE(inline_memcmp(recv_ptr, str, recv_size) == 0 && "Data mismatch");			ASSERT_TRUE(inline_memcmp(recv_ptr, str, recv_size) == 0 && "Data mismatch");
	ASSERT_TRUE(recv_size == send_size && "Data size mismatch");			ASSERT_TRUE(recv_size == send_size && "Data size mismatch");

	free(send_ptr);			free(send_ptr);
	free(recv_ptr);			free(recv_ptr);
	}			}

				static void test_divergent() {
				static const uint8_t data[] = {
				0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
				15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
				30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
				45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
				60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
				75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
				90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
				105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
				120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
				135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
				150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
				165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
				180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
				195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
				210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
				225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
				240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
				255,
				};

				uint8_t buffer[128] = {0};
				uint64_t offset =
				(gpu::get_thread_id() + gpu::get_num_threads() * gpu::get_block_id()) %
				128;
				void *recv_ptr;
				uint64_t recv_size;
				inline_memcpy(buffer, &data[offset], offset);
				ASSERT_TRUE(inline_memcmp(buffer, &data[offset], offset) == 0 &&
				"Data mismatch");
				rpc::Client::Port port = rpc::client.open<rpc::TEST_STREAM>();
				port.send_n(buffer, offset);
				inline_memset(buffer, offset, 0);
				port.recv_n(&recv_ptr, &recv_size, [&](uint64_t) { return buffer; });
				port.close();

				ASSERT_TRUE(inline_memcmp(recv_ptr, &data[offset], recv_size) == 0 &&
				"Data mismatch");
				ASSERT_TRUE(recv_size == offset && "Data size mismatch");
				}

	TEST_MAIN(int argc, char argv, char envp) {			TEST_MAIN(int argc, char argv, char envp) {
	test_stream();			test_stream();
				test_divergent();

	return 0;			return 0;
	}			}