This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
libc/
-
src/__support/RPC/
-
__support/
-
RPC/
-
rpc.h
-
rpc_util.h
-
test/integration/startup/gpu/
-
integration/
-
startup/
-
gpu/
-
CMakeLists.txt
-
rpc_stream_test.cpp
-
utils/gpu/loader/
-
gpu/
-
loader/
-
Server.h

Differential D150379

[libc] Implement a generic streaming interface in the RPC
ClosedPublic

Authored by jhuber6 on May 11 2023, 9:13 AM.

Download Raw Diff

Details

Reviewers

JonChesterfield
sivachandra
lntue
michaelrj
jdoerfert
tianshilei1992

Commits

rGd21e507cfc9f: [libc] Implement a generic streaming interface in the RPC

Summary

Currently we provide the send_n and recv_n functions. These were
somewhat divergent and not tested on the GPU. This patch changes the
support to be more common. We do this by making the CPU provide an array
at least as large as the lane size while the GPU can rely on the
private memory address of its stack variables. This allows us to send
data back and forth generically.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

jhuber6 created this revision.May 11 2023, 9:13 AM

Herald added projects: Restricted Project, Restricted Project. · View Herald TranscriptMay 11 2023, 9:13 AM

Herald added subscribers: libc-commits, ecnelises, tschuett. · View Herald Transcript

jhuber6 requested review of this revision.May 11 2023, 9:13 AM

How does deallocation happen here? I see the lambda using malloc on one path and new on another, but can't find a corresponding free for one of them

In D150379#4335274, @JonChesterfield wrote:

How does deallocation happen here? I see the lambda using malloc on one path and new on another, but can't find a corresponding free for one of them

The server does a delete []. The malloc on the GPU is currently a bump pointer so free is a no-op.

Add free

JonChesterfield accepted this revision.May 11 2023, 9:38 AM

This revision is now accepted and ready to land.May 11 2023, 9:38 AM

Harbormaster completed remote builds in B231367: Diff 521348.May 11 2023, 9:54 AM

Closed by commit rGd21e507cfc9f: [libc] Implement a generic streaming interface in the RPC (authored by jhuber6). · Explain WhyMay 11 2023, 9:55 AM

This revision was automatically updated to reflect the committed changes.

jhuber6 added a commit: rGd21e507cfc9f: [libc] Implement a generic streaming interface in the RPC.

Revision Contents

Path

Size

libc/

src/

__support/

RPC/

rpc.h

93 lines

rpc_util.h

15 lines

test/

integration/

startup/

gpu/

CMakeLists.txt

9 lines

rpc_stream_test.cpp

50 lines

utils/

gpu/

loader/

Server.h

25 lines

Diff 521355

libc/src/__support/RPC/rpc.h

Show All 31 Lines

/// A list of opcodes that we use to invoke certain actions on the server.		/// A list of opcodes that we use to invoke certain actions on the server.
enum Opcode : uint16_t {		enum Opcode : uint16_t {
NOOP = 0,		NOOP = 0,
PRINT_TO_STDERR = 1,		PRINT_TO_STDERR = 1,
EXIT = 2,		EXIT = 2,
TEST_INCREMENT = 3,		TEST_INCREMENT = 3,
TEST_INTERFACE = 4,		TEST_INTERFACE = 4,
		TEST_STREAM = 5,
};		};

/// A fixed size channel used to communicate between the RPC client and server.		/// A fixed size channel used to communicate between the RPC client and server.
struct Buffer {		struct Buffer {
uint64_t data[8];		uint64_t data[8];
};		};
static_assert(sizeof(Buffer) == 64, "Buffer size mismatch");		static_assert(sizeof(Buffer) == 64, "Buffer size mismatch");

▲ Show 20 Lines • Show All 265 Lines • ▼ Show 20 Lines	private:
friend class cpp::optional<Port<T>>;		friend class cpp::optional<Port<T>>;

public:		public:
template <typename U> LIBC_INLINE void recv(U use);		template <typename U> LIBC_INLINE void recv(U use);
template <typename F> LIBC_INLINE void send(F fill);		template <typename F> LIBC_INLINE void send(F fill);
template <typename F, typename U>		template <typename F, typename U>
LIBC_INLINE void send_and_recv(F fill, U use);		LIBC_INLINE void send_and_recv(F fill, U use);
template <typename W> LIBC_INLINE void recv_and_send(W work);		template <typename W> LIBC_INLINE void recv_and_send(W work);
		LIBC_INLINE void send_n(const void *const *src, uint64_t *size);
LIBC_INLINE void send_n(const void *src, uint64_t size);		LIBC_INLINE void send_n(const void *src, uint64_t size);
template <typename A> LIBC_INLINE void recv_n(A alloc);		template <typename A>
		LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);

LIBC_INLINE uint16_t get_opcode() const {		LIBC_INLINE uint16_t get_opcode() const {
return process.get_packet(index).header.opcode;		return process.get_packet(index).header.opcode;
}		}

LIBC_INLINE void close() {		LIBC_INLINE void close() {
// If the server last did a receive it needs to exchange ownership before		// If the server last did a receive it needs to exchange ownership before
// closing the port.		// closing the port.
▲ Show 20 Lines • Show All 88 Lines • ▼ Show 20 Lines
LIBC_INLINE void Port<T>::recv_and_send(W work) {		LIBC_INLINE void Port<T>::recv_and_send(W work) {
recv(work);		recv(work);
send([](Buffer *) { /* no-op */ });		send([](Buffer *) { /* no-op */ });
}		}

/// Sends an arbitrarily sized data buffer \p src across the shared channel in		/// Sends an arbitrarily sized data buffer \p src across the shared channel in
/// multiples of the packet length.		/// multiples of the packet length.
template <bool T>		template <bool T>
LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {		LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
// TODO: We could send the first bytes in this call and potentially save an		// TODO: We could send the first bytes in this call and potentially save an
// extra send operation.		// extra send operation.
// TODO: We may need a way for the CPU to send different strings per thread.		// TODO: We may need a way for the CPU to send different strings per thread.
send([=](Buffer *buffer) {		uint64_t num_sends = 0;
reinterpret_cast<uint64_t *>(buffer->data)[0] = size;		send([&](Buffer *buffer, uint32_t id) {
		reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
		num_sends = is_process_gpu() ? lane_value(size, id)
		: max(lane_value(size, id), num_sends);
});		});
const uint8_t *ptr = reinterpret_cast<const uint8_t *>(src);		for (uint64_t idx = 0; idx < num_sends; idx += sizeof(Buffer::data)) {
for (uint64_t idx = 0; idx < size; idx += sizeof(Buffer::data)) {		send([=](Buffer *buffer, uint32_t id) {
send([=](Buffer *buffer) {		const uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
const uint64_t len =		? sizeof(Buffer::data)
size - idx > sizeof(Buffer::data) ? sizeof(Buffer::data) : size - idx;		: lane_value(size, id) - idx;
inline_memcpy(buffer->data, ptr + idx, len);		if (idx < lane_value(size, id))
		inline_memcpy(
		buffer->data,
		reinterpret_cast<const uint8_t *>(lane_value(src, id)) + idx, len);
});		});
}		}
gpu::sync_lane(process.get_packet(index).header.mask);		gpu::sync_lane(process.get_packet(index).header.mask);
}		}

		/// Helper routine to simplify the interface when sending from the GPU using
		/// thread private pointers to the underlying value.
		template <bool T>
		LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
		static_assert(is_process_gpu(), "Only valid when running on the GPU");
		const void **src_ptr = &src;
		uint64_t *size_ptr = &size;
		send_n(src_ptr, size_ptr);
		}

/// Receives an arbitrarily sized data buffer across the shared channel in		/// Receives an arbitrarily sized data buffer across the shared channel in
/// multiples of the packet length. The \p alloc function is called with the		/// multiples of the packet length. The \p alloc function is called with the
/// size of the data so that we can initialize the size of the \p dst buffer.		/// size of the data so that we can initialize the size of the \p dst buffer.
template <bool T>		template <bool T>
template <typename A>		template <typename A>
LIBC_INLINE void Port<T>::recv_n(A alloc) {		LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
// The GPU handles thread private variables and masking implicitly through its		uint64_t num_recvs = 0;
// execution model. If this is the CPU we need to manually handle the
// possibility that the sent data is of different length.
if constexpr (is_process_gpu()) {
uint64_t size = 0;
recv([&](Buffer *buffer) {
size = reinterpret_cast<uint64_t *>(buffer->data)[0];
});
uint8_t *dst = reinterpret_cast<uint8_t *>(alloc(size), gpu::get_lane_id());
for (uint64_t idx = 0; idx < size; idx += sizeof(Buffer::data)) {
recv([=](Buffer *buffer) {
uint64_t len = size - idx > sizeof(Buffer::data) ? sizeof(Buffer::data)
: size - idx;
inline_memcpy(dst + idx, buffer->data, len);
});
}
return;
} else {
uint64_t size[MAX_LANE_SIZE];
uint8_t *dst[MAX_LANE_SIZE];
uint64_t max = 0;
recv([&](Buffer *buffer, uint32_t id) {		recv([&](Buffer *buffer, uint32_t id) {
size[id] = reinterpret_cast<uint64_t *>(buffer->data)[0];		lane_value(size, id) = reinterpret_cast<uint64_t *>(buffer->data)[0];
dst[id] = reinterpret_cast<uint8_t *>(alloc(size[id], id));		lane_value(dst, id) =
max = size[id] > max ? size[id] : max;		reinterpret_cast<uint8_t *>(alloc(lane_value(size, id)));
		num_recvs = is_process_gpu() ? lane_value(size, id)
		: max(lane_value(size, id), num_recvs);
});		});
for (uint64_t idx = 0; idx < max; idx += sizeof(Buffer::data)) {		for (uint64_t idx = 0; idx < num_recvs; idx += sizeof(Buffer::data)) {
recv([=](Buffer *buffer, uint32_t id) {		recv([=](Buffer *buffer, uint32_t id) {
uint64_t len = size[id] - idx > sizeof(Buffer::data)		uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
? sizeof(Buffer::data)		? sizeof(Buffer::data)
: size[id] - idx;		: lane_value(size, id) - idx;
if (idx < size[id])		if (idx < lane_value(size, id))
inline_memcpy(dst[id] + idx, buffer->data, len);		inline_memcpy(reinterpret_cast<uint8_t *>(lane_value(dst, id)) + idx,
		buffer->data, len);
});		});
}		}
return;		return;
}		}
}

/// Attempts to open a port to use as the client. The client can only open a		/// Attempts to open a port to use as the client. The client can only open a
/// port if we find an index that is in a valid sending state. That is, there		/// port if we find an index that is in a valid sending state. That is, there
/// are send operations pending that haven't been serviced on this port. Each		/// are send operations pending that haven't been serviced on this port. Each
/// port instance uses an associated \p opcode to tell the server what to do.		/// port instance uses an associated \p opcode to tell the server what to do.
template <uint16_t opcode>		template <uint16_t opcode>
[[clang::convergent]] LIBC_INLINE cpp::optional<Client::Port>		[[clang::convergent]] LIBC_INLINE cpp::optional<Client::Port>
Client::try_open() {		Client::try_open() {
▲ Show 20 Lines • Show All 86 Lines • Show Last 20 Lines

libc/src/__support/RPC/rpc_util.h

	Show First 20 Lines • Show All 48 Lines • ▼ Show 20 Lines
	#endif			#endif
	}			}

	/// Return \p val aligned "upwards" according to \p align.			/// Return \p val aligned "upwards" according to \p align.
	template <typename V, typename A> LIBC_INLINE V align_up(V val, A align) {			template <typename V, typename A> LIBC_INLINE V align_up(V val, A align) {
	return ((val + V(align) - 1) / V(align)) * V(align);			return ((val + V(align) - 1) / V(align)) * V(align);
	}			}

				/// Utility to provide a unified interface between the CPU and GPU's memory
				/// model. On the GPU stack variables are always private to a lane so we can
				/// simply use the variable passed in. On the CPU we need to allocate enough
				/// space for the whole lane and index into it.
				template <typename V> LIBC_INLINE V &lane_value(V *val, uint32_t id) {
				if constexpr (is_process_gpu())
				return *val;
				return val[id];
				}

				/// Helper to get the maximum value.
				template <typename T> LIBC_INLINE const T &max(const T &x, const T &y) {
				return x < y ? y : x;
				}

	} // namespace rpc			} // namespace rpc
	} // namespace __llvm_libc			} // namespace __llvm_libc

	#endif			#endif

libc/test/integration/startup/gpu/CMakeLists.txt

	Show All 37 Lines
	)			)

	add_integration_test(			add_integration_test(
	startup_rpc_interface_test			startup_rpc_interface_test
	SUITE libc-startup-tests			SUITE libc-startup-tests
	SRCS			SRCS
	rpc_interface_test.cpp			rpc_interface_test.cpp
	)			)

				add_integration_test(
				startup_rpc_stream_test
				SUITE libc-startup-tests
				SRCS
				rpc_stream_test.cpp
				LOADER_ARGS
				--threads-x 32
				)

libc/test/integration/startup/gpu/rpc_stream_test.cpp

This file was added.

				//===-- Loader test to check the RPC streaming interface with the loader --===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//

				#include "src/__support/GPU/utils.h"
				#include "src/__support/RPC/rpc_client.h"
				#include "src/__support/integer_to_string.h"
				#include "src/string/memory_utils/memcmp_implementations.h"
				#include "src/string/memory_utils/memcpy_implementations.h"
				#include "src/string/string_utils.h"
				#include "test/IntegrationTest/test.h"

				extern "C" void *malloc(uint64_t);

				using namespace __llvm_libc;

				static void test_stream() {
				const char str[] = "ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
				"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
				"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
				"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy"
				"ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxy";
				uint64_t send_size = sizeof(str);
				void *send_ptr = malloc(send_size);
				void *recv_ptr;
				uint64_t recv_size;

				inline_memcpy(send_ptr, str, send_size);
				ASSERT_TRUE(inline_memcmp(send_ptr, str, send_size) == 0 && "Data mismatch");
				rpc::Client::Port port = rpc::client.open<rpc::TEST_STREAM>();
				port.send_n(send_ptr, send_size);
				port.recv_n(&recv_ptr, &recv_size,
				[](uint64_t size) { return malloc(size); });
				port.close();
				ASSERT_TRUE(inline_memcmp(recv_ptr, str, recv_size) == 0 && "Data mismatch");
				ASSERT_TRUE(recv_size == send_size && "Data size mismatch");

				free(send_ptr);
				free(recv_ptr);
				}

				TEST_MAIN(int argc, char **argv, char **envp) {
				test_stream();

				return 0;
				}

libc/utils/gpu/loader/Server.h

Show All 26 Lines	void handle_server() {
// Continue servicing the client until there is no work left and we return.		// Continue servicing the client until there is no work left and we return.
for (;;) {		for (;;) {
auto port = server.try_open();		auto port = server.try_open();
if (!port)		if (!port)
return;		return;

switch (port->get_opcode()) {		switch (port->get_opcode()) {
case rpc::Opcode::PRINT_TO_STDERR: {		case rpc::Opcode::PRINT_TO_STDERR: {
uint64_t str_size[rpc::MAX_LANE_SIZE] = {0};		uint64_t sizes[rpc::MAX_LANE_SIZE] = {0};
char *strs[rpc::MAX_LANE_SIZE] = {nullptr};		void *strs[rpc::MAX_LANE_SIZE] = {nullptr};
port->recv_n([&](uint64_t size, uint32_t id) {		port->recv_n(strs, sizes, [&](uint64_t size) { return new char[size]; });
str_size[id] = size;
strs[id] = new char[size];
return strs[id];
});
for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {		for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {
if (strs[i]) {		if (strs[i]) {
fwrite(strs[i], str_size[i], 1, stderr);		fwrite(strs[i], sizes[i], 1, stderr);
delete[] strs[i];		delete[] reinterpret_cast<uint8_t *>(strs[i]);
}		}
}		}
break;		break;
}		}
case rpc::Opcode::EXIT: {		case rpc::Opcode::EXIT: {
port->recv([](rpc::Buffer *buffer) {		port->recv([](rpc::Buffer *buffer) {
exit(reinterpret_cast<uint32_t *>(buffer->data)[0]);		exit(reinterpret_cast<uint32_t *>(buffer->data)[0]);
});		});
Show All 19 Lines	case rpc::Opcode::TEST_INTERFACE: {
port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });		port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
if (end_with_recv)		if (end_with_recv)
port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });		port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
else		else
port->send(		port->send(
[&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });		[&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
break;		break;
}		}
		case rpc::Opcode::TEST_STREAM: {
		uint64_t sizes[rpc::MAX_LANE_SIZE] = {0};
		void *dst[rpc::MAX_LANE_SIZE] = {nullptr};
		port->recv_n(dst, sizes, [](uint64_t size) { return new char[size]; });
		port->send_n(dst, sizes);
		for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {
		if (dst[i])
		delete[] reinterpret_cast<uint8_t *>(dst[i]);
		}
		break;
		}
default:		default:
port->recv([](rpc::Buffer *buffer) {});		port->recv([](rpc::Buffer *buffer) {});
}		}
port->close();		port->close();
}		}
}		}

#endif		#endif