Diff 531782

libc/src/__support/RPC/rpc.h

Show First 20 Lines • Show All 120 Lines • ▼ Show 20 Lines	LIBC_INLINE void reset(uint64_t port_count, uint32_t lane_size,
this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(		this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
advance(buffer, inbox_offset(port_count)));		advance(buffer, inbox_offset(port_count)));
this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(		this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
advance(buffer, outbox_offset(port_count)));		advance(buffer, outbox_offset(port_count)));
this->packet =		this->packet =
reinterpret_cast<Packet *>(advance(buffer, buffer_offset(port_count)));		reinterpret_cast<Packet *>(advance(buffer, buffer_offset(port_count)));
}		}

		/// Returns the beginning of the unified buffer: the outbox pointer when
		/// `Invert` is set and the inbox pointer otherwise. Intended for initializing
		/// the client after the server has been started.
		LIBC_INLINE void *get_buffer_start() const {
		  if (Invert)
		    return outbox;
		  return inbox;
		}

/// Allocate a memory buffer sufficient to store the following equivalent		/// Allocate a memory buffer sufficient to store the following equivalent
/// representation in memory.		/// representation in memory.
///		///
/// struct Equivalent {		/// struct Equivalent {
/// Atomic<uint32_t> primary[port_count];		/// Atomic<uint32_t> primary[port_count];
/// Atomic<uint32_t> secondary[port_count];		/// Atomic<uint32_t> secondary[port_count];
/// Packet buffer[port_count];		/// Packet buffer[port_count];
/// };		/// };
▲ Show 20 Lines • Show All 452 Lines • Show Last 20 Lines

libc/utils/gpu/CMakeLists.txt

				add_subdirectory(server)
	add_subdirectory(loader)			add_subdirectory(loader)

libc/utils/gpu/loader/Loader.h

	//===-- Generic device loader interface -----------------------------------===//			//===-- Generic device loader interface -----------------------------------===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H			#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
	#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H			#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H

				#include "utils/gpu/server/Server.h"
				#include <cstddef>
	#include <cstdint>			#include <cstdint>
				#include <cstdio>
				#include <cstdlib>
	#include <cstring>			#include <cstring>
	#include <stddef.h>

	/// Generic launch parameters for configuring the number of blocks / threads.			/// Generic launch parameters for configuring the number of blocks / threads.
	struct LaunchParameters {			struct LaunchParameters {
	uint32_t num_threads_x;			uint32_t num_threads_x;
	uint32_t num_threads_y;			uint32_t num_threads_y;
	uint32_t num_threads_z;			uint32_t num_threads_z;
	uint32_t num_blocks_x;			uint32_t num_blocks_x;
	uint32_t num_blocks_y;			uint32_t num_blocks_y;
	▲ Show 20 Lines • Show All 64 Lines • ▼ Show 20 Lines
	void copy_environment(char *envp, Allocator alloc) {			void copy_environment(char *envp, Allocator alloc) {
	int envc = 0;			int envc = 0;
	for (char *env = envp; env != 0; ++env)			for (char *env = envp; env != 0; ++env)
	++envc;			++envc;

	return copy_argument_vector(envc, envp, alloc);			return copy_argument_vector(envc, envp, alloc);
	};			};

				/// Prints \p msg followed by a newline to stderr, then terminates the process
				/// with EXIT_FAILURE.
				inline void handle_error(const char *msg) {
				  std::fprintf(stderr, "%s\n", msg);
				  std::exit(EXIT_FAILURE);
				}

				/// Reports a failure status from the RPC server and terminates the process.
				/// The status value itself is not inspected; callers only invoke this when
				/// the status is non-zero.
				inline void handle_error(rpc_status_t) {
				  // No trailing newline here: the string overload already appends one, so
				  // including it in the message would print an extra blank line.
				  handle_error("Failure in the RPC server");
				}

	#endif			#endif

libc/utils/gpu/loader/Server.h

This file was deleted.

	//===-- Generic RPC server interface --------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIBC_UTILS_GPU_LOADER_RPC_H
	#define LLVM_LIBC_UTILS_GPU_LOADER_RPC_H

	#include <cstdint>
	#include <cstdio>
	#include <cstdlib>
	#include <cstring>
	#include <stddef.h>

	#include "src/__support/RPC/rpc.h"

	static __llvm_libc::rpc::Server server;

	/// Queries the RPC client at least once and performs server-side work if there
	/// are any active requests. Ports are opened and serviced one at a time until
	/// `server.try_open()` yields no port, at which point the function returns.
	///
	/// \param allocator   Callable invoked with the requested size for MALLOC; its
	///                    result is written back into the reply buffer.
	/// \param deallocator Callable invoked with the pointer received for FREE.
	template <typename Alloc, typename Dealloc>
	void handle_server(Alloc allocator, Dealloc deallocator) {
	  using namespace __llvm_libc;

	  // Continue servicing the client until there is no work left and we return.
	  for (;;) {
	    auto port = server.try_open();
	    if (!port)
	      return;

	    switch (port->get_opcode()) {
	    case rpc::Opcode::WRITE_TO_STREAM:
	    case rpc::Opcode::WRITE_TO_STDERR:
	    case rpc::Opcode::WRITE_TO_STDOUT: {
	      uint64_t sizes[rpc::MAX_LANE_SIZE] = {0};
	      void *strs[rpc::MAX_LANE_SIZE] = {nullptr};
	      FILE *files[rpc::MAX_LANE_SIZE] = {nullptr};
	      // Only the generic stream opcode transmits the destination FILE handle.
	      if (port->get_opcode() == rpc::Opcode::WRITE_TO_STREAM)
	        port->recv([&](rpc::Buffer *buffer, uint32_t id) {
	          files[id] = reinterpret_cast<FILE *>(buffer->data[0]);
	        });
	      port->recv_n(strs, sizes, [&](uint64_t size) { return new char[size]; });
	      port->send([&](rpc::Buffer *buffer, uint32_t id) {
	        FILE *file = port->get_opcode() == rpc::Opcode::WRITE_TO_STDOUT
	                         ? stdout
	                         : (port->get_opcode() == rpc::Opcode::WRITE_TO_STDERR
	                                ? stderr
	                                : files[id]);
	        // NOTE(review): fwrite returns the number of items written (0 or 1
	        // here) and is never negative, so the failure branch below cannot be
	        // taken. Confirm what the client expects on a short write before
	        // changing the reply value.
	        int ret = fwrite(strs[id], sizes[id], 1, file);
	        reinterpret_cast<int *>(buffer->data)[0] = ret >= 0 ? sizes[id] : ret;
	      });
	      // The buffers were allocated with `new char[]`, so they must be released
	      // through a `char *`; deleting through a different element type is
	      // undefined behavior.
	      for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {
	        if (strs[i])
	          delete[] static_cast<char *>(strs[i]);
	      }
	      break;
	    }
	    case rpc::Opcode::EXIT: {
	      port->recv([](rpc::Buffer *buffer) {
	        exit(reinterpret_cast<uint32_t *>(buffer->data)[0]);
	      });
	      break;
	    }
	    case rpc::Opcode::MALLOC: {
	      port->recv_and_send([&](rpc::Buffer *buffer) {
	        buffer->data[0] =
	            reinterpret_cast<uintptr_t>(allocator(buffer->data[0]));
	      });
	      break;
	    }
	    case rpc::Opcode::FREE: {
	      port->recv([&](rpc::Buffer *buffer) {
	        deallocator(reinterpret_cast<void *>(buffer->data[0]));
	      });
	      break;
	    }
	    case rpc::Opcode::TEST_INCREMENT: {
	      port->recv_and_send([](rpc::Buffer *buffer) {
	        reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
	      });
	      break;
	    }
	    case rpc::Opcode::TEST_INTERFACE: {
	      // The order of the send/recv calls below is kept exactly as written; it
	      // presumably mirrors the sequence issued by the client-side test.
	      uint64_t cnt = 0;
	      // Initialized so the branch at the end never reads an indeterminate
	      // value, even if the first recv does not run its callback.
	      bool end_with_recv = false;
	      port->recv([&](rpc::Buffer *buffer) { end_with_recv = buffer->data[0]; });
	      port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
	      port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
	      port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
	      port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
	      port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
	      port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
	      port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
	      port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
	      if (end_with_recv)
	        port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
	      else
	        port->send(
	            [&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
	      break;
	    }
	    case rpc::Opcode::TEST_STREAM: {
	      // Receive each lane's data and send the same buffers straight back.
	      uint64_t sizes[rpc::MAX_LANE_SIZE] = {0};
	      void *dst[rpc::MAX_LANE_SIZE] = {nullptr};
	      port->recv_n(dst, sizes, [](uint64_t size) { return new char[size]; });
	      port->send_n(dst, sizes);
	      // Release through `char *` to match the `new char[]` allocation above.
	      for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {
	        if (dst[i])
	          delete[] static_cast<char *>(dst[i]);
	      }
	      break;
	    }
	    default:
	      // Unrecognized opcode: receive the packet and ignore its contents.
	      port->recv([](rpc::Buffer *buffer) {});
	    }
	    port->close();
	  }
	}

	#endif

libc/utils/gpu/loader/amdgpu/CMakeLists.txt

	add_executable(amdhsa_loader Loader.cpp)			add_executable(amdhsa_loader Loader.cpp)
	add_dependencies(amdhsa_loader libc.src.__support.RPC.rpc)			add_dependencies(amdhsa_loader libc.src.__support.RPC.rpc)

	target_link_libraries(amdhsa_loader			target_link_libraries(amdhsa_loader
	PRIVATE			PRIVATE
	hsa-runtime64::hsa-runtime64			hsa-runtime64::hsa-runtime64
	gpu_loader			gpu_loader
				rpc_server
	)			)

libc/utils/gpu/loader/amdgpu/Loader.cpp

//===-- Loader Implementation for AMDHSA devices --------------------------===//		//===-- Loader Implementation for AMDHSA devices --------------------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements a simple loader to run images supporting the AMDHSA		// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel which should be provided		// architecture. The file launches the '_start' kernel which should be provided
// by the device application start code and ultimately call the 'main'		// by the device application start code and ultimately call the 'main'
// function.		// function.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "Loader.h"		#include "Loader.h"
#include "Server.h"

#include <hsa/hsa.h>		#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>		#include <hsa/hsa_ext_amd.h>

#include <cstdio>		#include <cstdio>
#include <cstdlib>		#include <cstdlib>
#include <cstring>		#include <cstring>
		#include <tuple>
#include <utility>		#include <utility>

/// Print the error code and exit if \p code indicates an error.		/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {		static void handle_error(hsa_status_t code) {
if (code == HSA_STATUS_SUCCESS \|\| code == HSA_STATUS_INFO_BREAK)		if (code == HSA_STATUS_SUCCESS \|\| code == HSA_STATUS_INFO_BREAK)
return;		return;

const char *desc;		const char *desc;
if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)		if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
desc = "Unknown error";		desc = "Unknown error";
fprintf(stderr, "%s\n", desc);		fprintf(stderr, "%s\n", desc);
exit(EXIT_FAILURE);		exit(EXIT_FAILURE);
}		}

/// Print \p msg on its own line to stderr and exit with EXIT_FAILURE.
static void handle_error(const char *msg) {
  std::fprintf(stderr, "%s\n", msg);
  std::exit(EXIT_FAILURE);
}

/// Generic interface for iterating using the HSA callbacks.		/// Generic interface for iterating using the HSA callbacks.
		JonChesterfieldUnsubmitted Done Reply Inline Actions Call into the other one, static void handle_error(rpc_status_t) { handle_error("Failure in the RPC server"); } JonChesterfield: Call into the other one, ``` static void handle_error(rpc_status_t) { handle_error("Failure…
template <typename elem_ty, typename func_ty, typename callback_ty>		template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {		hsa_status_t iterate(func_ty func, callback_ty cb) {
auto l = [](elem_ty elem, void *data) -> hsa_status_t {		auto l = [](elem_ty elem, void *data) -> hsa_status_t {
callback_ty unwrapped = static_cast<callback_ty >(data);		callback_ty unwrapped = static_cast<callback_ty >(data);
return (*unwrapped)(elem);		return (*unwrapped)(elem);
};		};
return func(l, static_cast<void *>(&cb));		return func(l, static_cast<void *>(&cb));
}		}
▲ Show 20 Lines • Show All 85 Lines • ▼ Show 20 Lines	hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
hsa_queue_t *queue, const LaunchParameters &params,		hsa_queue_t *queue, const LaunchParameters &params,
const char *kernel_name, args_t kernel_args) {		const char *kernel_name, args_t kernel_args) {
// Look up the '_start' kernel in the loaded executable.		// Look up the '_start' kernel in the loaded executable.
hsa_executable_symbol_t symbol;		hsa_executable_symbol_t symbol;
if (hsa_status_t err = hsa_executable_get_symbol_by_name(		if (hsa_status_t err = hsa_executable_get_symbol_by_name(
executable, kernel_name, &dev_agent, &symbol))		executable, kernel_name, &dev_agent, &symbol))
return err;		return err;

auto allocator = [&](uint64_t size) -> void * {		// Register RPC callbacks for the malloc and free functions on HSA.
		uint32_t device_id = 0;
		auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
		rpc_register_callback(
		device_id, RPC_MALLOC,
		[](rpc_port_t port, void *data) {
		auto malloc_handler = [](rpc_buffer_t buffer, void data) -> void {
		auto &[dev_agent, pool] = static_cast<decltype(tuple) >(data);
		uint64_t size = buffer->data[0];
void *dev_ptr = nullptr;		void *dev_ptr = nullptr;
if (hsa_status_t err =		if (hsa_status_t err =
hsa_amd_memory_pool_allocate(coarsegrained_pool, size,		hsa_amd_memory_pool_allocate(pool, size,
/flags=/0, &dev_ptr))		/flags=/0, &dev_ptr))
handle_error(err);		handle_error(err);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);		hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
return dev_ptr;		buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
};		};
		rpc_recv_and_send(port, malloc_handler, data);
auto deallocator = [](void *ptr) -> void {		},
if (hsa_status_t err = hsa_amd_memory_pool_free(ptr))		&tuple);
		rpc_register_callback(
		device_id, RPC_FREE,
		[](rpc_port_t port, void *data) {
		auto free_handler = [](rpc_buffer_t buffer, void ) {
		if (hsa_status_t err = hsa_amd_memory_pool_free(
		reinterpret_cast<void *>(buffer->data[0])))
handle_error(err);		handle_error(err);
};		};
		rpc_recv_and_send(port, free_handler, data);
		},
		nullptr);

// Retrieve different properties of the kernel symbol used for launch.		// Retrieve different properties of the kernel symbol used for launch.
uint64_t kernel;		uint64_t kernel;
uint32_t args_size;		uint32_t args_size;
uint32_t group_size;		uint32_t group_size;
uint32_t private_size;		uint32_t private_size;

std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {		std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
__atomic_store_n(&packet->header, header \| (setup << 16), __ATOMIC_RELEASE);		__atomic_store_n(&packet->header, header \| (setup << 16), __ATOMIC_RELEASE);
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);		hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

// Wait until the kernel has completed execution on the device. Periodically		// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.		// check the RPC client for work to be performed on the server.
while (hsa_signal_wait_scacquire(		while (hsa_signal_wait_scacquire(
packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,		packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
/timeout_hint=/1024, HSA_WAIT_STATE_ACTIVE) != 0)		/timeout_hint=/1024, HSA_WAIT_STATE_ACTIVE) != 0)
handle_server(allocator, deallocator);		if (rpc_status_t err = rpc_handle_server(device_id))
		handle_error(err);

// Handle the server one more time in case the kernel exited with a pending		// Handle the server one more time in case the kernel exited with a pending
// send still in flight.		// send still in flight.
handle_server(allocator, deallocator);		if (rpc_status_t err = rpc_handle_server(device_id))
		handle_error(err);

// Destroy the resources acquired to launch the kernel and return.		// Destroy the resources acquired to launch the kernel and return.
if (hsa_status_t err = hsa_amd_memory_pool_free(args))		if (hsa_status_t err = hsa_amd_memory_pool_free(args))
handle_error(err);		handle_error(err);
if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))		if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
handle_error(err);		handle_error(err);

return HSA_STATUS_SUCCESS;		return HSA_STATUS_SUCCESS;
Show All 10 Lines	if (hsa_status_t err = hsa_amd_register_system_event_handler(
[](const hsa_amd_event_t event, void ) -> hsa_status_t {		[](const hsa_amd_event_t event, void ) -> hsa_status_t {
if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)		if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
return HSA_STATUS_ERROR;		return HSA_STATUS_ERROR;
return HSA_STATUS_SUCCESS;		return HSA_STATUS_SUCCESS;
},		},
nullptr))		nullptr))
handle_error(err);		handle_error(err);

// Obtain an agent for the device and host to use the HSA memory model.		// Obtain a single agent for the device and host to use the HSA memory model.
		uint32_t num_devices = 1;
		uint32_t device_id = 0;
hsa_agent_t dev_agent;		hsa_agent_t dev_agent;
hsa_agent_t host_agent;		hsa_agent_t host_agent;
if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))		if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
handle_error(err);		handle_error(err);
if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))		if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
handle_error(err);		handle_error(err);

// Load the code object's ISA information and executable data segments.		// Load the code object's ISA information and executable data segments.
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines	int load(int argc, char argv, char envp, void *image, size_t size,
void *dev_ret;		void *dev_ret;
if (hsa_status_t err =		if (hsa_status_t err =
hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),		hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
/flags=/0, &dev_ret))		/flags=/0, &dev_ret))
handle_error(err);		handle_error(err);
hsa_amd_memory_fill(dev_ret, 0, sizeof(int));		hsa_amd_memory_fill(dev_ret, 0, sizeof(int));

// Allocate finegrained memory for the RPC server and client to share.		// Allocate finegrained memory for the RPC server and client to share.
uint64_t port_size = __llvm_libc::rpc::DEFAULT_PORT_COUNT;
uint32_t wavefront_size = 0;		uint32_t wavefront_size = 0;
if (hsa_status_t err = hsa_agent_get_info(		if (hsa_status_t err = hsa_agent_get_info(
dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))		dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
handle_error(err);		handle_error(err);

uint64_t rpc_shared_buffer_size =		// Set up the RPC server.
__llvm_libc::rpc::Server::allocation_size(port_size, wavefront_size);		if (rpc_status_t err = rpc_init(num_devices))
void *rpc_shared_buffer;		handle_error(err);
if (hsa_status_t err =		auto tuple = std::make_tuple(dev_agent, finegrained_pool);
hsa_amd_memory_pool_allocate(finegrained_pool, rpc_shared_buffer_size,		auto rpc_alloc = [](uint64_t size, void *data) {
/flags=/0, &rpc_shared_buffer))		auto &[dev_agent, finegrained_pool] = static_cast<decltype(tuple) >(data);
		void *dev_ptr = nullptr;
		if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
		/flags=/0, &dev_ptr))
		handle_error(err);
		hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
		return dev_ptr;
		};
		if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT,
		wavefront_size, rpc_alloc, &tuple))
handle_error(err);		handle_error(err);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_shared_buffer);

// Initialize the RPC server's buffer for host-device communication.
server.reset(port_size, wavefront_size, rpc_shared_buffer);

// Obtain a queue with the minimum (power of two) size, used to send commands		// Obtain a queue with the minimum (power of two) size, used to send commands
// to the HSA runtime and launch execution on the device.		// to the HSA runtime and launch execution on the device.
uint64_t queue_size;		uint64_t queue_size;
if (hsa_status_t err = hsa_agent_get_info(		if (hsa_status_t err = hsa_agent_get_info(
dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))		dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
handle_error(err);		handle_error(err);
hsa_queue_t *queue = nullptr;		hsa_queue_t *queue = nullptr;
if (hsa_status_t err =		if (hsa_status_t err =
hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,		hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
nullptr, UINT32_MAX, UINT32_MAX, &queue))		nullptr, UINT32_MAX, UINT32_MAX, &queue))
handle_error(err);		handle_error(err);

LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};		LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};		begin_args_t init_args = {argc, dev_argv, dev_envp,
		rpc_get_buffer(device_id)};
if (hsa_status_t err = launch_kernel(		if (hsa_status_t err = launch_kernel(
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,		dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
single_threaded_params, "_begin.kd", init_args))		single_threaded_params, "_begin.kd", init_args))
handle_error(err);		handle_error(err);

start_args_t args = {argc, dev_argv, dev_envp, dev_ret};		start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
if (hsa_status_t err =		if (hsa_status_t err =
launch_kernel(dev_agent, executable, kernargs_pool,		launch_kernel(dev_agent, executable, kernargs_pool,
Show All 26 Lines	int load(int argc, char argv, char envp, void *image, size_t size,
int ret = static_cast<int >(host_ret);		int ret = static_cast<int >(host_ret);

end_args_t fini_args = {ret};		end_args_t fini_args = {ret};
if (hsa_status_t err = launch_kernel(		if (hsa_status_t err = launch_kernel(
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,		dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
single_threaded_params, "_end.kd", fini_args))		single_threaded_params, "_end.kd", fini_args))
handle_error(err);		handle_error(err);

		if (rpc_status_t err = rpc_server_shutdown(
		device_id, [](void ptr, void ) { hsa_amd_memory_pool_free(ptr); },
		nullptr))
		handle_error(err);

// Free the memory allocated for the device.		// Free the memory allocated for the device.
if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))		if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
handle_error(err);		handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))		if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
handle_error(err);		handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_shared_buffer))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))		if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
handle_error(err);		handle_error(err);

if (hsa_status_t err = hsa_signal_destroy(memory_signal))		if (hsa_status_t err = hsa_signal_destroy(memory_signal))
handle_error(err);		handle_error(err);
if (hsa_status_t err = hsa_queue_destroy(queue))		if (hsa_status_t err = hsa_queue_destroy(queue))
handle_error(err);		handle_error(err);

if (hsa_status_t err = hsa_executable_destroy(executable))		if (hsa_status_t err = hsa_executable_destroy(executable))
handle_error(err);		handle_error(err);

if (hsa_status_t err = hsa_code_object_destroy(object))		if (hsa_status_t err = hsa_code_object_destroy(object))
handle_error(err);		handle_error(err);

		if (rpc_status_t err = rpc_shutdown())
		handle_error(err);
if (hsa_status_t err = hsa_shut_down())		if (hsa_status_t err = hsa_shut_down())
handle_error(err);		handle_error(err);

return ret;		return ret;
}		}

libc/utils/gpu/loader/nvptx/CMakeLists.txt

	add_executable(nvptx_loader Loader.cpp)			add_executable(nvptx_loader Loader.cpp)
	add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)			add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)

	if(NOT LLVM_ENABLE_RTTI)			if(NOT LLVM_ENABLE_RTTI)
	target_compile_options(nvptx_loader PRIVATE -fno-rtti)			target_compile_options(nvptx_loader PRIVATE -fno-rtti)
	endif()			endif()
	target_include_directories(nvptx_loader PRIVATE ${LLVM_INCLUDE_DIRS})			target_include_directories(nvptx_loader PRIVATE ${LLVM_INCLUDE_DIRS})
	target_link_libraries(nvptx_loader			target_link_libraries(nvptx_loader
	PRIVATE			PRIVATE
	gpu_loader			gpu_loader
				rpc_server
	CUDA::cuda_driver			CUDA::cuda_driver
	LLVMObject			LLVMObject
	LLVMSupport			LLVMSupport
	)			)

libc/utils/gpu/loader/nvptx/Loader.cpp

//===-- Loader Implementation for NVPTX devices --------------------------===//		//===-- Loader Implementation for NVPTX devices --------------------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements a simple loader to run images supporting the NVPTX		// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel which should be provided		// architecture. The file launches the '_start' kernel which should be provided
// by the device application start code and ultimately call the 'main'		// by the device application start code and ultimately call the 'main'
// function.		// function.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "Loader.h"		#include "Loader.h"
#include "Server.h"

#include "cuda.h"		#include "cuda.h"

#include "llvm/Object/ELF.h"		#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"		#include "llvm/Object/ELFObjectFile.h"

#include <cstddef>		#include <cstddef>
#include <cstdio>		#include <cstdio>
Show All 11 Lines	static void handle_error(CUresult err) {
const char *err_str = nullptr;		const char *err_str = nullptr;
CUresult result = cuGetErrorString(err, &err_str);		CUresult result = cuGetErrorString(err, &err_str);
if (result != CUDA_SUCCESS)		if (result != CUDA_SUCCESS)
fprintf(stderr, "Unknown Error\n");		fprintf(stderr, "Unknown Error\n");
else		else
fprintf(stderr, "%s\n", err_str);		fprintf(stderr, "%s\n", err_str);
exit(1);		exit(1);
}		}

/// Print \p msg on its own line to stderr and exit with EXIT_FAILURE.
static void handle_error(const char *msg) {
  std::fprintf(stderr, "%s\n", msg);
  std::exit(EXIT_FAILURE);
}

// Gets the names of all the globals that contain functions to initialize or		// Gets the names of all the globals that contain functions to initialize or
		JonChesterfieldUnsubmitted Done Reply Inline Actions i still really dislike the copy/paste going on here JonChesterfield: i still really dislike the copy/paste going on here
// deinitialize. We need to do this manually because the NVPTX toolchain does		// deinitialize. We need to do this manually because the NVPTX toolchain does
// not contain the necessary binary manipulation tools.		// not contain the necessary binary manipulation tools.
template <typename Alloc>		template <typename Alloc>
Expected<void > get_ctor_dtor_array(const void image, const size_t size,		Expected<void > get_ctor_dtor_array(const void image, const size_t size,
Alloc allocator, CUmodule binary) {		Alloc allocator, CUmodule binary) {
auto mem_buffer = MemoryBuffer::getMemBuffer(		auto mem_buffer = MemoryBuffer::getMemBuffer(
StringRef(reinterpret_cast<const char *>(image), size), "image",		StringRef(reinterpret_cast<const char *>(image), size), "image",
/RequiresNullTerminator=/false);		/RequiresNullTerminator=/false);
▲ Show 20 Lines • Show All 116 Lines • ▼ Show 20 Lines	CUresult launch_kernel(CUmodule binary, CUstream stream,

// Initialize a non-blocking CUDA stream to allocate memory if needed. This		// Initialize a non-blocking CUDA stream to allocate memory if needed. This
// needs to be done on a separate stream or else it will deadlock with the		// needs to be done on a separate stream or else it will deadlock with the
// executing kernel.		// executing kernel.
CUstream memory_stream;		CUstream memory_stream;
if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))		if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
handle_error(err);		handle_error(err);

auto allocator = [&](uint64_t size) -> void * {		// Register RPC callbacks for the malloc and free functions on CUDA.
		uint32_t device_id = 0;
		rpc_register_callback(
		device_id, RPC_MALLOC,
		[](rpc_port_t port, void *data) {
		auto malloc_handler = [](rpc_buffer_t buffer, void data) -> void {
		CUstream memory_stream = static_cast<CUstream >(data);
		uint64_t size = buffer->data[0];
CUdeviceptr dev_ptr;		CUdeviceptr dev_ptr;
if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))		if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
handle_error(err);		handle_error(err);

// Wait until the memory allocation is complete.		// Wait until the memory allocation is complete.
while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)		while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
;		;
return reinterpret_cast<void *>(dev_ptr);
};		};
auto deallocator = [&](void *ptr) -> void {		rpc_recv_and_send(port, malloc_handler, data);
if (CUresult err =		},
cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), memory_stream))		&memory_stream);
		JonChesterfieldUnsubmitted Done Reply Inline Actions Does this work? It looks like the same stream running the kernel is being used to provide malloc/free, and I'd expect that to deadlock JonChesterfield: Does this work? It looks like the same stream running the kernel is being used to provide…
		jhuber6AuthorUnsubmitted Done Reply Inline Actions There's a test for this that's been running on https://lab.llvm.org/buildbot/#/builders/46 for a few weeks now and it hasn't deadlocked as far as I can tell. It's a completely separate stream called `memory_stream` that's just created here. The one running the kernel is just called `stream`. This requires CUDA 11.2 IIRC. jhuber6: There's a test for this that's been running on https://lab.llvm.org/buildbot/#/builders/46 for…
		rpc_register_callback(
		device_id, RPC_FREE,
		[](rpc_port_t port, void *data) {
		auto free_handler = [](rpc_buffer_t buffer, void data) {
		CUstream memory_stream = static_cast<CUstream >(data);
		if (CUresult err = cuMemFreeAsync(
		static_cast<CUdeviceptr>(buffer->data[0]), memory_stream))
handle_error(err);		handle_error(err);
};		};
		rpc_recv_and_send(port, free_handler, data);
		},
		&memory_stream);

// Call the kernel with the given arguments.		// Call the kernel with the given arguments.
if (CUresult err = cuLaunchKernel(		if (CUresult err = cuLaunchKernel(
function, params.num_blocks_x, params.num_blocks_y,		function, params.num_blocks_x, params.num_blocks_y,
params.num_blocks_z, params.num_threads_x, params.num_threads_y,		params.num_blocks_z, params.num_threads_x, params.num_threads_y,
params.num_threads_z, 0, stream, nullptr, args_config))		params.num_threads_z, 0, stream, nullptr, args_config))
handle_error(err);		handle_error(err);

// Wait until the kernel has completed execution on the device. Periodically		// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.		// check the RPC client for work to be performed on the server.
while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)		while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
handle_server(allocator, deallocator);		if (rpc_status_t err = rpc_handle_server(device_id))
		handle_error(err);

// Handle the server one more time in case the kernel exited with a pending		// Handle the server one more time in case the kernel exited with a pending
// send still in flight.		// send still in flight.
handle_server(allocator, deallocator);		if (rpc_status_t err = rpc_handle_server(device_id))
		handle_error(err);

return CUDA_SUCCESS;		return CUDA_SUCCESS;
}		}

int load(int argc, char argv, char envp, void *image, size_t size,		int load(int argc, char argv, char envp, void *image, size_t size,
const LaunchParameters &params) {		const LaunchParameters &params) {

if (CUresult err = cuInit(0))		if (CUresult err = cuInit(0))
handle_error(err);		handle_error(err);
// Obtain the first device found on the system.		// Obtain the first device found on the system.
		uint32_t num_devices = 1;
		uint32_t device_id = 0;
CUdevice device;		CUdevice device;
if (CUresult err = cuDeviceGet(&device, 0))		if (CUresult err = cuDeviceGet(&device, device_id))
handle_error(err);		handle_error(err);

// Initialize the CUDA context and claim it for this execution.		// Initialize the CUDA context and claim it for this execution.
CUcontext context;		CUcontext context;
if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))		if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
handle_error(err);		handle_error(err);
if (CUresult err = cuCtxSetCurrent(context))		if (CUresult err = cuCtxSetCurrent(context))
handle_error(err);		handle_error(err);
Show All 39 Lines	int load(int argc, char argv, char envp, void *image, size_t size,

// Allocate space for the return pointer and initialize it to zero.		// Allocate space for the return pointer and initialize it to zero.
CUdeviceptr dev_ret;		CUdeviceptr dev_ret;
if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))		if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
handle_error(err);		handle_error(err);
if (CUresult err = cuMemsetD32(dev_ret, 0, 1))		if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
handle_error(err);		handle_error(err);

uint64_t port_size = __llvm_libc::rpc::DEFAULT_PORT_COUNT;		if (rpc_status_t err = rpc_init(num_devices))
uint32_t warp_size = 32;		handle_error(err);

uint64_t rpc_shared_buffer_size =
__llvm_libc::rpc::Server::allocation_size(port_size, warp_size);
void *rpc_shared_buffer = allocator(rpc_shared_buffer_size);

if (!rpc_shared_buffer)
handle_error("Failed to allocate memory the RPC client / server.");

// Initialize the RPC server's buffer for host-device communication.		uint32_t warp_size = 32;
server.reset(port_size, warp_size, rpc_shared_buffer);		auto rpc_alloc = [](uint64_t size, void ) -> void {
		void *dev_ptr;
		if (CUresult err = cuMemAllocHost(&dev_ptr, size))
		handle_error(err);
		return dev_ptr;
		};
		if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT,
		warp_size, rpc_alloc, nullptr))
		handle_error(err);

LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};		LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
// Call the kernel to		// Call the kernel to
begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};		begin_args_t init_args = {argc, dev_argv, dev_envp,
		rpc_get_buffer(device_id)};
if (CUresult err = launch_kernel(binary, stream, single_threaded_params,		if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
"_begin", init_args))		"_begin", init_args))
handle_error(err);		handle_error(err);

start_args_t args = {argc, dev_argv, dev_envp,		start_args_t args = {argc, dev_argv, dev_envp,
reinterpret_cast<void *>(dev_ret)};		reinterpret_cast<void *>(dev_ret)};
if (CUresult err = launch_kernel(binary, stream, params, "_start", args))		if (CUresult err = launch_kernel(binary, stream, params, "_start", args))
handle_error(err);		handle_error(err);
Show All 13 Lines	int load(int argc, char argv, char envp, void *image, size_t size,

// Free the memory allocated for the device.		// Free the memory allocated for the device.
if (CUresult err = cuMemFreeHost(*memory_or_err))		if (CUresult err = cuMemFreeHost(*memory_or_err))
handle_error(err);		handle_error(err);
if (CUresult err = cuMemFree(dev_ret))		if (CUresult err = cuMemFree(dev_ret))
handle_error(err);		handle_error(err);
if (CUresult err = cuMemFreeHost(dev_argv))		if (CUresult err = cuMemFreeHost(dev_argv))
handle_error(err);		handle_error(err);
if (CUresult err = cuMemFreeHost(rpc_shared_buffer))		if (rpc_status_t err = rpc_server_shutdown(
		device_id, [](void ptr, void ) { cuMemFreeHost(ptr); }, nullptr))
handle_error(err);		handle_error(err);

// Destroy the context and the loaded binary.		// Destroy the context and the loaded binary.
if (CUresult err = cuModuleUnload(binary))		if (CUresult err = cuModuleUnload(binary))
handle_error(err);		handle_error(err);
if (CUresult err = cuDevicePrimaryCtxRelease(device))		if (CUresult err = cuDevicePrimaryCtxRelease(device))
handle_error(err);		handle_error(err);
		if (rpc_status_t err = rpc_shutdown())
		handle_error(err);
return host_ret;		return host_ret;
}		}

libc/utils/gpu/server/CMakeLists.txt

This file was added.

				add_library(rpc_server STATIC Server.cpp)

				# Include the RPC implementation from libc.
				add_dependencies(rpc_server libc.src.__support.RPC.rpc)
				target_include_directories(rpc_server PRIVATE ${LIBC_SOURCE_DIR})
				target_include_directories(rpc_server PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

libc/utils/gpu/server/Server.h

This file was added.

				//===-- Shared memory RPC server instantiation ------------------- C++ --===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//

				#ifndef LLVM_LIBC_UTILS_GPU_SERVER_RPC_SERVER_H
				#define LLVM_LIBC_UTILS_GPU_SERVER_RPC_SERVER_H

				#include <stdint.h>

				#ifdef __cplusplus
				extern "C" {
				#endif

				/// The maximum number of ports that can be opened for any server.
				const uint64_t RPC_MAXIMUM_PORT_COUNT = 64;

				// TODO: Move these to a header exported by the C library.
				/// Opcodes identifying the operation a client requests from the server.
				/// rpc_handle_server casts the opcode read from a port to this type when it
				/// looks up a user callback, so the values presumably have to stay in sync
				/// with the client-side opcode enumeration -- TODO confirm.
				/// NOTE(review): an enum with a fixed underlying type is C++11 / C23 syntax;
				/// confirm the C compilers this header targets accept it.
				typedef enum : uint16_t {
				RPC_NOOP = 0,
				RPC_EXIT = 1,
				RPC_WRITE_TO_STDOUT = 2,
				RPC_WRITE_TO_STDERR = 3,
				RPC_WRITE_TO_STREAM = 4,
				RPC_MALLOC = 5,
				RPC_FREE = 6,
				} rpc_opcode_t;

				/// Status codes returned by the RPC server interface.
				typedef enum {
				// The call completed without error.
				RPC_STATUS_SUCCESS = 0x0,
				// Generic failure, e.g. the allocator returned null in rpc_server_init or
				// the reference count in rpc_init cannot be incremented further.
				RPC_STATUS_ERROR = 0x1000,
				// The device id is not smaller than the device count given to rpc_init.
				RPC_STATUS_OUT_OF_RANGE = 0x1001,
				// rpc_handle_server saw an opcode with no built-in handler and no
				// registered callback.
				RPC_STATUS_UNHANDLED_OPCODE = 0x1002,
				} rpc_status_t;

				/// A struct containing an opaque handle to an RPC port. This is what allows the
				/// server to communicate with the client. The server fills it in before
				/// invoking a registered opcode callback; callers should only pass it back to
				/// functions such as rpc_recv_and_send.
				typedef struct rpc_port_s {
				// Address of the server-side port object stored as an integer; it is
				// converted back to a port pointer inside rpc_recv_and_send.
				uint64_t handle;
				} rpc_port_t;

				/// A fixed-size buffer containing the payload sent from the client.
				/// Server.cpp statically asserts that this struct has the same size as the
				/// internal rpc::Buffer, and reinterprets pointers between the two.
				typedef struct rpc_buffer_s {
				sivachandraUnsubmitted Not Done Reply Inline Actions Why are there separate functions allocation and deallocation? sivachandra: Why are there separate functions allocation and deallocation?
				jhuber6AuthorUnsubmitted Done Reply Inline Actions I'll change the name to `free` but we need both to adequately de allocate the shared memory. jhuber6: I'll change the name to `free` but we need both to adequately de allocate the shared memory.
				// Eight 64-bit payload words; their meaning depends on the opcode.
				uint64_t data[8];
				} rpc_buffer_t;

				/// A function used to allocate \p bytes for use by the RPC server and client.
				/// The memory should support asynchronous and atomic access from both the
				sivachandraUnsubmitted Done Reply Inline Actions Update comment. sivachandra: Update comment.
				/// client and server.
				/// \p data is the opaque user pointer given to rpc_server_init and is passed
				/// through unchanged. Returning a null pointer makes rpc_server_init fail
				/// with RPC_STATUS_ERROR.
				typedef void *(*rpc_alloc_ty)(uint64_t size, void *data);

				sivachandraUnsubmitted Done Reply Inline Actions Ditto. sivachandra: Ditto.
				/// A function used to free the \p ptr previously allocated. \p data is the
				/// opaque user pointer given to rpc_server_shutdown and is passed through
				/// unchanged.
				typedef void (*rpc_free_ty)(void *ptr, void *data);

				JonChesterfieldUnsubmitted Done Reply Inline Actions This looks like the type of a function, not the type of a function pointer. It's used as an argument to functions where it'll decay to the pointer type. More conventionally written with an extra * typedef void(rpc_free_ty)(void ptr, void data); Is there a benefit to declaring this as the function type as opposed to the function pointer type? JonChesterfield:* This looks like the type of a function, not the type of a function pointer. It's used as an…
				jhuber6AuthorUnsubmitted Done Reply Inline Actions I just forgot to add it and my IDE took care of the conversions when I got an error, will change. jhuber6: I just forgot to add it and my IDE took care of the conversions when I got an error, will…
				/// A callback function provided with a \p port to communicate with the RPC
				/// client. This will be called by the server to handle an opcode. \p data is
				/// the opaque user pointer supplied when the callback was registered.
				typedef void (*rpc_opcode_callback_ty)(rpc_port_t port, void *data);

				sivachandraUnsubmitted Done Reply Inline Actions Why not just `rpc_shutdown`? sivachandra: Why not just `rpc_shutdown`?
				/// A callback function to use the port to receive or send a \p buffer.
				typedef void (rpc_port_callback_ty)(rpc_buffer_t buffer, void *data);

				/// Initialize the rpc library for general use on \p num_devices.
				/// The first call creates the global state sized for \p num_devices; every
				/// call increments a reference count. If the state already exists,
				/// \p num_devices is not consulted again.
				/// \returns RPC_STATUS_ERROR when the reference count cannot be incremented
				/// further, RPC_STATUS_SUCCESS otherwise.
				rpc_status_t rpc_init(uint32_t num_devices);

				/// Shut down the rpc interface. Drops one reference taken by rpc_init and
				/// destroys the global state when the last reference is released.
				rpc_status_t rpc_shutdown(void);

				JonChesterfieldUnsubmitted Done Reply Inline Actions Want `(void)` if this is meant to be usable from C (the guards about suggest it is) C++ thinks foo() is a function of no arguments. C thinks it's some aberration from the past (though that might have been dropped in the last standard). JonChesterfield: Want `(void)` if this is meant to be usable from C (the guards about suggest it is) C++ thinks…
				jhuber6AuthorUnsubmitted Done Reply Inline Actions Forgot about that quirk of C, thanks. jhuber6: Forgot about that quirk of C, thanks.
				/// Initialize the server for a given device. The shared buffer is obtained by
				/// calling \p alloc with the required size and \p data.
				/// \returns RPC_STATUS_OUT_OF_RANGE for an invalid \p device_id,
				/// RPC_STATUS_ERROR if \p alloc returns null, RPC_STATUS_SUCCESS otherwise.
				rpc_status_t rpc_server_init(uint32_t device_id, uint64_t num_ports,
				uint32_t lane_size, rpc_alloc_ty alloc,
				void *data);

				/// Shut down the server for a given device. The device's shared buffer is
				/// handed to \p dealloc together with \p data.
				/// \returns RPC_STATUS_OUT_OF_RANGE for an invalid \p device_id.
				rpc_status_t rpc_server_shutdown(uint32_t device_id, rpc_free_ty dealloc,
				void *data);

				/// Queries the RPC clients at least once and performs server-side work if there
				/// are any active requests. Runs until all work on the server is completed.
				/// \returns RPC_STATUS_OUT_OF_RANGE for an invalid \p device_id and
				/// RPC_STATUS_UNHANDLED_OPCODE if a request uses an opcode that has neither a
				/// built-in handler nor a registered callback.
				rpc_status_t rpc_handle_server(uint32_t device_id);

				/// Register a callback to handle an opcode from the RPC client. The associated
				/// data must remain accessible as long as the user intends to handle the server
				/// with this callback. Registering again for the same opcode replaces the
				/// previous callback and data.
				/// \returns RPC_STATUS_OUT_OF_RANGE for an invalid \p device_id.
				rpc_status_t rpc_register_callback(uint32_t device_id, rpc_opcode_t opcode,
				rpc_opcode_callback_ty callback, void *data);

				/// Obtain a pointer to the memory buffer used to run the RPC client and server.
				/// \returns a null pointer for an invalid \p device_id.
				void *rpc_get_buffer(uint32_t device_id);

				/// Use the \p port to receive and send a buffer using the \p callback.
				/// \p data is forwarded unchanged to \p callback.
				void rpc_recv_and_send(rpc_port_t port, rpc_port_callback_ty callback,
				void *data);

				#ifdef __cplusplus
				}
				#endif

				#endif

libc/utils/gpu/server/Server.cpp

This file was added.

				//===-- Shared memory RPC server instantiation ------------------- C++ --===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//

				#include "Server.h"

				#include "src/__support/RPC/rpc.h"
				#include <atomic>
				#include <cstdio>
				#include <memory>
				#include <mutex>
				#include <unordered_map>

				using namespace __llvm_libc;

				static_assert(sizeof(rpc_buffer_t) == sizeof(rpc::Buffer),
				"Buffer size mismatch");

				static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::DEFAULT_PORT_COUNT,
				"Incorrect maximum port count");
				/// Per-device server state: the RPC server itself plus the user callbacks
				/// registered for opcodes the server does not handle on its own.
				struct Device {
				// Server endpoint; (re)initialized with its buffer in rpc_server_init.
				rpc::Server server;
				// Callbacks installed through rpc_register_callback, keyed by opcode.
				std::unordered_map<rpc_opcode_t, rpc_opcode_callback_ty> callbacks;
				// Opaque user pointer passed to the callback registered for the same opcode.
				std::unordered_map<rpc_opcode_t, void *> callback_data;
				};

				// A struct containing all the runtime state required to run the RPC server.
				// One instance is created by rpc_init and shared by every entry point.
				struct State {
				// Allocates one default-constructed Device per device and starts the
				// reference count at zero; rpc_init increments it afterwards.
				State(uint32_t num_devices)
				: num_devices(num_devices),
				devices(std::unique_ptr<Device[]>(new Device[num_devices])),
				reference_count(0u) {}
				JonChesterfieldUnsubmitted Not Done Reply Inline Actions Could we go with a vector of Device instead of the new+array construct? JonChesterfield: Could we go with a vector of Device instead of the new+array construct?
				jhuber6AuthorUnsubmitted Done Reply Inline Actions It's a static size so a constant sized array should be more correct. jhuber6: It's a static size so a constant sized array should be more correct.
				// Number of entries in `devices`; used to range-check device ids.
				uint32_t num_devices;
				// Per-device server state, indexed by device id.
				std::unique_ptr<Device[]> devices;
				// Number of rpc_init calls not yet matched by rpc_shutdown.
				std::atomic_uint32_t reference_count;
				};

				static std::mutex startup_mutex;

				static State *state;

				JonChesterfieldUnsubmitted Not Done Reply Inline Actions Why is this a heap allocated thing, as opposed to `static State state;` ? JonChesterfield: Why is this a heap allocated thing, as opposed to `static State state;` ?
				jhuber6AuthorUnsubmitted Done Reply Inline Actions It's just easier to check if it's been initialized because the pointer is nullable. We coiuld probably make it a static thing and have a flag instead if you'd like. jhuber6: It's just easier to check if it's been initialized because the pointer is nullable. We coiuld…
				// Creates the global State on first use and takes one reference to it.
				// The whole function runs under startup_mutex.
				rpc_status_t rpc_init(uint32_t num_devices) {
				std::scoped_lock<decltype(startup_mutex)> lock(startup_mutex);
				// Only the first caller's num_devices sizes the state; later calls reuse it.
				if (!state)
				state = new State(num_devices);

				// Refuse to wrap the 32-bit reference count.
				// NOTE(review): std::numeric_limits lives in <limits>, which this file does
				// not include directly -- confirm it is reachable through another header.
				if (state->reference_count == std::numeric_limits<uint32_t>::max())
				return RPC_STATUS_ERROR;
				JonChesterfieldUnsubmitted Not Done Reply Inline Actions Could make the counter 64 bit and delete the test against max as a counter >= address space size can't overflow In general the DIY reference counting is a bit odd - is there a reason this isn't a shared_ptr? JonChesterfield: Could make the counter 64 bit and delete the test against max as a counter >= address space…
				jhuber6AuthorUnsubmitted Done Reply Inline Actions Would a shared pointer give us the same semantics? We would be allocating it multiple times and not copying it. jhuber6: Would a shared pointer give us the same semantics? We would be allocating it multiple times and…

				state->reference_count++;

				return RPC_STATUS_SUCCESS;
				}

				// Releases one reference taken by rpc_init and destroys the global state
				// when the last reference goes away.
				//
				// Returns RPC_STATUS_ERROR if the library is not currently initialized,
				// RPC_STATUS_SUCCESS otherwise.
				rpc_status_t rpc_shutdown(void) {
				  // Serialize against rpc_init, which creates `state` under the same mutex.
				  std::scoped_lock<decltype(startup_mutex)> lock(startup_mutex);

				  // Shutting down without a matching rpc_init is reported, not dereferenced.
				  if (!state)
				    return RPC_STATUS_ERROR;

				  if (state->reference_count-- == 1) {
				    delete state;
				    // Clear the pointer so a later rpc_init builds a fresh State instead of
				    // touching the freed one.
				    state = nullptr;
				  }

				  return RPC_STATUS_SUCCESS;
				}

				// Sets up the RPC server of one device on top of a buffer obtained from the
				// caller-provided allocator.
				//
				// Returns RPC_STATUS_OUT_OF_RANGE for an unknown device, RPC_STATUS_ERROR
				// when the allocator yields no memory, and RPC_STATUS_SUCCESS otherwise.
				rpc_status_t rpc_server_init(uint32_t device_id, uint64_t num_ports,
				                             uint32_t lane_size, rpc_alloc_ty alloc,
				                             void *data) {
				  if (!(device_id < state->num_devices))
				    return RPC_STATUS_OUT_OF_RANGE;

				  // Size the shared region for the requested port count and lane size, then
				  // let the caller decide where that memory comes from.
				  const uint64_t bytes_needed =
				      __llvm_libc::rpc::Server::allocation_size(num_ports, lane_size);
				  void *shared_memory = alloc(bytes_needed, data);
				  if (shared_memory == nullptr)
				    return RPC_STATUS_ERROR;

				  auto &server = state->devices[device_id].server;
				  server.reset(num_ports, lane_size, shared_memory);
				  return RPC_STATUS_SUCCESS;
				}

				// Hands the shared buffer of one device back to the caller-provided
				// deallocator. Returns RPC_STATUS_OUT_OF_RANGE for an unknown device.
				rpc_status_t rpc_server_shutdown(uint32_t device_id, rpc_free_ty dealloc,
				                                 void *data) {
				  if (device_id < state->num_devices) {
				    void *shared_memory = rpc_get_buffer(device_id);
				    dealloc(shared_memory, data);
				    return RPC_STATUS_SUCCESS;
				  }

				  return RPC_STATUS_OUT_OF_RANGE;
				}

				// Services pending client requests for one device. Each iteration tries to
				// open a port, dispatches on its opcode, and closes the port; the loop ends
				// with RPC_STATUS_SUCCESS once no port can be opened. Opcodes without a
				// built-in case are forwarded to the callback registered for them.
				rpc_status_t rpc_handle_server(uint32_t device_id) {
				if (device_id >= state->num_devices)
				return RPC_STATUS_OUT_OF_RANGE;

				for (;;) {
				auto port = state->devices[device_id].server.try_open();
				// No port could be opened: nothing left to service right now.
				if (!port)
				return RPC_STATUS_SUCCESS;

				switch (port->get_opcode()) {
				// The three write opcodes share one handler and differ only in the
				// destination FILE.
				case rpc::Opcode::WRITE_TO_STREAM:
				case rpc::Opcode::WRITE_TO_STDERR:
				case rpc::Opcode::WRITE_TO_STDOUT: {
				// Arrays indexed by the `id` passed to the port callbacks (presumably
				// the lane index -- TODO confirm).
				uint64_t sizes[rpc::MAX_LANE_SIZE] = {0};
				void *strs[rpc::MAX_LANE_SIZE] = {nullptr};
				JonChesterfieldUnsubmitted Not Done Reply Inline Actions I'm still hopeful that we'll come up with a better idea than rpc::MAX_LANE_SIZE JonChesterfield: I'm still hopeful that we'll come up with a better idea than rpc::MAX_LANE_SIZE
				jhuber6AuthorUnsubmitted Done Reply Inline Actions We could use a vector and push back into it instead, or preallocate according to the size above, but this was the easiest solution. jhuber6: We could use a vector and push back into it instead, or preallocate according to the size above…
				FILE *files[rpc::MAX_LANE_SIZE] = {nullptr};
				// Only WRITE_TO_STREAM carries an explicit FILE pointer, sent in the
				// first payload word.
				if (port->get_opcode() == rpc::Opcode::WRITE_TO_STREAM)
				port->recv([&](rpc::Buffer *buffer, uint32_t id) {
				files[id] = reinterpret_cast<FILE *>(buffer->data[0]);
				});
				// Receive the string data into freshly allocated host buffers.
				port->recv_n(strs, sizes, [&](uint64_t size) { return new char[size]; });
				// Write each string and send a result back in the first payload word.
				port->send([&](rpc::Buffer *buffer, uint32_t id) {
				FILE *file = port->get_opcode() == rpc::Opcode::WRITE_TO_STDOUT
				? stdout
				: (port->get_opcode() == rpc::Opcode::WRITE_TO_STDERR
				? stderr
				: files[id]);
				// NOTE(review): fwrite returns a size_t item count, so after conversion to
				// int `ret >= 0` looks always true and a failed write would still report
				// sizes[id] to the client -- confirm the intended error path.
				int ret = fwrite(strs[id], sizes[id], 1, file);
				reinterpret_cast<int *>(buffer->data)[0] = ret >= 0 ? sizes[id] : ret;
				});
				// Release the receive buffers.
				// NOTE(review): allocated as char[] above but deleted through uint8_t *.
				for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {
				if (strs[i])
				delete[] reinterpret_cast<uint8_t *>(strs[i]);
				}
				break;
				}
				case rpc::Opcode::EXIT: {
				// Terminates the host process with the status sent by the client.
				// NOTE(review): exit is declared in <cstdlib>, which this file does not
				// include directly.
				port->recv([](rpc::Buffer *buffer) {
				exit(reinterpret_cast<uint32_t *>(buffer->data)[0]);
				});
				break;
				}
				// TODO: Move handling of these test cases to the loader implementation.
				case rpc::Opcode::TEST_INCREMENT: {
				// Adds one to the first payload word and sends it back.
				port->recv_and_send([](rpc::Buffer *buffer) {
				reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
				});
				break;
				}
				case rpc::Opcode::TEST_INTERFACE: {
				// Fixed sequence of receives and sends; the client must presumably issue
				// the mirror-image sequence. The first message selects whether the
				// exchange ends with a receive or a send.
				uint64_t cnt = 0;
				bool end_with_recv;
				port->recv([&](rpc::Buffer *buffer) { end_with_recv = buffer->data[0]; });
				port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
				port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
				port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
				port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
				port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
				port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
				port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
				port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
				if (end_with_recv)
				port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
				else
				port->send(
				[&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
				break;
				}
				case rpc::Opcode::TEST_STREAM: {
				// Receives variable-sized data and sends the same data back.
				uint64_t sizes[rpc::MAX_LANE_SIZE] = {0};
				void *dst[rpc::MAX_LANE_SIZE] = {nullptr};
				port->recv_n(dst, sizes, [](uint64_t size) { return new char[size]; });
				port->send_n(dst, sizes);
				for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {
				if (dst[i])
				delete[] reinterpret_cast<uint8_t *>(dst[i]);
				}
				break;
				}
				case rpc::Opcode::NOOP: {
				// Consume the message and do nothing with it.
				port->recv([](rpc::Buffer *buffer) {});
				break;
				}
				default: {
				// Anything else is delegated to a user callback registered for the opcode.
				auto handler = state->devices[device_id].callbacks.find(
				static_cast<rpc_opcode_t>(port->get_opcode()));

				// We error out on an unhandled opcode.
				// NOTE(review): this return skips the port->close() below -- confirm the
				// port is released some other way on this path.
				if (handler == state->devices[device_id].callbacks.end())
				return RPC_STATUS_UNHANDLED_OPCODE;

				// Invoke the registered callback with a reference to the port.
				void *data = state->devices[device_id].callback_data.at(
				static_cast<rpc_opcode_t>(port->get_opcode()));
				// The handle is the address of the open port; it is only meaningful while
				// this iteration's `port` is alive.
				rpc_port_t port_ref{reinterpret_cast<uint64_t>(&*port)};
				(handler->second)(port_ref, data);
				}
				}
				port->close();
				}
				}

				// Installs (or replaces) the handler and its user data for one opcode on one
				// device. Returns RPC_STATUS_OUT_OF_RANGE for an unknown device.
				rpc_status_t rpc_register_callback(uint32_t device_id, rpc_opcode_t opcode,
				                                   rpc_opcode_callback_ty callback,
				                                   void *data) {
				  if (!(device_id < state->num_devices))
				    return RPC_STATUS_OUT_OF_RANGE;

				  // The callback and its user pointer live in two maps sharing the same key.
				  auto &device = state->devices[device_id];
				  device.callbacks[opcode] = callback;
				  device.callback_data[opcode] = data;
				  return RPC_STATUS_SUCCESS;
				}

				// Returns the start of the shared buffer backing the given device's server,
				// or a null pointer when the device id is out of range.
				void *rpc_get_buffer(uint32_t device_id) {
				  const bool in_range = device_id < state->num_devices;
				  return in_range ? state->devices[device_id].server.get_buffer_start()
				                  : nullptr;
				}

				// Receives a buffer on the port behind \p ref, lets \p callback modify it,
				// and sends it back. \p data is forwarded to \p callback unchanged.
				void rpc_recv_and_send(rpc_port_t ref, rpc_port_callback_ty callback,
				                       void *data) {
				  // The handle carries the address of the server port that
				  // rpc_handle_server had open when it invoked the opcode callback.
				  auto *port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
				  port->recv_and_send([=](rpc::Buffer *buffer) {
				    // The public buffer type is checked to have the same size as rpc::Buffer.
				    callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
				  });
				}

This is an archive of the discontinued LLVM Phabricator instance.

[libc] Begin implementing a library for the RPC server
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 531782

libc/src/__support/RPC/rpc.h

libc/utils/gpu/CMakeLists.txt

libc/utils/gpu/loader/Loader.h

libc/utils/gpu/loader/Server.h

libc/utils/gpu/loader/amdgpu/CMakeLists.txt

libc/utils/gpu/loader/amdgpu/Loader.cpp

libc/utils/gpu/loader/nvptx/CMakeLists.txt

libc/utils/gpu/loader/nvptx/Loader.cpp

libc/utils/gpu/server/CMakeLists.txt

libc/utils/gpu/server/Server.h

libc/utils/gpu/server/Server.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[libc] Begin implementing a library for the RPC serverClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 531782

libc/src/__support/RPC/rpc.h

libc/utils/gpu/CMakeLists.txt

libc/utils/gpu/loader/Loader.h

libc/utils/gpu/loader/Server.h

libc/utils/gpu/loader/amdgpu/CMakeLists.txt

libc/utils/gpu/loader/amdgpu/Loader.cpp

libc/utils/gpu/loader/nvptx/CMakeLists.txt

libc/utils/gpu/loader/nvptx/Loader.cpp

libc/utils/gpu/server/CMakeLists.txt

libc/utils/gpu/server/Server.h

libc/utils/gpu/server/Server.cpp

[libc] Begin implementing a library for the RPC server
ClosedPublic