Diff 514929

libc/utils/gpu/loader/Loader.h

	//===-- Generic device loader interface -----------------------------------===//			//===-- Generic device loader interface -----------------------------------===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H			#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
	#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H			#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H

	#include <cstdint>			#include <cstdint>
	#include <cstring>			#include <cstring>
	#include <stddef.h>			#include <stddef.h>

				/// Generic launch parameters for configuring the number of blocks / threads.
				/// The loader's `main` initializes every field to 1 and overrides them from
				/// the `--threads[-x|-y|-z]` and `--blocks[-x|-y|-z]` command line options.
				/// The values are then handed to the device-specific `load` implementation.
				struct LaunchParameters {
				/// Number of threads per block in the X dimension.
				uint32_t num_threads_x;
				/// Number of threads per block in the Y dimension.
				uint32_t num_threads_y;
				/// Number of threads per block in the Z dimension.
				uint32_t num_threads_z;
				/// Number of blocks in the X dimension.
				uint32_t num_blocks_x;
				/// Number of blocks in the Y dimension.
				uint32_t num_blocks_y;
				/// Number of blocks in the Z dimension.
				uint32_t num_blocks_z;
				};

	/// Generic interface to load the \p image and launch execution of the _start			/// Generic interface to load the \p image and launch execution of the _start
	/// kernel on the target device. Copies \p argc and \p argv to the device.			/// kernel on the target device. Copies \p argc and \p argv to the device.
	/// Returns the final value of the `main` function on the device.			/// Returns the final value of the `main` function on the device.
	int load(int argc, char argv, char evnp, void *image, size_t size);			int load(int argc, char argv, char evnp, void *image, size_t size,
				const LaunchParameters &params);

	/// Copy the system's argument vector to GPU memory allocated using \p alloc.			/// Copy the system's argument vector to GPU memory allocated using \p alloc.
	template <typename Allocator>			template <typename Allocator>
	void copy_argument_vector(int argc, char *argv, Allocator alloc) {			void copy_argument_vector(int argc, char *argv, Allocator alloc) {
	size_t argv_size = sizeof(char ) (argc + 1);			size_t argv_size = sizeof(char ) (argc + 1);
	size_t str_size = 0;			size_t str_size = 0;
	for (int i = 0; i < argc; ++i)			for (int i = 0; i < argc; ++i)
	str_size += strlen(argv[i]) + 1;			str_size += strlen(argv[i]) + 1;
	Show All 31 Lines

libc/utils/gpu/loader/Main.cpp

	Show All 9 Lines
	// one of the loader implementations for launch.			// one of the loader implementations for launch.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#include "Loader.h"			#include "Loader.h"

	#include <cstdio>			#include <cstdio>
	#include <cstdlib>			#include <cstdlib>
				#include <string>
				#include <vector>

	int main(int argc, char argv, char envp) {			int main(int argc, char argv, char envp) {
	if (argc < 2) {			if (argc < 2) {
	printf("USAGE: ./loader <device_image> <args>, ...\n");			printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> "
				"<args>, ...\n");
	return EXIT_SUCCESS;			return EXIT_SUCCESS;
	}			}

	// TODO: We should perform some validation on the file.			int offset = 0;
	FILE *file = fopen(argv[1], "r");			FILE *file = nullptr;
				char *ptr;
				LaunchParameters params = {1, 1, 1, 1, 1, 1};
				while (!file && ++offset < argc) {
				if (argv[offset] == std::string("--threads") \|\|
				[Inline comment by tra, not done] Nit: Can we use `strtoul` instead? Sometimes it's convenient to be able to use hex numbers, and you would also get an indication if the input is not a number.
				[Inline reply by jhuber6 (author), done] Good point, I'm just used to using the quick-and-dirty `atoi`.
				argv[offset] == std::string("--threads-x")) {
				params.num_threads_x =
				offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
				offset++;
				continue;
				} else if (argv[offset] == std::string("--threads-y")) {
				[Inline comment by jdoerfert, not done] Move this first, then emit an error if the file was not opened. Right now random args will be ignored.
				params.num_threads_y =
				offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
				offset++;
				continue;
				} else if (argv[offset] == std::string("--threads-z")) {
				params.num_threads_z =
				offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
				offset++;
				continue;
				} else if (argv[offset] == std::string("--blocks") \|\|
				argv[offset] == std::string("--blocks-x")) {
				params.num_blocks_x =
				offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
				offset++;
				continue;
				} else if (argv[offset] == std::string("--blocks-y")) {
				params.num_blocks_y =
				offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
				offset++;
				continue;
				} else if (argv[offset] == std::string("--blocks-z")) {
				params.num_blocks_z =
				offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
				offset++;
				continue;
				} else {
				file = fopen(argv[offset], "r");
				if (!file) {
				fprintf(stderr, "Failed to open image file '%s'\n", argv[offset]);
				return EXIT_FAILURE;
				}
				break;
				}
				}

	if (!file) {			if (!file) {
	fprintf(stderr, "Failed to open image file %s\n", argv[1]);			fprintf(stderr, "No image file provided\n");
	return EXIT_FAILURE;			return EXIT_FAILURE;
	}			}

				// TODO: We should perform some validation on the file.
	fseek(file, 0, SEEK_END);			fseek(file, 0, SEEK_END);
	const auto size = ftell(file);			const auto size = ftell(file);
	fseek(file, 0, SEEK_SET);			fseek(file, 0, SEEK_SET);

	void image = malloc(size sizeof(char));			void image = malloc(size sizeof(char));
	fread(image, sizeof(char), size, file);			fread(image, sizeof(char), size, file);
	fclose(file);			fclose(file);

	// Drop the loader from the program arguments.			// Drop the loader from the program arguments.
	int ret = load(argc - 1, &argv[1], envp, image, size);			int ret = load(argc - offset, &argv[offset], envp, image, size, params);

	free(image);			free(image);
	return ret;			return ret;
	}			}

libc/utils/gpu/loader/amdgpu/Loader.cpp

Show First 20 Lines • Show All 164 Lines • ▼ Show 20 Lines	auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
if (flags & flag)		if (flags & flag)
*output_pool = memory_pool;		*output_pool = memory_pool;

return HSA_STATUS_SUCCESS;		return HSA_STATUS_SUCCESS;
};		};
return iterate_agent_memory_pools(agent, cb);		return iterate_agent_memory_pools(agent, cb);
}		}

int load(int argc, char argv, char envp, void *image, size_t size) {		int load(int argc, char argv, char envp, void *image, size_t size,
		const LaunchParameters &params) {
// Initialize the HSA runtime used to communicate with the device.		// Initialize the HSA runtime used to communicate with the device.
if (hsa_status_t err = hsa_init())		if (hsa_status_t err = hsa_init())
handle_error(err);		handle_error(err);

// Register a callback when the device encounters a memory fault.		// Register a callback when the device encounters a memory fault.
if (hsa_status_t err = hsa_amd_register_system_event_handler(		if (hsa_status_t err = hsa_amd_register_system_event_handler(
[](const hsa_amd_event_t event, void ) -> hsa_status_t {		[](const hsa_amd_event_t event, void ) -> hsa_status_t {
if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)		if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
▲ Show 20 Lines • Show All 168 Lines • ▼ Show 20 Lines	int load(int argc, char argv, char envp, void *image, size_t size,
const uint32_t mask = queue_size - 1;		const uint32_t mask = queue_size - 1;
hsa_kernel_dispatch_packet_t *packet =		hsa_kernel_dispatch_packet_t *packet =
(hsa_kernel_dispatch_packet_t *)queue->base_address + (packet_id & mask);		(hsa_kernel_dispatch_packet_t *)queue->base_address + (packet_id & mask);

// Set up the packet for execution on the device. We currently only launch		// Set up the packet for execution on the device. We currently only launch
// with one thread on the device, forcing the rest of the wavefront to be		// with one thread on the device, forcing the rest of the wavefront to be
// masked off.		// masked off.
std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));		std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
packet->setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;		packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
packet->workgroup_size_x = 1;		(params.num_blocks_z * params.num_threads_z != 1))
packet->workgroup_size_y = 1;		<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet->workgroup_size_z = 1;		packet->workgroup_size_x = params.num_threads_x;
packet->grid_size_x = 1;		packet->workgroup_size_y = params.num_threads_y;
packet->grid_size_y = 1;		packet->workgroup_size_z = params.num_threads_z;
packet->grid_size_z = 1;		packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
		packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
		packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
packet->private_segment_size = private_size;		packet->private_segment_size = private_size;
packet->group_segment_size = group_size;		packet->group_segment_size = group_size;
packet->kernel_object = kernel;		packet->kernel_object = kernel;
packet->kernarg_address = args;		packet->kernarg_address = args;

// Create a signal to indicate when this packet has been completed.		// Create a signal to indicate when this packet has been completed.
if (hsa_status_t err =		if (hsa_status_t err =
hsa_signal_create(1, 0, nullptr, &packet->completion_signal))		hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
▲ Show 20 Lines • Show All 83 Lines • Show Last 20 Lines

libc/utils/gpu/loader/nvptx/Loader.cpp

Show First 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	static void handle_error(CUresult err) {
exit(1);		exit(1);
}		}

static void handle_error(const char *msg) {		static void handle_error(const char *msg) {
fprintf(stderr, "%s\n", msg);		fprintf(stderr, "%s\n", msg);
exit(EXIT_FAILURE);		exit(EXIT_FAILURE);
}		}

int load(int argc, char argv, char envp, void *image, size_t size) {		int load(int argc, char argv, char envp, void *image, size_t size,
		const LaunchParameters &params) {
if (CUresult err = cuInit(0))		if (CUresult err = cuInit(0))
handle_error(err);		handle_error(err);

// Obtain the first device found on the system.		// Obtain the first device found on the system.
CUdevice device;		CUdevice device;
if (CUresult err = cuDeviceGet(&device, 0))		if (CUresult err = cuDeviceGet(&device, 0))
handle_error(err);		handle_error(err);

▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines	int load(int argc, char argv, char envp, void *image, size_t size,
args.buffer = buffer;		args.buffer = buffer;
void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,		void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,		CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END};		CU_LAUNCH_PARAM_END};

// Initialize the RPC server's buffer for host-device communication.		// Initialize the RPC server's buffer for host-device communication.
server.reset(server_inbox, server_outbox, buffer);		server.reset(server_inbox, server_outbox, buffer);

// Call the kernel with the given arguments.		// Call the kernel with the given arguments.
if (CUresult err =		if (CUresult err = cuLaunchKernel(
cuLaunchKernel(function, /gridDimX=/1, /gridDimY=/1,		function, params.num_blocks_x, params.num_blocks_y,
/gridDimZ=/1, /blockDimX=/1, /blockDimY=/1,		params.num_blocks_z, params.num_threads_x, params.num_threads_y,
/bloackDimZ=/1, 0, stream, nullptr, args_config))		params.num_threads_z, 0, stream, nullptr, args_config))
		[Inline comment by tra, not done] If we're allowing controlling the number of blocks/threads at all, is there a reason not to allow specifying all dimensions?
		[Inline reply by jhuber6 (author), done] I wasn't sure if I should bother, but I could definitely add it since it's hardly any more work than what's here. I think internally for implementations we'll need to generate all of our thread IDs using the full dimensions as well, but that's probably standard.
		[Inline reply by tra, not done] That's one common pattern. However, there are also use cases where small kernels benefit performance-wise from being able to use x/y/z indices directly, without having to calculate the single thread ID and then split it into sub-indices. There's also a limit on how large the individual dimensions can be, so specifying a single one may not be sufficient for large inputs: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
handle_error(err);		handle_error(err);

// Wait until the kernel has completed execution on the device. Periodically		// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.		// check the RPC client for work to be performed on the server.
while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)		while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
handle_server();		handle_server();

// Copy the return value back from the kernel and wait.		// Copy the return value back from the kernel and wait.
Show All 26 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[libc] Add the '--threads' and '--blocks' option to the GPU loaders
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 514929

libc/utils/gpu/loader/Loader.h

libc/utils/gpu/loader/Main.cpp

libc/utils/gpu/loader/amdgpu/Loader.cpp

libc/utils/gpu/loader/nvptx/Loader.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[libc] Add the '--threads' and '--blocks' option to the GPU loaders
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 514929

libc/utils/gpu/loader/Loader.h

libc/utils/gpu/loader/Main.cpp

libc/utils/gpu/loader/amdgpu/Loader.cpp

libc/utils/gpu/loader/nvptx/Loader.cpp

[libc] Add the '--threads' and '--blocks' option to the GPU loaders
ClosedPublic