diff --git a/libc/utils/CMakeLists.txt b/libc/utils/CMakeLists.txt --- a/libc/utils/CMakeLists.txt +++ b/libc/utils/CMakeLists.txt @@ -2,3 +2,6 @@ add_subdirectory(MPFRWrapper) add_subdirectory(testutils) endif() +if(LIBC_TARGET_ARCHITECTURE_IS_GPU) + add_subdirectory(gpu) +endif() diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(loader) diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/loader/CMakeLists.txt @@ -0,0 +1,7 @@ +add_library(gpu_loader OBJECT Main.cpp) +target_include_directories(gpu_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) +if(hsa-runtime64_FOUND) + add_subdirectory(amdgpu) +endif() diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/loader/Loader.h @@ -0,0 +1,14 @@ +//===-- Generic device loader interface -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +/// Generic interface to load the \p image and launch execution of the _start +/// kernel on the target device. Copies \p argc and \p argv to the device. +/// Returns the final value of the `main` function on the device. +int load(int argc, char **argv, void *image, size_t size); diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/loader/Main.cpp @@ -0,0 +1,46 @@ +//===-- Main entry into the loader interface ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file opens a device image passed on the command line and passes it to +// one of the loader implementations for launch. +// +//===----------------------------------------------------------------------===// + +#include "Loader.h" + +#include +#include + +int main(int argc, char **argv) { + if (argc < 2) { + printf("USAGE: ./loader , ...\n"); + return EXIT_SUCCESS; + } + + // TODO: We should perform some validation on the file. + FILE *file = fopen(argv[1], "r"); + + if (!file) { + fprintf(stderr, "Failed to open image file %s\n", argv[1]); + return EXIT_FAILURE; + } + + fseek(file, 0, SEEK_END); + const auto size = ftell(file); + fseek(file, 0, SEEK_SET); + + void *image = malloc(size * sizeof(char)); + fread(image, sizeof(char), size, file); + fclose(file); + + // Drop the loader from the program arguments. + int ret = load(argc - 1, &argv[1], image, size); + + free(image); + return ret; +} diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt @@ -0,0 +1,6 @@ +add_executable(amdhsa_loader Loader.cpp) +target_link_libraries(amdhsa_loader + PRIVATE + hsa-runtime64::hsa-runtime64 + gpu_loader +) diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp new file mode 100644 --- /dev/null +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -0,0 +1,379 @@ +//===-- Loader Implementation for AMDHSA devices --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file impelements a simple loader to run images supporting the AMDHSA +// architecture. The file launches the '_start' kernel which should be provided +// by the device application start code and call ultimately call the 'main' +// function. +// +//===----------------------------------------------------------------------===// + +#include "Loader.h" + +#include +#include + +#include +#include +#include +#include + +/// The name of the kernel we will launch. All AMDHSA kernels end with '.kd'. +constexpr const char *KERNEL_START = "_start.kd"; + +/// The arguments to the '_start' kernel. +struct kernel_args_t { + int argc; + void *argv; + void *ret; +}; + +/// Print the error code and exit if \p code indicates an error. +static void handle_error(hsa_status_t code) { + if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK) + return; + + const char *desc; + if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS) + desc = "Unknown error"; + fprintf(stderr, "%s\n", desc); + exit(EXIT_FAILURE); +} + +/// Generic interface for iterating using the HSA callbacks. +template +hsa_status_t iterate(func_ty func, callback_ty cb) { + auto l = [](elem_ty elem, void *data) -> hsa_status_t { + callback_ty *unwrapped = static_cast(data); + return (*unwrapped)(elem); + }; + return func(l, static_cast(&cb)); +} + +/// Generic interface for iterating using the HSA callbacks. +template +hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) { + auto l = [](elem_ty elem, void *data) -> hsa_status_t { + callback_ty *unwrapped = static_cast(data); + return (*unwrapped)(elem); + }; + return func(func_arg, l, static_cast(&cb)); +} + +/// Iterate through all availible agents. +template +hsa_status_t iterate_agents(callback_ty callback) { + return iterate(hsa_iterate_agents, callback); +} + +/// Iterate through all availible memory pools. +template +hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) { + return iterate(hsa_amd_agent_iterate_memory_pools, + agent, cb); +} + +template +hsa_status_t get_agent(hsa_agent_t *output_agent) { + // Find the first agent with a matching device type. + auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t { + hsa_device_type_t type; + hsa_status_t status = + hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type); + if (status != HSA_STATUS_SUCCESS) + return status; + + if (type == flag) { + // Ensure that a GPU agent supports kernel dispatch packets. + if (type == HSA_DEVICE_TYPE_GPU) { + hsa_agent_feature_t features; + status = + hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features); + if (status != HSA_STATUS_SUCCESS) + return status; + if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) + *output_agent = hsa_agent; + } else { + *output_agent = hsa_agent; + } + return HSA_STATUS_INFO_BREAK; + } + return HSA_STATUS_SUCCESS; + }; + + return iterate_agents(cb); +} + +/// Retrieve a global memory pool with a \p flag from the agent. +template +hsa_status_t get_agent_memory_pool(hsa_agent_t agent, + hsa_amd_memory_pool_t *output_pool) { + auto cb = [&](hsa_amd_memory_pool_t memory_pool) { + uint32_t flags; + hsa_amd_segment_t segment; + if (auto err = hsa_amd_memory_pool_get_info( + memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment)) + return err; + if (auto err = hsa_amd_memory_pool_get_info( + memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags)) + return err; + + if (segment != HSA_AMD_SEGMENT_GLOBAL) + return HSA_STATUS_SUCCESS; + + if (flags & flag) + *output_pool = memory_pool; + + return HSA_STATUS_SUCCESS; + }; + return iterate_agent_memory_pools(agent, cb); +} + +int load(int argc, char **argv, void *image, size_t size) { + // Initialize the HSA runtime used to communicate with the device. + if (hsa_status_t err = hsa_init()) + handle_error(err); + + // Register a callback when the device encounters a memory fault. + if (hsa_status_t err = hsa_amd_register_system_event_handler( + [](const hsa_amd_event_t *event, void *) -> hsa_status_t { + if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) + return HSA_STATUS_ERROR; + return HSA_STATUS_SUCCESS; + }, + nullptr)) + handle_error(err); + + // Obtain an agent for the device and host to use the HSA memory model. + hsa_agent_t dev_agent; + hsa_agent_t host_agent; + if (hsa_status_t err = get_agent(&dev_agent)) + handle_error(err); + if (hsa_status_t err = get_agent(&host_agent)) + handle_error(err); + + // Obtain a queue with the minimum (power of two) size, used to send commands + // to the HSA runtime and launch execution on the device. + uint64_t queue_size; + if (hsa_status_t err = hsa_agent_get_info( + dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size)) + handle_error(err); + hsa_queue_t *queue = nullptr; + if (hsa_status_t err = + hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_SINGLE, + nullptr, nullptr, UINT32_MAX, UINT32_MAX, &queue)) + handle_error(err); + + // Load the code object's ISA information and executable data segments. + hsa_code_object_t object; + if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object)) + handle_error(err); + + hsa_executable_t executable; + if (hsa_status_t err = hsa_executable_create_alt( + HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", + &executable)) + handle_error(err); + + if (hsa_status_t err = + hsa_executable_load_code_object(executable, dev_agent, object, "")) + handle_error(err); + + // No modifications to the executable are allowed after this point. + if (hsa_status_t err = hsa_executable_freeze(executable, "")) + handle_error(err); + + // Check the validity of the loaded executable. If the agents ISA features do + // not match the executable's code object it will fail here. + uint32_t result; + if (hsa_status_t err = hsa_executable_validate(executable, &result)) + handle_error(err); + if (result) + handle_error(HSA_STATUS_ERROR); + + // Obtain memory pools to exchange data between the host and the device. The + // fine-grained pool acts as pinned memory on the host for DMA transfers to + // the device, the coarse-grained pool is for allocations directly on the + // device, and the kernerl-argument pool is for executing the kernel. + hsa_amd_memory_pool_t kernargs_pool; + hsa_amd_memory_pool_t finegrained_pool; + hsa_amd_memory_pool_t coarsegrained_pool; + if (hsa_status_t err = + get_agent_memory_pool( + host_agent, &kernargs_pool)) + handle_error(err); + if (hsa_status_t err = + get_agent_memory_pool( + host_agent, &finegrained_pool)) + handle_error(err); + if (hsa_status_t err = + get_agent_memory_pool( + dev_agent, &coarsegrained_pool)) + handle_error(err); + + // Look up the '_start' kernel in the loaded executable. + hsa_executable_symbol_t symbol; + if (hsa_status_t err = hsa_executable_get_symbol_by_name( + executable, KERNEL_START, &dev_agent, &symbol)) + handle_error(err); + + // Retrieve different properties of the kernel symbol used for launch. + uint64_t kernel; + uint32_t args_size; + uint32_t group_size; + uint32_t private_size; + + std::pair symbol_infos[] = { + {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel}, + {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size}, + {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size}, + {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}}; + + for (auto &[info, value] : symbol_infos) + if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value)) + handle_error(err); + + // Allocate space for the kernel arguments on the host and allow the GPU agent + // to access it. + void *args; + if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size, + /*flags=*/0, &args)) + handle_error(err); + hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args); + + // Allocate fine-grained memory on the host to hold the pointer array for the + // copied argv and allow the GPU agent to access it. + void *dev_argv; + if (hsa_status_t err = + hsa_amd_memory_pool_allocate(finegrained_pool, argc * sizeof(char *), + /*flags=*/0, &dev_argv)) + handle_error(err); + hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_argv); + + // Copy each string in the argument vector to global memory on the device. + for (int i = 0; i < argc; ++i) { + size_t size = strlen(argv[i]) + 1; + void *dev_str; + if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size, + /*flags=*/0, &dev_str)) + handle_error(err); + hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_str); + // Load the host memory buffer with the pointer values of the newly + // allocated strings. + std::memcpy(dev_str, argv[i], size); + static_cast(dev_argv)[i] = dev_str; + } + + // Allocate space for the return pointer and initialize it to zero. + void *dev_ret; + if (hsa_status_t err = + hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int), + /*flags=*/0, &dev_ret)) + handle_error(err); + hsa_amd_memory_fill(dev_ret, 0, sizeof(int)); + + // Initialie all the arguments (explicit and implicit) to zero, then set the + // explicit arguments to the values created above. + std::memset(args, 0, args_size); + kernel_args_t *kernel_args = reinterpret_cast(args); + kernel_args->argc = argc; + kernel_args->argv = dev_argv; + kernel_args->ret = dev_ret; + + // Obtain a packet from the queue. + uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); + while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue_size) + ; + + const uint32_t mask = queue_size - 1; + hsa_kernel_dispatch_packet_t *packet = + (hsa_kernel_dispatch_packet_t *)queue->base_address + (packet_id & mask); + + // Set up the packet for exeuction on the device. We currently only launch + // with one thread on the device, forcing the rest of the wavefront to be + // masked off. + std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t)); + packet->setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + packet->workgroup_size_x = 1; + packet->workgroup_size_y = 1; + packet->workgroup_size_z = 1; + packet->grid_size_x = 1; + packet->grid_size_y = 1; + packet->grid_size_z = 1; + packet->private_segment_size = private_size; + packet->group_segment_size = group_size; + packet->kernel_object = kernel; + packet->kernarg_address = args; + + // Create a signal to indicate when this packet has been completed. + if (hsa_status_t err = + hsa_signal_create(1, 0, nullptr, &packet->completion_signal)) + handle_error(err); + + // Initialize the packet header and set the doorbell signal to begin execution + // by the HSA runtime. + uint16_t header = + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + __atomic_store_n(&packet->header, header | (packet->setup << 16), + __ATOMIC_RELEASE); + hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); + + // Wait until the kernel has completed execution on the device. + while (hsa_signal_wait_scacquire(packet->completion_signal, + HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, + HSA_WAIT_STATE_ACTIVE) != 0) + ; + + // Create a memory signal and copy the return value back from the device into + // a new buffer. + hsa_signal_t memory_signal; + if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal)) + handle_error(err); + + void *host_ret; + if (hsa_status_t err = + hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int), + /*flags=*/0, &host_ret)) + handle_error(err); + hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret); + + if (hsa_status_t err = + hsa_amd_memory_async_copy(host_ret, host_agent, dev_ret, dev_agent, + sizeof(int), 0, nullptr, memory_signal)) + handle_error(err); + + while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0, + UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) + ; + + // Save the return value and perform basic clean-up. + int ret = *static_cast(host_ret); + + if (hsa_status_t err = hsa_signal_destroy(memory_signal)) + handle_error(err); + + if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal)) + handle_error(err); + + if (hsa_status_t err = hsa_queue_destroy(queue)) + handle_error(err); + + if (hsa_status_t err = hsa_executable_destroy(executable)) + handle_error(err); + + if (hsa_status_t err = hsa_code_object_destroy(object)) + handle_error(err); + + if (hsa_status_t err = hsa_shut_down()) + handle_error(err); + + return ret; +}