diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -8,6 +8,13 @@
   message(STATUS "Skipping HSA loader for gpu target, no HSA was detected")
 endif()
 
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+  add_subdirectory(nvptx)
+else()
+  message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+endif()
+
 # Add a custom target to be used for testing.
 if(TARGET amdhsa_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
   add_custom_target(libc.utils.gpu.loader)
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -6,9 +6,46 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
+#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
+
+#include <cstring>
 #include <cstddef>
 
 /// Generic interface to load the \p image and launch execution of the _start
 /// kernel on the target device. Copies \p argc and \p argv to the device.
 /// Returns the final value of the `main` function on the device.
 int load(int argc, char **argv, char **envp, void *image, size_t size);
+
+/// Copy the system's argument vector to GPU memory allocated using \p alloc.
+template <typename Allocator>
+void *copy_argument_vector(int argc, char **argv, Allocator alloc) {
+  void *dev_argv = alloc(argc * sizeof(char *));
+  if (dev_argv == nullptr)
+    return nullptr;
+
+  for (int i = 0; i < argc; ++i) {
+    size_t size = strlen(argv[i]) + 1;
+    void *dev_str = alloc(size);
+    if (dev_str == nullptr)
+      return nullptr;
+
+    // Load the host memory buffer with the pointer values of the newly
+    // allocated strings.
+    std::memcpy(dev_str, argv[i], size);
+    static_cast<void **>(dev_argv)[i] = dev_str;
+  }
+  return dev_argv;
+}
+
+/// Copy the system's environment to GPU memory allocated using \p alloc.
+template <typename Allocator>
+void *copy_environment(char **envp, Allocator alloc) {
+  int envc = 0;
+  for (char **env = envp; *env != 0; ++env)
+    ++envc;
+
+  return copy_argument_vector(envc, envp, alloc);
+}
+
+#endif
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -75,6 +75,11 @@
   exit(EXIT_FAILURE);
 }
 
+static void handle_error(const char *msg) {
+  fprintf(stderr, "%s\n", msg);
+  exit(EXIT_FAILURE);
+}
+
 /// Generic interface for iterating using the HSA callbacks.
 template <typename elem_ty, typename func_ty, typename callback_ty>
 hsa_status_t iterate(func_ty func, callback_ty cb) {
@@ -279,50 +284,23 @@
 
   // Allocate fine-grained memory on the host to hold the pointer array for the
   // copied argv and allow the GPU agent to access it.
-  void *dev_argv;
-  if (hsa_status_t err =
-          hsa_amd_memory_pool_allocate(finegrained_pool, argc * sizeof(char *),
-                                       /*flags=*/0, &dev_argv))
-    handle_error(err);
-  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_argv);
-
-  // Copy each string in the argument vector to global memory on the device.
-  for (int i = 0; i < argc; ++i) {
-    size_t size = strlen(argv[i]) + 1;
-    void *dev_str;
+  auto allocator = [&](uint64_t size) -> void * {
+    void *dev_ptr = nullptr;
     if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
-                                                        /*flags=*/0, &dev_str))
+                                                        /*flags=*/0, &dev_ptr))
       handle_error(err);
-    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_str);
-    // Load the host memory buffer with the pointer values of the newly
-    // allocated strings.
-    std::memcpy(dev_str, argv[i], size);
-    static_cast<void **>(dev_argv)[i] = dev_str;
-  }
+    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
+    return dev_ptr;
+  };
+  void *dev_argv = copy_argument_vector(argc, argv, allocator);
+  if (!dev_argv)
+    handle_error("Failed to allocate device argv");
 
   // Allocate fine-grained memory on the host to hold the pointer array for the
   // copied environment array and allow the GPU agent to access it.
-  int envc = 0;
-  for (char **env = envp; *env != 0; ++env)
-    ++envc;
-  void *dev_envp;
-  if (hsa_status_t err =
-          hsa_amd_memory_pool_allocate(finegrained_pool, envc * sizeof(char *),
-                                       /*flags=*/0, &dev_envp))
-    handle_error(err);
-  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_envp);
-  for (int i = 0; i < envc; ++i) {
-    size_t size = strlen(envp[i]) + 1;
-    void *dev_str;
-    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
-                                                        /*flags=*/0, &dev_str))
-      handle_error(err);
-    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_str);
-    // Load the host memory buffer with the pointer values of the newly
-    // allocated strings.
-    std::memcpy(dev_str, envp[i], size);
-    static_cast<void **>(dev_envp)[i] = dev_str;
-  }
+  void *dev_envp = copy_environment(envp, allocator);
+  if (!dev_envp)
+    handle_error("Failed to allocate device environment");
 
   // Allocate space for the return pointer and initialize it to zero.
   void *dev_ret;
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(nvptx_loader Loader.cpp)
+add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
+
+target_include_directories(nvptx_loader PRIVATE ${LIBC_SOURCE_DIR})
+target_link_libraries(nvptx_loader
+  PRIVATE
+  gpu_loader
+  CUDA::cuda_driver
+)
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -0,0 +1,147 @@
+//===-- Loader Implementation for NVPTX devices --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple loader to run images supporting the NVPTX
+// architecture. The file launches the '_start' kernel, which should be
+// provided by the device application start code and will ultimately call the
+// 'main' function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Loader.h"
+
+#include "cuda.h"
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+/// The arguments to the '_start' kernel.
+struct kernel_args_t {
+  int argc;
+  void *argv;
+  void *envp;
+  void *ret;
+  void *inbox;
+  void *outbox;
+  void *buffer;
+};
+
+static void handle_error(CUresult err) {
+  if (err == CUDA_SUCCESS)
+    return;
+
+  const char *err_str = nullptr;
+  CUresult result = cuGetErrorString(err, &err_str);
+  if (result != CUDA_SUCCESS)
+    fprintf(stderr, "Unknown Error\n");
+  else
+    fprintf(stderr, "%s\n", err_str);
+  exit(1);
+}
+
+static void handle_error(const char *msg) {
+  fprintf(stderr, "%s\n", msg);
+  exit(EXIT_FAILURE);
+}
+
+int load(int argc, char **argv, char **envp, void *image, size_t size) {
+  if (CUresult err = cuInit(0))
+    handle_error(err);
+
+  // Obtain the first device found on the system.
+  CUdevice device;
+  if (CUresult err = cuDeviceGet(&device, 0))
+    handle_error(err);
+
+  // Initialize the CUDA context and claim it for this execution.
+  CUcontext context;
+  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
+    handle_error(err);
+  if (CUresult err = cuCtxSetCurrent(context))
+    handle_error(err);
+
+  // Initialize a non-blocking CUDA stream to execute the kernel.
+  CUstream stream;
+  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
+    handle_error(err);
+
+  // Load the image into a CUDA module.
+  CUmodule binary;
+  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
+    handle_error(err);
+
+  // Look up the '_start' kernel in the loaded module.
+  CUfunction function;
+  if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
+    handle_error(err);
+
+  // Allocate pinned memory on the host to hold the pointer array for the
+  // copied argv and allow the GPU device to access it.
+  auto allocator = [&](uint64_t size) -> void * {
+    void *dev_ptr;
+    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
+      handle_error(err);
+    return dev_ptr;
+  };
+  void *dev_argv = copy_argument_vector(argc, argv, allocator);
+  if (!dev_argv)
+    handle_error("Failed to allocate device argv");
+
+  // Allocate pinned memory on the host to hold the pointer array for the
+  // copied environment array and allow the GPU device to access it.
+  void *dev_envp = copy_environment(envp, allocator);
+  if (!dev_envp)
+    handle_error("Failed to allocate device environment");
+
+  // Allocate space for the return pointer and initialize it to zero.
+  CUdeviceptr dev_ret;
+  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
+    handle_error(err);
+  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
+    handle_error(err);
+
+  // Set up the arguments to the '_start' kernel on the GPU.
+  // TODO: Set up the RPC server implementation.
+  uint64_t args_size = sizeof(kernel_args_t);
+  kernel_args_t args;
+  std::memset(&args, 0, args_size);
+  args.argc = argc;
+  args.argv = dev_argv;
+  args.envp = dev_envp;
+  args.ret = reinterpret_cast<void *>(dev_ret);
+  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
+                         CU_LAUNCH_PARAM_END};
+
+  // Call the kernel with the given arguments.
+  if (CUresult err =
+          cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
+                         /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1,
+                         /*blockDimZ=*/1, 0, stream, nullptr, args_config))
+    handle_error(err);
+
+  // TODO: Query the RPC server periodically while the kernel is running.
+  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
+    ;
+
+  // Copy the return value back from the kernel and wait.
+  int host_ret = 0;
+  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
+    handle_error(err);
+
+  if (CUresult err = cuStreamSynchronize(stream))
+    handle_error(err);
+
+  // Destroy the context and the loaded binary.
+  if (CUresult err = cuModuleUnload(binary))
+    handle_error(err);
+  if (CUresult err = cuDevicePrimaryCtxRelease(device))
+    handle_error(err);
+  return host_ret;
+}
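
A minimal host-only sketch (not part of the patch) of how the shared Loader.h helpers are driven: any callable that returns writable memory for a requested size can serve as the allocator, which is what lets the AMD and NVPTX loaders reuse the same argv/envp copying logic. Plain malloc stands in here for hsa_amd_memory_pool_allocate or cuMemAllocHost, purely for illustration.

#include "Loader.h"

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main(int argc, char **argv, char **envp) {
  // The real loaders pass lambdas wrapping their runtime's pinned-memory
  // allocation; malloc is only a stand-in to exercise the templates.
  auto allocator = [](uint64_t size) -> void * { return std::malloc(size); };

  void *copied_argv = copy_argument_vector(argc, argv, allocator);
  void *copied_envp = copy_environment(envp, allocator);
  if (!copied_argv || !copied_envp)
    return EXIT_FAILURE;

  // The copied table holds pointers to freshly allocated copies of each
  // string, so the first entry should print the same text as argv[0].
  std::printf("%s\n", static_cast<char **>(copied_argv)[0]);
  return EXIT_SUCCESS;
}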