diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -8,6 +8,13 @@
   message(STATUS "Skipping HSA loader for gpu target, no HSA was detected")
 endif()
 
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+  add_subdirectory(nvptx)
+else()
+  message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+endif()
+
 # Add a custom target to be used for testing.
 if(TARGET amdhsa_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
   add_custom_target(libc.utils.gpu.loader)
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(nvptx_loader Loader.cpp)
+add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
+
+target_include_directories(nvptx_loader PRIVATE ${LIBC_SOURCE_DIR})
+target_link_libraries(nvptx_loader
+  PRIVATE
+  gpu_loader
+  CUDA::cuda_driver
+)
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -0,0 +1,159 @@
+//===-- Loader Implementation for NVPTX devices --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple loader to run images supporting the NVPTX
+// architecture. The file launches the '_start' kernel which should be provided
+// by the device application start code and will ultimately call the 'main'
+// function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "cuda.h"
+// NOTE(review): the three include targets were garbled in the patch; these
+// are the standard headers the code below requires (fprintf, exit, strlen,
+// memcpy/memset, uint64_t) — confirm against the original submission.
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+/// The arguments to the '_start' kernel. This layout must match what the
+/// device-side start code expects to unpack from the launch parameter buffer.
+struct kernel_args_t {
+  int argc;
+  void *argv;
+  void *envp;
+  void *ret;
+  void *inbox;
+  void *outbox;
+  void *buffer;
+};
+
+/// Print the CUDA driver's error string for \p err to stderr and exit the
+/// process; returns silently when \p err is CUDA_SUCCESS.
+static void handle_error(CUresult err) {
+  if (err == CUDA_SUCCESS)
+    return;
+
+  const char *err_str = nullptr;
+  CUresult result = cuGetErrorString(err, &err_str);
+  if (result != CUDA_SUCCESS)
+    fprintf(stderr, "Unknown Error\n");
+  else
+    fprintf(stderr, "%s\n", err_str);
+  exit(1);
+}
+
+/// Load \p image (a linked NVPTX executable of \p size bytes) onto the first
+/// CUDA device, copy \p argv / \p envp into device-accessible pinned memory,
+/// launch its '_start' kernel, and return the program's exit code.
+int load(int argc, char **argv, char **envp, void *image, size_t size) {
+  if (CUresult err = cuInit(0))
+    handle_error(err);
+
+  // Obtain the first device found on the system.
+  CUdevice device;
+  if (CUresult err = cuDeviceGet(&device, 0))
+    handle_error(err);
+
+  // Initialize the CUDA context and claim it for this execution.
+  CUcontext context;
+  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
+    handle_error(err);
+  if (CUresult err = cuCtxSetCurrent(context))
+    handle_error(err);
+
+  // Initialize a non-blocking CUDA stream to execute the kernel.
+  CUstream stream;
+  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
+    handle_error(err);
+
+  // Load the image into a CUDA module.
+  CUmodule binary;
+  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
+    handle_error(err);
+
+  // Look up the '_start' kernel in the loaded module.
+  CUfunction function;
+  if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
+    handle_error(err);
+
+  // Allocate pinned memory on the host to hold the pointer array for the
+  // copied argv and allow the GPU device to access it.
+  void *dev_argv;
+  if (CUresult err = cuMemAllocHost(&dev_argv, sizeof(char *) * argc))
+    handle_error(err);
+
+  // Copy each string in the argument vector to pinned memory on the host
+  // that the device can access.
+  for (int i = 0; i < argc; ++i) {
+    // 'str_size' avoids shadowing the function's 'size' parameter.
+    size_t str_size = strlen(argv[i]) + 1;
+    void *dev_str;
+    if (CUresult err = cuMemAllocHost(&dev_str, str_size))
+      handle_error(err);
+    // Load the host memory buffer with the pointer values of the newly
+    // allocated strings.
+    std::memcpy(dev_str, argv[i], str_size);
+    static_cast<void **>(dev_argv)[i] = dev_str;
+  }
+
+  // Allocate pinned memory on the host to hold the pointer array for the
+  // copied environment array and allow the GPU device to access it.
+  int envc = 0;
+  for (char **env = envp; *env != 0; ++env)
+    ++envc;
+  void *dev_envp;
+  if (CUresult err = cuMemAllocHost(&dev_envp, sizeof(char *) * envc))
+    handle_error(err);
+
+  for (int i = 0; i < envc; ++i) {
+    size_t str_size = strlen(envp[i]) + 1;
+    void *dev_str;
+    if (CUresult err = cuMemAllocHost(&dev_str, str_size))
+      handle_error(err);
+    // Load the host memory buffer with the pointer values of the newly
+    // allocated strings.
+    std::memcpy(dev_str, envp[i], str_size);
+    static_cast<void **>(dev_envp)[i] = dev_str;
+  }
+
+  // Allocate space for the return pointer and initialize it to zero.
+  CUdeviceptr dev_ret;
+  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
+    handle_error(err);
+  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
+    handle_error(err);
+
+  // Set up the arguments to the '_start' kernel on the GPU.
+  // TODO: Setup RPC server implementation;
+  uint64_t args_size = sizeof(kernel_args_t);
+  kernel_args_t args;
+  std::memset(&args, 0, args_size);
+  args.argc = argc;
+  args.argv = dev_argv;
+  // Pass the copied environment, not argv a second time.
+  args.envp = dev_envp;
+  args.ret = reinterpret_cast<void *>(dev_ret);
+  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
+                         CU_LAUNCH_PARAM_END};
+
+  // Call the kernel with the given arguments.
+  if (CUresult err =
+          cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
+                         /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1,
+                         /*blockDimZ=*/1, 0, stream, nullptr, args_config))
+    handle_error(err);
+
+  // TODO: Query the RPC server periodically while the kernel is running.
+  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
+    ;
+
+  // Copy the return value back from the kernel and wait.
+  int host_ret = 0;
+  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
+    handle_error(err);
+
+  if (CUresult err = cuStreamSynchronize(stream))
+    handle_error(err);
+
+  // Destroy the context and the loaded binary.
+  if (CUresult err = cuModuleUnload(binary))
+    handle_error(err);
+  if (CUresult err = cuDevicePrimaryCtxRelease(device))
+    handle_error(err);
+  return host_ret;
+}