diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -35,9 +35,11 @@
   NOOP = 0,
   PRINT_TO_STDERR = 1,
   EXIT = 2,
-  TEST_INCREMENT = 3,
-  TEST_INTERFACE = 4,
-  TEST_STREAM = 5,
+  MALLOC = 3,
+  FREE = 4,
+  TEST_INCREMENT = 5,
+  TEST_INTERFACE = 6,
+  TEST_STREAM = 7,
 };
 
 /// A fixed size channel used to communicate between the RPC client and server.
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -287,10 +287,23 @@
     DEPENDS
       ${SCUDO_DEPS}
   )
+elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_entrypoint_external(
+    calloc
+  )
+  add_entrypoint_external(
+    realloc
+  )
+  add_entrypoint_external(
+    aligned_alloc
+  )
 else()
   add_entrypoint_external(
     malloc
   )
+  add_entrypoint_external(
+    free
+  )
   add_entrypoint_external(
     calloc
   )
@@ -300,9 +313,6 @@
   add_entrypoint_external(
     aligned_alloc
   )
-  add_entrypoint_external(
-    free
-  )
 endif()
 
 if(NOT LLVM_LIBC_FULL_BUILD)
@@ -356,3 +366,19 @@
   DEPENDS
     .${LIBC_TARGET_OS}.abort
 )
+
+if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_entrypoint_object(
+    malloc
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.malloc
+  )
+
+  add_entrypoint_object(
+    free
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.free
+  )
+endif()
diff --git a/libc/src/stdlib/free.h b/libc/src/stdlib/free.h
new file mode 100644
--- /dev/null
+++ b/libc/src/stdlib/free.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for free --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdlib.h>
+
+#ifndef LLVM_LIBC_SRC_STDLIB_FREE_H
+#define LLVM_LIBC_SRC_STDLIB_FREE_H
+
+namespace __llvm_libc {
+
+void free(void *ptr);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDLIB_FREE_H
diff --git a/libc/src/stdlib/gpu/CMakeLists.txt b/libc/src/stdlib/gpu/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/src/stdlib/gpu/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_entrypoint_object(
+  malloc
+  SRCS
+    malloc.cpp
+  HDRS
+    ../malloc.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.RPC.rpc_client
+)
+
+add_entrypoint_object(
+  free
+  SRCS
+    free.cpp
+  HDRS
+    ../free.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.RPC.rpc_client
+)
diff --git a/libc/src/stdlib/gpu/free.cpp b/libc/src/stdlib/gpu/free.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/stdlib/gpu/free.cpp
@@ -0,0 +1,23 @@
+//===-- GPU Implementation of free ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/free.h"
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(void, free, (void *ptr)) {
+  rpc::Client::Port port = rpc::client.open<rpc::Opcode::FREE>();
+  port.send([=](rpc::Buffer *buffer) {
+    buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
+  });
+  port.close();
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/stdlib/gpu/malloc.cpp b/libc/src/stdlib/gpu/malloc.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/stdlib/gpu/malloc.cpp
@@ -0,0 +1,26 @@
+//===-- GPU Implementation of malloc --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/malloc.h"
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(void *, malloc, (size_t size)) {
+  void *ptr = nullptr;
+  rpc::Client::Port port = rpc::client.open<rpc::Opcode::MALLOC>();
+  port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = size; },
+                     [&](rpc::Buffer *buffer) {
+                       ptr = reinterpret_cast<void *>(buffer->data[0]);
+                     });
+  port.close();
+  return ptr;
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/stdlib/malloc.h b/libc/src/stdlib/malloc.h
new file mode 100644
--- /dev/null
+++ b/libc/src/stdlib/malloc.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for malloc ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+
+#ifndef LLVM_LIBC_SRC_STDLIB_MALLOC_H
+#define LLVM_LIBC_SRC_STDLIB_MALLOC_H
+
+namespace __llvm_libc {
+
+void *malloc(size_t size);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDLIB_MALLOC_H
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -320,4 +320,19 @@
       libc.src.signal.raise
   )
 
+  # Only the GPU has an in-tree 'malloc' implementation.
+  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+    add_libc_test(
+      malloc_test
+      HERMETIC_TEST_ONLY
+      SUITE
+        libc-stdlib-tests
+      SRCS
+        malloc_test.cpp
+      DEPENDS
+        libc.include.stdlib
+        libc.src.stdlib.malloc
+        libc.src.stdlib.free
+    )
+  endif()
 endif()
diff --git a/libc/test/src/stdlib/malloc_test.cpp b/libc/test/src/stdlib/malloc_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/src/stdlib/malloc_test.cpp
@@ -0,0 +1,19 @@
+//===-- Unittests for malloc ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcMallocTest, Allocate) {
+  int *ptr = reinterpret_cast<int *>(__llvm_libc::malloc(sizeof(int)));
+  ASSERT_NE(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
+  *ptr = 1;
+  EXPECT_EQ(*ptr, 1);
+  __llvm_libc::free(ptr);
+}
diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -14,10 +14,17 @@
 find_package(CUDAToolkit QUIET)
 # The CUDA loader requires LLVM to traverse the ELF image for symbols.
 find_package(LLVM QUIET)
-if(CUDAToolkit_FOUND AND LLVM_FOUND)
+if(CUDAToolkit_FOUND AND LLVM_FOUND AND
+   "${CUDAToolkit_VERSION}" VERSION_GREATER_EQUAL "11.2")
   add_subdirectory(nvptx)
 else()
-  message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+  if(CUDAToolkit_FOUND AND "${CUDAToolkit_VERSION}" VERSION_LESS "11.2")
+    message(WARNING
+      "Skipping CUDA loader for gpu target, CUDA must be version 11.2 or later.
+      Found CUDA Version ${CUDAToolkit_VERSION}")
+  else()
+    message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+  endif()
 endif()
 
 # Add a custom target to be used for testing.
diff --git a/libc/utils/gpu/loader/Server.h b/libc/utils/gpu/loader/Server.h
--- a/libc/utils/gpu/loader/Server.h
+++ b/libc/utils/gpu/loader/Server.h
@@ -21,7 +21,8 @@
 
 /// Queries the RPC client at least once and performs server-side work if there
 /// are any active requests.
-void handle_server() {
+template <typename Alloc, typename Dealloc>
+void handle_server(Alloc allocator, Dealloc deallocator) {
   using namespace __llvm_libc;
 
   // Continue servicing the client until there is no work left and we return.
@@ -50,6 +51,19 @@
       });
       break;
     }
+    case rpc::Opcode::MALLOC: {
+      port->recv_and_send([&](rpc::Buffer *buffer) {
+        buffer->data[0] =
+            reinterpret_cast<uintptr_t>(allocator(buffer->data[0]));
+      });
+      break;
+    }
+    case rpc::Opcode::FREE: {
+      port->recv([&](rpc::Buffer *buffer) {
+        deallocator(reinterpret_cast<void *>(buffer->data[0]));
+      });
+      break;
+    }
     case rpc::Opcode::TEST_INCREMENT: {
       port->recv_and_send([](rpc::Buffer *buffer) {
         reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -134,6 +134,7 @@
 template <typename args_t>
 hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                            hsa_amd_memory_pool_t kernargs_pool,
+                           hsa_amd_memory_pool_t coarsegrained_pool,
                            hsa_queue_t *queue, const LaunchParameters &params,
                            const char *kernel_name, args_t kernel_args) {
   // Look up the '_start' kernel in the loaded executable.
@@ -142,6 +143,21 @@
           executable, kernel_name, &dev_agent, &symbol))
     return err;
 
+  auto allocator = [&](uint64_t size) -> void * {
+    void *dev_ptr = nullptr;
+    if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+            coarsegrained_pool, size, /*flags=*/0, &dev_ptr))
+      handle_error(err);
+    if (hsa_status_t err =
+            hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr))
+      handle_error(err);
+    return dev_ptr;
+  };
+  auto deallocator = [](void *ptr) -> void {
+    if (hsa_status_t err = hsa_amd_memory_pool_free(ptr))
+      handle_error(err);
+  };
+
   // Retrieve different properties of the kernel symbol used for launch.
   uint64_t kernel;
   uint32_t args_size;
@@ -219,11 +235,11 @@
   while (hsa_signal_wait_scacquire(
              packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
              /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
-    handle_server();
+    handle_server(allocator, deallocator);
 
   // Handle the server one more time in case the kernel exited with a pending
   // send still in flight.
-  handle_server();
+  handle_server(allocator, deallocator);
 
   // Destroy the resources acquired to launch the kernel and return.
   if (hsa_status_t err = hsa_amd_memory_pool_free(args))
@@ -366,14 +382,15 @@
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
   begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};
-  if (hsa_status_t err =
-          launch_kernel(dev_agent, executable, kernargs_pool, queue,
-                        single_threaded_params, "_begin.kd", init_args))
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          single_threaded_params, "_begin.kd", init_args))
     handle_error(err);
 
   start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
-  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
-                                       queue, params, "_start.kd", args))
+  if (hsa_status_t err =
+          launch_kernel(dev_agent, executable, kernargs_pool,
+                        coarsegrained_pool, queue, params, "_start.kd", args))
     handle_error(err);
 
   // Create a memory signal and copy the return value back from the device into
@@ -402,9 +419,9 @@
   int ret = *static_cast<int *>(host_ret);
 
   end_args_t fini_args = {ret};
-  if (hsa_status_t err =
-          launch_kernel(dev_agent, executable, kernargs_pool, queue,
-                        single_threaded_params, "_end.kd", fini_args))
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          single_threaded_params, "_end.kd", fini_args))
     handle_error(err);
 
   // Free the memory allocated for the device.
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -174,6 +174,29 @@
                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                          CU_LAUNCH_PARAM_END};
 
+  // Initialize a non-blocking CUDA stream to allocate memory if needed. This
+  // needs to be done on a separate stream or else it will deadlock with the
+  // executing kernel.
+  CUstream memory_stream;
+  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
+    handle_error(err);
+
+  auto allocator = [&](uint64_t size) -> void * {
+    CUdeviceptr dev_ptr;
+    if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
+      handle_error(err);
+
+    // Wait until the memory allocation is complete.
+    while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
+      ;
+    return reinterpret_cast<void *>(dev_ptr);
+  };
+  auto deallocator = [&](void *ptr) -> void {
+    if (CUresult err =
+            cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), memory_stream))
+      handle_error(err);
+  };
+
   // Call the kernel with the given arguments.
   if (CUresult err = cuLaunchKernel(
           function, params.num_blocks_x, params.num_blocks_y,
@@ -184,11 +207,15 @@
   // Wait until the kernel has completed execution on the device. Periodically
   // check the RPC client for work to be performed on the server.
   while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
-    handle_server();
+    handle_server(allocator, deallocator);
 
   // Handle the server one more time in case the kernel exited with a pending
   // send still in flight.
-  handle_server();
+  handle_server(allocator, deallocator);
+
+  // Release the stream created above so repeated kernel launches do not leak it.
+  if (CUresult err = cuStreamDestroy(memory_stream))
+    handle_error(err);
 
   return CUDA_SUCCESS;
 }