diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -92,7 +92,7 @@ if(LIBC_TARGET_ARCHITECTURE_IS_GPU) set(LIBC_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm) - set(LIBC_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/lib) + set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) elseif(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG) set(LIBC_INCLUDE_DIR ${LLVM_BINARY_DIR}/include/${LLVM_DEFAULT_TARGET_TRIPLE}) set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}) diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -519,17 +519,25 @@ } /// Wait until the signal gets a zero value. - Error wait(const uint64_t ActiveTimeout = 0) const { - if (ActiveTimeout) { + Error wait(const uint64_t ActiveTimeout = 0, + RPCHandleTy *RPCHandle = nullptr) const { + if (ActiveTimeout && !RPCHandle) { hsa_signal_value_t Got = 1; Got = hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, ActiveTimeout, HSA_WAIT_STATE_ACTIVE); if (Got == 0) return Plugin::success(); } + + // If there is an RPC device attached to this stream we run it as a server. + uint64_t Timeout = RPCHandle ? 8192 : UINT64_MAX; + auto WaitState = RPCHandle ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED; while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, - UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) - ; + Timeout, WaitState) != 0) { + if (RPCHandle) + if (auto Err = RPCHandle->runServer()) + return Err; + } return Plugin::success(); } @@ -895,6 +903,11 @@ /// operation that was already finalized in a previous stream sycnhronize. uint32_t SyncCycle; + /// A pointer associated with an RPC server running on the given device. 
If + /// RPC is not being used this will be a null pointer. Otherwise, this + /// indicates that an RPC server is expected to be run on this stream. + RPCHandleTy *RPCHandle; + /// Mutex to protect stream's management. mutable std::mutex Mutex; @@ -1050,6 +1063,9 @@ /// Deinitialize the stream's signals. Error deinit() { return Plugin::success(); } + /// Attach an RPC handle to this stream. + void setRPCHandle(RPCHandleTy *Handle) { RPCHandle = Handle; } + /// Push a asynchronous kernel to the stream. The kernel arguments must be /// placed in a special allocation for kernel args and must keep alive until /// the kernel finalizes. Once the kernel is finished, the stream will release @@ -1264,7 +1280,8 @@ return Plugin::success(); // Wait until all previous operations on the stream have completed. - if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds)) + if (auto Err = + Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, RPCHandle)) return Err; // Reset the stream and perform all pending post actions. @@ -1786,6 +1803,12 @@ /// AMDGPU devices do not have the concept of contexts. Error setContext() override { return Plugin::success(); } + /// We want to set up the RPC server for host services to the GPU if it is + /// availible. + bool shouldSetupRPCServer() const override { + return libomptargetSupportsRPC(); + } + /// Get the stream of the asynchronous info sructure or get a new one. AMDGPUStreamTy &getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) { AMDGPUStreamTy *&Stream = AsyncInfoWrapper.getQueueAs(); @@ -2507,7 +2530,7 @@ : Agent(Device.getAgent()), Queue(Device.getNextQueue()), SignalManager(Device.getSignalManager()), // Initialize the std::deque with some empty positions. 
- Slots(32), NextSlot(0), SyncCycle(0), + Slots(32), NextSlot(0), SyncCycle(0), RPCHandle(nullptr), StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {} /// Class implementing the AMDGPU-specific functionalities of the global @@ -2837,6 +2860,10 @@ AMDGPUDeviceTy &AMDGPUDevice = static_cast(GenericDevice); AMDGPUStreamTy &Stream = AMDGPUDevice.getStream(AsyncInfoWrapper); + // If this kernel requires an RPC server we attach its pointer to the stream. + if (GenericDevice.getRPCHandle()) + Stream.setRPCHandle(GenericDevice.getRPCHandle()); + // Push the kernel launch into the stream. return Stream.pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, GroupSize, ArgsMemoryManager); diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt @@ -13,7 +13,7 @@ # NOTE: Don't try to build `PluginInterface` using `add_llvm_library` because we # don't want to export `PluginInterface` while `add_llvm_library` requires that. add_library(PluginInterface OBJECT - PluginInterface.cpp GlobalHandler.cpp JIT.cpp) + PluginInterface.cpp GlobalHandler.cpp JIT.cpp RPC.cpp) # Only enable JIT for those targets that LLVM can support. string(TOUPPER "${LLVM_TARGETS_TO_BUILD}" TargetsSupported) @@ -62,6 +62,25 @@ MemoryManager ) +# Include the RPC server from the `libc` project if availible. 
+set(libomptarget_supports_rpc FALSE) +if(TARGET llvmlibc_rpc_server) + target_link_libraries(PluginInterface PRIVATE llvmlibc_rpc_server) + target_compile_definitions(PluginInterface PRIVATE LIBOMPTARGET_RPC_SUPPORT) + set(libomptarget_supports_rpc TRUE) +else() + find_library(llvmlibc_rpc_server NAMES llvmlibc_rpc_server + PATHS ${LIBOMPTARGET_LLVM_LIBRARY_DIR} NO_DEFAULT_PATH) + if(llvmlibc_rpc_server) + message(STATUS "Found the libc RPC server library at ${llvmlibc_rpc_server}") + target_link_libraries(PluginInterface PRIVATE llvmlibc_rpc_server) + target_compile_definitions(PluginInterface PRIVATE LIBOMPTARGET_RPC_SUPPORT) + set(libomptarget_supports_rpc TRUE) + endif() +endif() +set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${libomptarget_supports_rpc} CACHE BOOL + "Libomptarget support for the GPU libc") + if ((OMPT_TARGET_DEFAULT) AND (LIBOMPTARGET_OMPT_SUPPORT)) target_link_libraries(PluginInterface PUBLIC OMPT) endif() diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -24,6 +24,7 @@ #include "GlobalHandler.h" #include "JIT.h" #include "MemoryManager.h" +#include "RPC.h" #include "Utilities.h" #include "omptarget.h" @@ -600,6 +601,11 @@ /// this behavior by overriding the shouldSetupDeviceEnvironment function. Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image); + // Setup the RPC server for this device if needed. This may not run on some + // plugins like the CPU targets. By default, it will not be executed so it is + // up to the target to override this using the shouldSetupRPCServer function. + Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image); + /// Register the offload entries for a specific image on the device.
Error registerOffloadEntries(DeviceImageTy &Image); @@ -751,6 +757,9 @@ return OMPX_MinThreadsForLowTripCount; } + /// Get the RPC server running on this device. + RPCHandleTy *getRPCHandle() const { return RPCHandle; } + private: /// Register offload entry for global variable. Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, @@ -780,6 +789,10 @@ /// setupDeviceEnvironment() function. virtual bool shouldSetupDeviceEnvironment() const { return true; } + /// Indicate whether or not the device should setup the RPC server. This is + /// only necessary for unhosted targets like the GPU. + virtual bool shouldSetupRPCServer() const { return false; } + /// Pointer to the memory manager or nullptr if not available. MemoryManagerTy *MemoryManager; @@ -837,6 +850,10 @@ /// Map of host pinned allocations used for optimize device transfers. PinnedAllocationMapTy PinnedAllocs; + + /// A pointer to an RPC server instance attached to this device if present. + /// This is used to run the RPC server during task synchronization. + RPCHandleTy *RPCHandle; }; /// Class implementing common functionalities of offload plugins. Each plugin @@ -892,6 +909,12 @@ /// plugin. JITEngine &getJIT() { return JIT; } + /// Get a reference to the RPC server used to provide host services. + RPCServerTy &getRPCServer() { + assert(RPCServer && "RPC server not initialized"); + return *RPCServer; + } + /// Get the OpenMP requires flags set for this plugin. int64_t getRequiresFlags() const { return RequiresFlags; } @@ -946,6 +969,9 @@ /// The JIT engine shared by all devices connected to this plugin. JITEngine JIT; + + /// The interface between the plugin and the GPU for host services. + RPCServerTy *RPCServer; }; /// Class for simplifying the getter operation of the plugin. Anywhere on the @@ -1209,6 +1235,9 @@ std::deque ResourcePool; }; +/// A static check on whether or not we support RPC in libomptarget. 
+const bool libomptargetSupportsRPC(); + } // namespace plugin } // namespace target } // namespace omp diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -396,7 +396,7 @@ OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 32), DeviceId(DeviceId), GridValues(OMPGridValues), PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(), - PinnedAllocs(*this) {} + PinnedAllocs(*this), RPCHandle(nullptr) {} Error GenericDeviceTy::init(GenericPluginTy &Plugin) { if (auto Err = initImpl(Plugin)) @@ -453,6 +453,10 @@ if (RecordReplay.isRecordingOrReplaying()) RecordReplay.deinit(); + if (RPCHandle) + if (auto Err = RPCHandle->deinitDevice()) + return std::move(Err); + return deinitImpl(); } @@ -493,6 +497,9 @@ if (auto Err = registerOffloadEntries(*Image)) return std::move(Err); + if (auto Err = setupRPCServer(Plugin, *Image)) + return std::move(Err); + // Return the pointer to the table of entries. return Image->getOffloadEntryTable(); } @@ -525,6 +532,33 @@ return Plugin::success(); } +Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin, + DeviceImageTy &Image) { + // The plugin either does not need an RPC server or it is unavailible. + if (!shouldSetupRPCServer()) + return Plugin::success(); + + // Check if this device needs to run an RPC server. 
+ RPCServerTy &Server = Plugin.getRPCServer(); + auto UsingOrErr = + Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image); + if (!UsingOrErr) + return UsingOrErr.takeError(); + + if (!UsingOrErr.get()) + return Plugin::success(); + + if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image)) + return std::move(Err); + + auto DeviceOrErr = Server.getDevice(*this); + if (!DeviceOrErr) + return DeviceOrErr.takeError(); + RPCHandle = *DeviceOrErr; + DP("Running an RPC server on device %d\n", getDeviceId()); + return Plugin::success(); +} + Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) { const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin; const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd; @@ -1088,6 +1122,9 @@ GlobalHandler = Plugin::createGlobalHandler(); assert(GlobalHandler && "Invalid global handler"); + RPCServer = new RPCServerTy(NumDevices); + assert(RPCServer && "Invalid RPC server"); + return Plugin::success(); } @@ -1105,6 +1142,9 @@ assert(!Devices[DeviceId] && "Device was not deinitialized"); } + if (RPCServer) + delete RPCServer; + // Perform last deinitializations on the plugin. return deinitImpl(); } @@ -1139,6 +1179,14 @@ return Plugin::success(); } +const bool llvm::omp::target::plugin::libomptargetSupportsRPC() { +#ifdef LIBOMPTARGET_RPC_SUPPORT + return true; +#else + return false; +#endif +} + /// Exposed library API function, basically wrappers around the GenericDeviceTy /// functionality with the same name. All non-async functions are redirected /// to the async versions right away with a NULL AsyncInfoPtr. 
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h @@ -0,0 +1,87 @@ +//===- RPC.h - Interface for remote procedure calls from the GPU ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides the interface to support remote procedure calls (RPC) from +// the GPU. This is required to implement host services like printf or malloc. +// The interface to the RPC server is provided by the 'libc' project in LLVM. +// For more information visit https://libc.llvm.org/gpu/. +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H + +#include "llvm/Support/Error.h" + +#include + +namespace llvm::omp::target { +namespace plugin { +struct GenericDeviceTy; +struct GenericGlobalHandlerTy; +class DeviceImageTy; +} // namespace plugin + +/// A generic class implementing the interface between the RPC server provided +/// by the 'libc' project and 'libomptarget'. If the RPC server is not availible +/// these routines will perform no action. +struct RPCServerTy { +public: + /// A wrapper around a single instance of the RPC server for a given device. + /// This is provided to simplify ownership of the underlying device. 
+ struct RPCHandleTy { + RPCHandleTy(RPCServerTy &Server, plugin::GenericDeviceTy &Device) + : Server(Server), Device(Device) {} + + llvm::Error runServer() { return Server.runServer(Device); } + + llvm::Error deinitDevice() { return Server.deinitDevice(Device); } + + private: + RPCServerTy &Server; + plugin::GenericDeviceTy &Device; + }; + + RPCServerTy(uint32_t NumDevices); + + /// Check if this device image is using an RPC server. This checks for the + /// presence of an externally visible symbol in the device image that will + /// be present whenever RPC code is called. + llvm::Expected<bool> isDeviceUsingRPC(plugin::GenericDeviceTy &Device, + plugin::GenericGlobalHandlerTy &Handler, + plugin::DeviceImageTy &Image); + + /// Initialize the RPC server for the given device. This will allocate host + /// memory for the internal server and copy the data to the client on the + /// device. The device must be loaded before this is valid. + llvm::Error initDevice(plugin::GenericDeviceTy &Device, + plugin::GenericGlobalHandlerTy &Handler, + plugin::DeviceImageTy &Image); + + /// Gets a reference to this server for a specific device. + llvm::Expected<RPCHandleTy *> getDevice(plugin::GenericDeviceTy &Device); + + /// Runs the RPC server associated with the \p Device until the pending work + /// is cleared. + llvm::Error runServer(plugin::GenericDeviceTy &Device); + + /// Deinitialize the RPC server for the given device.
This will free the + /// memory associated with the server for this device. + llvm::Error deinitDevice(plugin::GenericDeviceTy &Device); + + ~RPCServerTy(); + +private: + llvm::SmallVector<std::unique_ptr<RPCHandleTy>> Handles; +}; + +using RPCHandleTy = RPCServerTy::RPCHandleTy; + +} // namespace llvm::omp::target + +#endif diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp @@ -0,0 +1,173 @@ +//===- RPC.cpp - Interface for remote procedure calls from the GPU --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RPC.h" +#include "Debug.h" +#include "PluginInterface.h" + +// This header file may be present in-tree or from an LLVM installation. The +// installed version lives alongside the GPU headers so we do not want to +// include it directly. +#if __has_include(<gpu-none-llvm/rpc_server.h>) +#include <gpu-none-llvm/rpc_server.h> +#elif defined(LIBOMPTARGET_RPC_SUPPORT) +#include <rpc_server.h> +#endif + +using namespace llvm; +using namespace omp; +using namespace target; + +RPCServerTy::RPCServerTy(uint32_t NumDevices) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + // If this fails then something is catastrophically wrong, just exit.
+ if (rpc_status_t Err = rpc_init(NumDevices)) + FATAL_MESSAGE(1, "Error initializing the RPC server: %d\n", Err); + Handles.resize(NumDevices); +#endif +} + +llvm::Expected +RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device, + plugin::GenericGlobalHandlerTy &Handler, + plugin::DeviceImageTy &Image) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + void *ClientPtr; + plugin::GlobalTy Global(rpc_client_symbol_name, sizeof(void *), &ClientPtr); + if (auto Err = Handler.readGlobalFromImage(Device, Image, Global)) { + llvm::consumeError(std::move(Err)); + return false; + } + + return true; +#else + return false; +#endif +} + +Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, + plugin::GenericGlobalHandlerTy &Handler, + plugin::DeviceImageTy &Image) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + uint32_t DeviceId = Device.getDeviceId(); + auto Alloc = [](uint64_t Size, void *Data) { + plugin::GenericDeviceTy &Device = + *reinterpret_cast(Data); + return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST); + }; + // TODO: Allow the device to declare its requested port count. + if (rpc_status_t Err = rpc_server_init(DeviceId, RPC_MAXIMUM_PORT_COUNT, + Device.getWarpSize(), Alloc, &Device)) + return plugin::Plugin::error( + "Failed to initialize RPC server for device %d: %d", DeviceId, Err); + + // Register a custom opcode handler to perform plugin specific allocation. + // FIXME: We need to make sure this uses asynchronous allocations on CUDA. 
+ auto MallocHandler = [](rpc_port_t Port, void *Data) { + rpc_recv_and_send( + Port, + [](rpc_buffer_t *Buffer, void *Data) { + plugin::GenericDeviceTy &Device = + *reinterpret_cast(Data); + Buffer->data[0] = reinterpret_cast( + Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE)); + }, + Data); + }; + if (rpc_status_t Err = + rpc_register_callback(DeviceId, RPC_MALLOC, MallocHandler, &Device)) + return plugin::Plugin::error( + "Failed to register RPC malloc handler for device %d: %d\n", DeviceId, + Err); + + // Register a custom opcode handler to perform plugin specific deallocation. + auto FreeHandler = [](rpc_port_t Port, void *Data) { + rpc_recv( + Port, + [](rpc_buffer_t *Buffer, void *Data) { + plugin::GenericDeviceTy &Device = + *reinterpret_cast(Data); + Device.free(reinterpret_cast(Buffer->data[0]), + TARGET_ALLOC_DEVICE); + }, + Data); + }; + if (rpc_status_t Err = + rpc_register_callback(DeviceId, RPC_FREE, FreeHandler, &Device)) + return plugin::Plugin::error( + "Failed to register RPC free handler for device %d: %d\n", DeviceId, + Err); + + // Get the address of the RPC client from the device. 
+ void *ClientPtr; + plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *)); + if (auto Err = + Handler.getGlobalMetadataFromDevice(Device, Image, ClientGlobal)) + return Err; + + if (auto Err = Device.dataRetrieve(&ClientPtr, ClientGlobal.getPtr(), + sizeof(void *), nullptr)) + return Err; + + const void *ClientBuffer = rpc_get_client_buffer(DeviceId); + if (auto Err = Device.dataSubmit(ClientPtr, ClientBuffer, + rpc_get_client_size(), nullptr)) + return Err; + + Handles[DeviceId] = std::make_unique(*this, Device); +#endif + return Error::success(); +} + +llvm::Expected +RPCServerTy::getDevice(plugin::GenericDeviceTy &Device) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + uint32_t DeviceId = Device.getDeviceId(); + if (!Handles[DeviceId] || !rpc_get_buffer(DeviceId) || + !rpc_get_client_buffer(DeviceId)) + return plugin::Plugin::error( + "Attempt to get an RPC device while not initialized"); + + return Handles[DeviceId].get(); +#else + return plugin::Plugin::error( + "Attempt to get an RPC device while not available"); +#endif +} + +Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + if (rpc_status_t Err = rpc_handle_server(Device.getDeviceId())) + return plugin::Plugin::error( + "Error while running RPC server on device %d: %d", Device.getDeviceId(), + Err); +#endif + return Error::success(); +} + +Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + auto Dealloc = [](void *Ptr, void *Data) { + plugin::GenericDeviceTy &Device = + *reinterpret_cast(Data); + Device.free(Ptr, TARGET_ALLOC_HOST); + }; + if (rpc_status_t Err = + rpc_server_shutdown(Device.getDeviceId(), Dealloc, &Device)) + return plugin::Plugin::error( + "Failed to shut down RPC server for device %d: %d", + Device.getDeviceId(), Err); +#endif + return Error::success(); +} + +RPCServerTy::~RPCServerTy() { +#ifdef LIBOMPTARGET_RPC_SUPPORT + rpc_shutdown(); +#endif +} diff --git 
a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -366,6 +366,12 @@ return Plugin::check(Res, "Error in cuCtxSetCurrent: %s"); } + /// We want to set up the RPC server for host services to the GPU if it is + /// available. + bool shouldSetupRPCServer() const override { + return libomptargetSupportsRPC(); + } + /// Get the stream of the asynchronous info sructure or get a new one. CUstream getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) { CUstream &Stream = AsyncInfoWrapper.getQueueAs<CUstream>(); @@ -464,7 +470,18 @@ /// Synchronize current thread with the pending operations on the async info. Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue); - CUresult Res = cuStreamSynchronize(Stream); + CUresult Res; + // If we have an RPC server running on this device we will continuously + // query it for work rather than blocking. + if (!getRPCHandle()) { + Res = cuStreamSynchronize(Stream); + } else { + do { + Res = cuStreamQuery(Stream); + if (auto Err = getRPCHandle()->runServer()) + return Err; + } while (Res == CUDA_ERROR_NOT_READY); + } // Once the stream is synchronized, return it to stream pool and reset // AsyncInfo.
This is to make sure the synchronization only works for its diff --git a/openmp/libomptarget/test/CMakeLists.txt b/openmp/libomptarget/test/CMakeLists.txt --- a/openmp/libomptarget/test/CMakeLists.txt +++ b/openmp/libomptarget/test/CMakeLists.txt @@ -19,6 +19,7 @@ string(REGEX MATCHALL "([^\ ]+\ |[^\ ]+$)" SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}") foreach(CURRENT_TARGET IN LISTS SYSTEM_TARGETS) string(STRIP "${CURRENT_TARGET}" CURRENT_TARGET) + add_openmp_testsuite(check-libomptarget-${CURRENT_TARGET} "Running libomptarget tests" ${CMAKE_CURRENT_BINARY_DIR}/${CURRENT_TARGET} diff --git a/openmp/libomptarget/test/libc/malloc.c b/openmp/libomptarget/test/libc/malloc.c new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/libc/malloc.c @@ -0,0 +1,33 @@ +// RUN: %libomptarget-compile-run-and-check-generic + +// REQUIRES: libc + +// TODO: This requires async malloc on CUDA which is an 11.2 feature. +// UNSUPPORTED: nvptx64-nvidia-cuda +// UNSUPPORTED: nvptx64-nvidia-cuda-LTO + +#include +#include + +#pragma omp declare target to(malloc) +#pragma omp declare target to(free) + +int main() { + unsigned h_x; + unsigned *d_x; +#pragma omp target map(from : d_x) + { + d_x = malloc(sizeof(unsigned)); + *d_x = 1; + } + +#pragma omp target is_device_ptr(d_x) map(from : h_x) + { h_x = *d_x; } + +#pragma omp target is_device_ptr(d_x) + { free(d_x); } + + // CHECK: PASS + if (h_x == 1) + fputs("PASS\n", stdout); +} diff --git a/openmp/libomptarget/test/libc/puts.c b/openmp/libomptarget/test/libc/puts.c new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/libc/puts.c @@ -0,0 +1,35 @@ +// RUN: %libomptarget-compile-run-and-check-generic + +// REQUIRES: libc + +#include + +#pragma omp declare target to(stdout) + +int main() { +// CHECK: PASS +#pragma omp target + { fputs("PASS\n", stdout); } + +// CHECK: PASS +#pragma omp target nowait + { fputs("PASS\n", stdout); } + +// CHECK: PASS +#pragma omp target nowait + { fputs("PASS\n", stdout); } + +#pragma 
omp taskwait + +// CHECK: PASS +// CHECK: PASS +// CHECK: PASS +// CHECK: PASS +// CHECK: PASS +// CHECK: PASS +// CHECK: PASS +// CHECK: PASS +#pragma omp target teams num_teams(4) +#pragma omp parallel num_threads(2) + { fputs("PASS\n", stdout); } +} diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -96,6 +96,9 @@ config.available_features.add('flang') tools.append(ToolSubst('%flang', command=FindTool('flang-new'), unresolved='fatal')) +if config.libomptarget_has_libc: + config.available_features.add('libc') + # Determine whether the test system supports unified memory. # For CUDA, this is the case with compute capability 70 (Volta) or higher. # For all other targets, we currently assume it is. @@ -213,10 +216,12 @@ "%libomptarget-run-" + libomptarget_target)) config.substitutions.append(("%libomptarget-compilexx-" + \ libomptarget_target, \ - "%clangxx-" + libomptarget_target + " %s -o %t")) + "%clangxx-" + libomptarget_target + " %s -o %t" + \ + (" -lcgpu" if config.libomptarget_has_libc else ""))) config.substitutions.append(("%libomptarget-compile-" + \ libomptarget_target, \ - "%clang-" + libomptarget_target + " %s -o %t")) + "%clang-" + libomptarget_target + " %s -o %t" + + (" -lcgpu" if config.libomptarget_has_libc else ""))) config.substitutions.append(("%libomptarget-compileoptxx-run-and-check-" + \ libomptarget_target, \ "%libomptarget-compileoptxx-and-run-" + libomptarget_target + \ @@ -235,10 +240,12 @@ "%libomptarget-run-" + libomptarget_target)) config.substitutions.append(("%libomptarget-compileoptxx-" + \ libomptarget_target, \ - "%clangxx-" + libomptarget_target + " -O3 %s -o %t")) + "%clangxx-" + libomptarget_target + " -O3 %s -o %t" + + (" -lcgpu" if config.libomptarget_has_libc else ""))) config.substitutions.append(("%libomptarget-compileopt-" + \ libomptarget_target, \ - "%clang-" + 
libomptarget_target + " -O3 %s -o %t" + + (" -lcgpu" if config.libomptarget_has_libc else ""))) config.substitutions.append(("%libomptarget-run-" + \ libomptarget_target, \ "%t")) diff --git a/openmp/libomptarget/test/lit.site.cfg.in b/openmp/libomptarget/test/lit.site.cfg.in --- a/openmp/libomptarget/test/lit.site.cfg.in +++ b/openmp/libomptarget/test/lit.site.cfg.in @@ -22,6 +22,7 @@ config.libomptarget_debug = @LIBOMPTARGET_DEBUG@ config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@ config.llvm_enabled_projects = "@LLVM_ENABLE_PROJECTS@".split(";") +config.libomptarget_has_libc = "@LIBOMPTARGET_GPU_LIBC_SUPPORT@" in ["TRUE", "ON", "1"] import lit.llvm lit.llvm.initialize(lit_config, config)