diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -519,7 +519,19 @@ } /// Wait until the signal gets a zero value. - Error wait(const uint64_t ActiveTimeout = 0) const { + Error wait(const uint64_t ActiveTimeout = 0, + RPCInstanceTy *RPCDevice = nullptr) const { + // If there is an RPC device attached to this stream we run it as a server. + if (RPCDevice) { + while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, + /*timeout_hint=*/8192, + HSA_WAIT_STATE_ACTIVE) != 0) { + if (auto Err = RPCDevice->runServer()) + return Err; + } + return Plugin::success(); + } + if (ActiveTimeout) { hsa_signal_value_t Got = 1; Got = hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, @@ -895,6 +907,11 @@ /// operation that was already finalized in a previous stream sycnhronize. uint32_t SyncCycle; + /// A pointer associated with an RPC server running on the given device. If + /// RPC is not being used this will be a null pointer. Otherwise, this + /// indicates that an RPC server is expected to be run on this stream. + RPCInstanceTy *RPCDevice; + /// Mutex to protect stream's management. mutable std::mutex Mutex; @@ -1050,6 +1067,9 @@ /// Deinitialize the stream's signals. Error deinit() { return Plugin::success(); } + /// Attach an RPC device to this stream. + void setRPCDevice(RPCInstanceTy *Device) { RPCDevice = Device; } + /// Push a asynchronous kernel to the stream. The kernel arguments must be /// placed in a special allocation for kernel args and must keep alive until /// the kernel finalizes. Once the kernel is finished, the stream will release @@ -1264,7 +1284,8 @@ return Plugin::success(); // Wait until all previous operations on the stream have completed. 
- if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds)) + if (auto Err = + Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, RPCDevice)) return Err; // Reset the stream and perform all pending post actions. @@ -1778,6 +1799,12 @@ /// AMDGPU devices do not have the concept of contexts. Error setContext() override { return Plugin::success(); } + /// We want to set up the RPC server for host services to the GPU if it is + /// availible. + bool shouldSetupRPCServer() const override { + return libomptargetSupportsRPC(); + } + /// Get the stream of the asynchronous info sructure or get a new one. AMDGPUStreamTy &getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) { AMDGPUStreamTy *&Stream = AsyncInfoWrapper.getQueueAs(); @@ -2496,7 +2523,7 @@ : Agent(Device.getAgent()), Queue(Device.getNextQueue()), SignalManager(Device.getSignalManager()), // Initialize the std::deque with some empty positions. - Slots(32), NextSlot(0), SyncCycle(0), + Slots(32), NextSlot(0), SyncCycle(0), RPCDevice(nullptr), StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {} /// Class implementing the AMDGPU-specific functionalities of the global @@ -2826,6 +2853,10 @@ AMDGPUDeviceTy &AMDGPUDevice = static_cast(GenericDevice); AMDGPUStreamTy &Stream = AMDGPUDevice.getStream(AsyncInfoWrapper); + // If this kernel requires an RPC server we attach its pointer to the stream. + if (GenericDevice.getRPCInstance()) + Stream.setRPCDevice(GenericDevice.getRPCInstance()); + // Push the kernel launch into the stream. 
return Stream.pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, GroupSize, ArgsMemoryManager); diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt @@ -13,7 +13,7 @@ # NOTE: Don't try to build `PluginInterface` using `add_llvm_library` because we # don't want to export `PluginInterface` while `add_llvm_library` requires that. add_library(PluginInterface OBJECT - PluginInterface.cpp GlobalHandler.cpp JIT.cpp) + PluginInterface.cpp GlobalHandler.cpp JIT.cpp RPC.cpp) # Only enable JIT for those targets that LLVM can support. string(TOUPPER "${LLVM_TARGETS_TO_BUILD}" TargetsSupported) @@ -62,6 +62,20 @@ MemoryManager ) +# Include the RPC server from the `libc` project if availible. +if(TARGET llvmlibc_rpc_server) + target_link_libraries(PluginInterface PRIVATE llvmlibc_rpc_server) + target_compile_definitions(PluginInterface PRIVATE LIBOMPTARGET_RPC_SUPPORT) +else() + find_library(llvmlibc_rpc_server NAMES llvmlibc_rpc_server + PATHS ${LIBOMPTARGET_LLVM_LIBRARY_DIR} NO_DEFAULT_PATH) + if(llvmlibc_rpc_server) + message(WARNING ${llvmlibc_rpc_server}) + target_link_libraries(PluginInterface PRIVATE llvmlibc_rpc_server) + target_compile_definitions(PluginInterface PRIVATE LIBOMPTARGET_RPC_SUPPORT) + endif() +endif() + if ((OMPT_TARGET_DEFAULT) AND (LIBOMPTARGET_OMPT_SUPPORT)) target_link_libraries(PluginInterface PUBLIC OMPT) endif() diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -24,6 +24,7 @@ #include 
"GlobalHandler.h" #include "JIT.h" #include "MemoryManager.h" +#include "RPC.h" #include "Utilities.h" #include "omptarget.h" @@ -600,6 +601,11 @@ /// this behavior by overriding the shouldSetupDeviceEnvironment function. Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image); + // Setup the RPC server for this device if needed. This may not run on some + // plugins like the CPU targets. By default, it will not be executed so it is + // up to the target to override this using the shouldSetupRPCServer function. + Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image); + /// Register the offload entries for a specific image on the device. Error registerOffloadEntries(DeviceImageTy &Image); @@ -750,6 +756,9 @@ return OMPX_MinThreadsForLowTripCount; } + /// Get the RPC server running on this device. + RPCInstanceTy *getRPCInstance() const { return RPCInstance; } + private: /// Register offload entry for global variable. Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, @@ -779,6 +788,10 @@ /// setupDeviceEnvironment() function. virtual bool shouldSetupDeviceEnvironment() const { return true; } + /// Indicate whether or not the device should setup the RPC server. This is + /// only necessary for unhosted targets like the GPU. + virtual bool shouldSetupRPCServer() const { return false; } + /// Pointer to the memory manager or nullptr if not available. MemoryManagerTy *MemoryManager; @@ -836,6 +849,10 @@ /// Map of host pinned allocations used for optimize device transfers. PinnedAllocationMapTy PinnedAllocs; + + /// A pointer to an RPC server instance attached to this device if present. + /// This is used to run the RPC server during task synchronization. + RPCInstanceTy *RPCInstance; }; /// Class implementing common functionalities of offload plugins. Each plugin @@ -891,6 +908,12 @@ /// plugin. JITEngine &getJIT() { return JIT; } + /// Get a reference to the RPC server used to provide host services. 
+ RPCServerTy &getRPCServer() { + assert(RPCServer && "RPC server not initialized"); + return *RPCServer; + } + /// Get the OpenMP requires flags set for this plugin. int64_t getRequiresFlags() const { return RequiresFlags; } @@ -945,6 +968,9 @@ /// The JIT engine shared by all devices connected to this plugin. JITEngine JIT; + + /// The interface between the plugin and the GPU for host services. + RPCServerTy *RPCServer; }; /// Class for simplifying the getter operation of the plugin. Anywhere on the @@ -1208,6 +1234,9 @@ std::deque ResourcePool; }; +/// A static check on whether or not we support RPC in libomptarget. +const bool libomptargetSupportsRPC(); + } // namespace plugin } // namespace target } // namespace omp diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -396,7 +396,7 @@ OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 32), DeviceId(DeviceId), GridValues(OMPGridValues), PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(), - PinnedAllocs(*this) {} + PinnedAllocs(*this), RPCInstance(nullptr) {} Error GenericDeviceTy::init(GenericPluginTy &Plugin) { if (auto Err = initImpl(Plugin)) @@ -453,6 +453,10 @@ if (RecordReplay.isRecordingOrReplaying()) RecordReplay.deinit(); + if (RPCInstance) + if (auto Err = RPCInstance->deinitDevice()) + return std::move(Err); + return deinitImpl(); } @@ -493,6 +497,9 @@ if (auto Err = registerOffloadEntries(*Image)) return std::move(Err); + if (auto Err = setupRPCServer(Plugin, *Image)) + return std::move(Err); + // Return the pointer to the table of entries. 
return Image->getOffloadEntryTable(); } @@ -524,6 +531,33 @@ return Plugin::success(); } +Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin, + DeviceImageTy &Image) { + // The plugin either does not need an RPC server or it is unavailible. + if (!shouldSetupRPCServer()) + return Plugin::success(); + + // Check if this device needs to run an RPC server. + RPCServerTy &Server = Plugin.getRPCServer(); + auto UsingOrErr = + Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image); + if (!UsingOrErr) + return UsingOrErr.takeError(); + + if (!UsingOrErr.get()) + return Plugin::success(); + + if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image)) + return std::move(Err); + + auto DeviceOrErr = Server.getDevice(*this); + if (!DeviceOrErr) + return DeviceOrErr.takeError(); + RPCInstance = *DeviceOrErr; + DP("Running an RPC server on device %d\n", getDeviceId()); + return Plugin::success(); +} + Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) { const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin; const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd; @@ -1087,6 +1121,9 @@ GlobalHandler = Plugin::createGlobalHandler(); assert(GlobalHandler && "Invalid global handler"); + RPCServer = new RPCServerTy(NumDevices); + assert(RPCServer && "Invalid RPC server"); + return Plugin::success(); } @@ -1104,6 +1141,9 @@ assert(!Devices[DeviceId] && "Device was not deinitialized"); } + if (RPCServer) + delete RPCServer; + // Perform last deinitializations on the plugin. return deinitImpl(); } @@ -1138,6 +1178,14 @@ return Plugin::success(); } +const bool llvm::omp::target::plugin::libomptargetSupportsRPC() { +#ifdef LIBOMPTARGET_RPC_SUPPORT + return true; +#else + return false; +#endif +} + /// Exposed library API function, basically wrappers around the GenericDeviceTy /// functionality with the same name. 
All non-async functions are redirected /// to the async versions right away with a NULL AsyncInfoPtr. diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h new file --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h @@ -0,0 +1,86 @@ +//===- RPC.h - Interface for remote procedure calls from the GPU ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides the interface to support remote procedure calls (RPC) from +// the GPU. This is required to implement host services like printf or malloc. +// The interface to the RPC server is provided by the 'libc' project in LLVM. +// For more information visit https://libc.llvm.org/gpu/. +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H + +#include "llvm/Support/Error.h" + +#include  + +namespace llvm::omp::target { +namespace plugin { +struct GenericDeviceTy; +struct GenericGlobalHandlerTy; +class DeviceImageTy; +} // namespace plugin + +/// A generic class implementing the interface between the RPC server provided +/// by the 'libc' project and 'libomptarget'. If the RPC server is not available +/// these routines will perform no action. +struct RPCServerTy { +public: + /// A wrapper around a single instance of the RPC server for a given device. + /// This is provided to simplify ownership of the underlying device.
+ struct RPCInstanceTy { + RPCInstanceTy(RPCServerTy &Server, plugin::GenericDeviceTy &Device) + : Server(Server), Device(Device) {} + + llvm::Error runServer() { return Server.runServer(Device); } + + llvm::Error deinitDevice() { return Server.deinitDevice(Device); } + + RPCServerTy &Server; + plugin::GenericDeviceTy &Device; + }; + + RPCServerTy(uint32_t NumDevices); + + /// Check if this device image is using an RPC server. This checks for the + /// presence of an externally visible symbol in the device image that will + /// be present whenever RPC code is called. + llvm::Expected isDeviceUsingRPC(plugin::GenericDeviceTy &Device, + plugin::GenericGlobalHandlerTy &Handler, + plugin::DeviceImageTy &Image); + + /// Initialize the RPC server for the given device. This will allocate host + /// memory for the internal server and copy the data to the client on the + /// device. The device must be loaded before this is valid. + llvm::Error initDevice(plugin::GenericDeviceTy &Device, + plugin::GenericGlobalHandlerTy &Handler, + plugin::DeviceImageTy &Image); + + /// Gets a reference to this server for a specific device. + llvm::Expected getDevice(plugin::GenericDeviceTy &Device); + + /// Runs the RPC server associated with the \p Device until the pending work + /// is cleared. + llvm::Error runServer(plugin::GenericDeviceTy &Device); + + /// Deinitialize the RPC server for the given device.
This will free the + /// memory associated with the server for this device. + llvm::Error deinitDevice(plugin::GenericDeviceTy &Device); + + ~RPCServerTy(); + +private: + llvm::SmallVector> Instances; +}; + +using RPCInstanceTy = RPCServerTy::RPCInstanceTy; + +} // namespace llvm::omp::target + +#endif diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp new file --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp @@ -0,0 +1,175 @@ +//===- RPC.cpp - Interface for remote procedure calls from the GPU --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RPC.h" +#include "Debug.h" +#include "PluginInterface.h" + +// This header file may be present in-tree or from an LLVM installation. The +// installed version lives alongside the GPU headers so we do not want to +// include it directly. +#if __has_include() +#include +#elif defined(LIBOMPTARGET_RPC_SUPPORT) +#include +#endif + +using namespace llvm; +using namespace omp; +using namespace target; + +RPCServerTy::RPCServerTy(uint32_t NumDevices) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + // If this fails then something is catastrophically wrong, just exit.
+ if (rpc_status_t Err = rpc_init(NumDevices)) + FATAL_MESSAGE(1, "Error initializing the RPC server: %d\n", Err); + Instances.resize(NumDevices); +#endif +} + +llvm::Expected +RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device, + plugin::GenericGlobalHandlerTy &Handler, + plugin::DeviceImageTy &Image) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + void *ClientPtr; + plugin::GlobalTy Global(rpc_client_symbol_name, sizeof(void *), &ClientPtr); + if (auto Err = Handler.readGlobalFromImage(Device, Image, Global)) { + llvm::consumeError(std::move(Err)); + return false; + } + + return true; +#else + return false; +#endif +} + +Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, + plugin::GenericGlobalHandlerTy &Handler, + plugin::DeviceImageTy &Image) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + auto Alloc = [](uint64_t Size, void *Data) { + plugin::GenericDeviceTy &Device = + *reinterpret_cast(Data); + return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST); + }; + // TODO: Allow the device to declare its requested port count. + if (rpc_status_t Err = + rpc_server_init(Device.getDeviceId(), RPC_MAXIMUM_PORT_COUNT, + Device.getWarpSize(), Alloc, &Device)) + return plugin::Plugin::error( + "Failed to initialize RPC server for device %d: %d", + Device.getDeviceId(), Err); + + // Register a custom opcode handler to perform plugin specific allocation. + // FIXME: We need to make sure this uses asynchronous allocations on CUDA. 
+ auto MallocHandler = [](rpc_port_t Port, void *Data) { + rpc_recv_and_send( + Port, + [](rpc_buffer_t *Buffer, void *Data) { + plugin::GenericDeviceTy &Device = + *reinterpret_cast(Data); + Buffer->data[0] = reinterpret_cast( + Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE)); + }, + Data); + }; + if (rpc_status_t Err = rpc_register_callback(Device.getDeviceId(), RPC_MALLOC, + MallocHandler, &Device)) + return plugin::Plugin::error( + "Failed to register RPC malloc handler for device %d: %d\n", + Device.getDeviceId(), Err); + + // Register a custom opcode handler to perform plugin specific deallocation. + auto FreeHandler = [](rpc_port_t Port, void *Data) { + rpc_recv( + Port, + [](rpc_buffer_t *Buffer, void *Data) { + plugin::GenericDeviceTy &Device = + *reinterpret_cast(Data); + Device.free(reinterpret_cast(Buffer->data[0]), + TARGET_ALLOC_DEVICE); + }, + Data); + }; + if (rpc_status_t Err = rpc_register_callback(Device.getDeviceId(), RPC_FREE, + FreeHandler, &Device)) + return plugin::Plugin::error( + "Failed to register RPC free handler for device %d: %d\n", + Device.getDeviceId(), Err); + + // Get the address of the RPC client from the device. 
+ void *ClientPtr; + plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *)); + if (auto Err = + Handler.getGlobalMetadataFromDevice(Device, Image, ClientGlobal)) + return Err; + + if (auto Err = Device.dataRetrieve(&ClientPtr, ClientGlobal.getPtr(), + sizeof(void *), nullptr)) + return Err; + + const void *ClientBuffer = rpc_get_client_buffer(Device.getDeviceId()); + if (auto Err = Device.dataSubmit(ClientPtr, ClientBuffer, + rpc_get_client_size(), nullptr)) + return Err; + + Instances[Device.getDeviceId()] = + std::make_unique(*this, Device); +#endif + return Error::success(); +} + +llvm::Expected +RPCServerTy::getDevice(plugin::GenericDeviceTy &Device) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + if (!Instances[Device.getDeviceId()] || + !rpc_get_buffer(Device.getDeviceId()) || + !rpc_get_client_buffer(Device.getDeviceId())) + return plugin::Plugin::error( + "Attempt to get an RPC device while not initialized"); + + return Instances[Device.getDeviceId()].get(); +#else + return plugin::Plugin::error( + "Attempt to get an RPC device while not availible"); +#endif +} + +Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + if (rpc_status_t Err = rpc_handle_server(Device.getDeviceId())) + return plugin::Plugin::error( + "Error while running RPC server on device %d: %d", Device.getDeviceId(), + Err); +#endif + return Error::success(); +} + +Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) { +#ifdef LIBOMPTARGET_RPC_SUPPORT + auto Dealloc = [](void *Ptr, void *Data) { + plugin::GenericDeviceTy &Device = + *reinterpret_cast(Data); + Device.free(Ptr, TARGET_ALLOC_HOST); + }; + if (rpc_status_t Err = + rpc_server_shutdown(Device.getDeviceId(), Dealloc, &Device)) + return plugin::Plugin::error( + "Failed to shut down RPC server for device %d: %d", + Device.getDeviceId(), Err); +#endif + return Error::success(); +} + +RPCServerTy::~RPCServerTy() { +#ifdef LIBOMPTARGET_RPC_SUPPORT + rpc_shutdown(); +#endif 
+} diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -366,6 +366,12 @@ return Plugin::check(Res, "Error in cuCtxSetCurrent: %s"); } + /// We want to set up the RPC server for host services to the GPU if it is + /// available. + bool shouldSetupRPCServer() const override { + return libomptargetSupportsRPC(); + } + /// Get the stream of the asynchronous info sructure or get a new one. CUstream getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) { CUstream &Stream = AsyncInfoWrapper.getQueueAs(); @@ -464,7 +470,16 @@ /// Synchronize current thread with the pending operations on the async info. Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { CUstream Stream = reinterpret_cast(AsyncInfo.Queue); - CUresult Res = cuStreamSynchronize(Stream); + CUresult Res; + if (getRPCInstance()) { + do { + Res = cuStreamQuery(Stream); + if (auto Err = getRPCInstance()->runServer()) + return Err; + } while (Res == CUDA_ERROR_NOT_READY); + } else { + Res = cuStreamSynchronize(Stream); + } // Once the stream is synchronized, return it to stream pool and reset // AsyncInfo. This is to make sure the synchronization only works for its