diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -92,7 +92,7 @@
 if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
   set(LIBC_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
   set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm)
-  set(LIBC_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/lib)
+  set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR})
 elseif(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG)
   set(LIBC_INCLUDE_DIR ${LLVM_BINARY_DIR}/include/${LLVM_DEFAULT_TARGET_TRIPLE})
   set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -519,17 +519,25 @@
   }
 
   /// Wait until the signal gets a zero value.
-  Error wait(const uint64_t ActiveTimeout = 0) const {
-    if (ActiveTimeout) {
+  Error wait(const uint64_t ActiveTimeout = 0,
+             RPCHandleTy *RPCHandle = nullptr) const {
+    if (ActiveTimeout && !RPCHandle) {
       hsa_signal_value_t Got = 1;
       Got = hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                       ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
       if (Got == 0)
         return Plugin::success();
     }
+
+    // If there is an RPC device attached to this stream we run it as a server.
+    uint64_t Timeout = RPCHandle ? 8192 : UINT64_MAX;
+    auto WaitState = RPCHandle ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
     while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0,
-                                     UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0)
-      ;
+                                     Timeout, WaitState) != 0) {
+      if (RPCHandle)
+        if (auto Err = RPCHandle->runServer())
+          return Err;
+    }
     return Plugin::success();
   }
 
@@ -895,6 +903,11 @@
   /// operation that was already finalized in a previous stream sycnhronize.
   uint32_t SyncCycle;
 
+  /// A pointer associated with an RPC server running on the given device. If
+  /// RPC is not being used this will be a null pointer. Otherwise, this
+  /// indicates that an RPC server is expected to be run on this stream.
+  RPCHandleTy *RPCHandle;
+
   /// Mutex to protect stream's management.
   mutable std::mutex Mutex;
 
@@ -1050,6 +1063,9 @@
   /// Deinitialize the stream's signals.
   Error deinit() { return Plugin::success(); }
 
+  /// Attach an RPC handle to this stream.
+  void setRPCHandle(RPCHandleTy *Handle) { RPCHandle = Handle; }
+
   /// Push a asynchronous kernel to the stream. The kernel arguments must be
   /// placed in a special allocation for kernel args and must keep alive until
   /// the kernel finalizes. Once the kernel is finished, the stream will release
@@ -1264,7 +1280,8 @@
       return Plugin::success();
 
     // Wait until all previous operations on the stream have completed.
-    if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds))
+    if (auto Err =
+            Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, RPCHandle))
       return Err;
 
     // Reset the stream and perform all pending post actions.
@@ -1786,6 +1803,12 @@
   /// AMDGPU devices do not have the concept of contexts.
   Error setContext() override { return Plugin::success(); }
 
+  /// We want to set up the RPC server for host services to the GPU if it is
+  /// available.
+  bool shouldSetupRPCServer() const override {
+    return libomptargetSupportsRPC();
+  }
+
   /// Get the stream of the asynchronous info sructure or get a new one.
   AMDGPUStreamTy &getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) {
     AMDGPUStreamTy *&Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
@@ -2507,7 +2530,7 @@
     : Agent(Device.getAgent()), Queue(Device.getNextQueue()),
       SignalManager(Device.getSignalManager()),
       // Initialize the std::deque with some empty positions.
-      Slots(32), NextSlot(0), SyncCycle(0),
+      Slots(32), NextSlot(0), SyncCycle(0), RPCHandle(nullptr),
       StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
 
 /// Class implementing the AMDGPU-specific functionalities of the global
@@ -2837,6 +2860,10 @@
   AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
   AMDGPUStreamTy &Stream = AMDGPUDevice.getStream(AsyncInfoWrapper);
 
+  // If this kernel requires an RPC server we attach its pointer to the stream.
+  if (GenericDevice.getRPCHandle())
+    Stream.setRPCHandle(GenericDevice.getRPCHandle());
+
   // Push the kernel launch into the stream.
   return Stream.pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                  GroupSize, ArgsMemoryManager);
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
@@ -13,7 +13,7 @@
 # NOTE: Don't try to build `PluginInterface` using `add_llvm_library` because we
 # don't want to export `PluginInterface` while `add_llvm_library` requires that.
 add_library(PluginInterface OBJECT
-  PluginInterface.cpp GlobalHandler.cpp JIT.cpp)
+  PluginInterface.cpp GlobalHandler.cpp JIT.cpp RPC.cpp)
 
 # Only enable JIT for those targets that LLVM can support.
 string(TOUPPER "${LLVM_TARGETS_TO_BUILD}" TargetsSupported)
@@ -62,6 +62,25 @@
     MemoryManager
 )
 
+# Include the RPC server from the `libc` project if available.
+set(libomptarget_supports_rpc FALSE)
+if(TARGET llvmlibc_rpc_server)
+  target_link_libraries(PluginInterface PRIVATE llvmlibc_rpc_server)
+  target_compile_definitions(PluginInterface PRIVATE LIBOMPTARGET_RPC_SUPPORT)
+  set(libomptarget_supports_rpc TRUE)
+else()
+  find_library(llvmlibc_rpc_server NAMES llvmlibc_rpc_server
+               PATHS ${LIBOMPTARGET_LLVM_LIBRARY_DIR} NO_DEFAULT_PATH)
+  if(llvmlibc_rpc_server)
+    message(STATUS "Found libc RPC server at: ${llvmlibc_rpc_server}")
+    target_link_libraries(PluginInterface PRIVATE llvmlibc_rpc_server)
+    target_compile_definitions(PluginInterface PRIVATE LIBOMPTARGET_RPC_SUPPORT)
+    set(libomptarget_supports_rpc TRUE)
+  endif()
+endif()
+set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${libomptarget_supports_rpc} CACHE BOOL
+    "Libomptarget support for the GPU libc")
+
 if ((OMPT_TARGET_DEFAULT) AND (LIBOMPTARGET_OMPT_SUPPORT))
   target_link_libraries(PluginInterface PUBLIC OMPT)
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -24,6 +24,7 @@
 #include "GlobalHandler.h"
 #include "JIT.h"
 #include "MemoryManager.h"
+#include "RPC.h"
 #include "Utilities.h"
 #include "omptarget.h"
 
@@ -600,6 +601,11 @@
   /// this behavior by overriding the shouldSetupDeviceEnvironment function.
   Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
 
+  /// Setup the RPC server for this device if needed. This may not run on some
+  /// plugins like the CPU targets. By default, it will not be executed so it is
+  /// up to the target to override this using the shouldSetupRPCServer function.
+  Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
+
   /// Register the offload entries for a specific image on the device.
   Error registerOffloadEntries(DeviceImageTy &Image);
 
@@ -751,6 +757,9 @@
     return OMPX_MinThreadsForLowTripCount;
   }
 
+  /// Get the RPC server running on this device.
+  RPCHandleTy *getRPCHandle() const { return RPCHandle; }
+
 private:
   /// Register offload entry for global variable.
   Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -780,6 +789,10 @@
   /// setupDeviceEnvironment() function.
   virtual bool shouldSetupDeviceEnvironment() const { return true; }
 
+  /// Indicate whether or not the device should setup the RPC server. This is
+  /// only necessary for unhosted targets like the GPU.
+  virtual bool shouldSetupRPCServer() const { return false; }
+
   /// Pointer to the memory manager or nullptr if not available.
   MemoryManagerTy *MemoryManager;
 
@@ -837,6 +850,10 @@
 
   /// Map of host pinned allocations used for optimize device transfers.
   PinnedAllocationMapTy PinnedAllocs;
+
+  /// A pointer to an RPC server instance attached to this device if present.
+  /// This is used to run the RPC server during task synchronization.
+  RPCHandleTy *RPCHandle;
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
@@ -892,6 +909,12 @@
   /// plugin.
   JITEngine &getJIT() { return JIT; }
 
+  /// Get a reference to the RPC server used to provide host services.
+  RPCServerTy &getRPCServer() {
+    assert(RPCServer && "RPC server not initialized");
+    return *RPCServer;
+  }
+
   /// Get the OpenMP requires flags set for this plugin.
   int64_t getRequiresFlags() const { return RequiresFlags; }
 
@@ -946,6 +969,9 @@
 
   /// The JIT engine shared by all devices connected to this plugin.
   JITEngine JIT;
+
+  /// The interface between the plugin and the GPU for host services.
+  RPCServerTy *RPCServer;
 };
 
 /// Class for simplifying the getter operation of the plugin. Anywhere on the
@@ -1209,6 +1235,9 @@
   std::deque<ResourceRef> ResourcePool;
 };
 
+/// A static check on whether or not we support RPC in libomptarget.
+bool libomptargetSupportsRPC();
+
 } // namespace plugin
 } // namespace target
 } // namespace omp
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -396,7 +396,7 @@
       OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 32),
       DeviceId(DeviceId), GridValues(OMPGridValues),
       PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
-      PinnedAllocs(*this) {}
+      PinnedAllocs(*this), RPCHandle(nullptr) {}
 
 Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
   if (auto Err = initImpl(Plugin))
@@ -453,6 +453,10 @@
   if (RecordReplay.isRecordingOrReplaying())
     RecordReplay.deinit();
 
+  if (RPCHandle)
+    if (auto Err = RPCHandle->deinitDevice())
+      return std::move(Err);
+
   return deinitImpl();
 }
 
@@ -493,6 +497,9 @@
   if (auto Err = registerOffloadEntries(*Image))
     return std::move(Err);
 
+  if (auto Err = setupRPCServer(Plugin, *Image))
+    return std::move(Err);
+
   // Return the pointer to the table of entries.
   return Image->getOffloadEntryTable();
 }
@@ -525,6 +532,33 @@
   return Plugin::success();
 }
 
+Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
+                                      DeviceImageTy &Image) {
+  // The plugin either does not need an RPC server or it is unavailable.
+  if (!shouldSetupRPCServer())
+    return Plugin::success();
+
+  // Check if this device needs to run an RPC server.
+  RPCServerTy &Server = Plugin.getRPCServer();
+  auto UsingOrErr =
+      Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image);
+  if (!UsingOrErr)
+    return UsingOrErr.takeError();
+
+  if (!UsingOrErr.get())
+    return Plugin::success();
+
+  if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
+    return std::move(Err);
+
+  auto DeviceOrErr = Server.getDevice(*this);
+  if (!DeviceOrErr)
+    return DeviceOrErr.takeError();
+  RPCHandle = *DeviceOrErr;
+  DP("Running an RPC server on device %d\n", getDeviceId());
+  return Plugin::success();
+}
+
 Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) {
   const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin;
   const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd;
@@ -1088,6 +1122,9 @@
   GlobalHandler = Plugin::createGlobalHandler();
   assert(GlobalHandler && "Invalid global handler");
 
+  RPCServer = new RPCServerTy(NumDevices);
+  assert(RPCServer && "Invalid RPC server");
+
   return Plugin::success();
 }
 
@@ -1105,6 +1142,9 @@
     assert(!Devices[DeviceId] && "Device was not deinitialized");
   }
 
+  if (RPCServer)
+    delete RPCServer;
+
   // Perform last deinitializations on the plugin.
   return deinitImpl();
 }
@@ -1139,6 +1179,14 @@
   return Plugin::success();
 }
 
+bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  return true;
+#else
+  return false;
+#endif
+}
+
 /// Exposed library API function, basically wrappers around the GenericDeviceTy
 /// functionality with the same name. All non-async functions are redirected
 /// to the async versions right away with a NULL AsyncInfoPtr.
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h
@@ -0,0 +1,89 @@
+//===- RPC.h - Interface for remote procedure calls from the GPU ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface to support remote procedure calls (RPC) from
+// the GPU. This is required to implement host services like printf or malloc.
+// The interface to the RPC server is provided by the 'libc' project in LLVM.
+// For more information visit https://libc.llvm.org/gpu/.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Error.h"
+
+#include <memory>
+#include <stdint.h>
+
+namespace llvm::omp::target {
+namespace plugin {
+struct GenericDeviceTy;
+struct GenericGlobalHandlerTy;
+class DeviceImageTy;
+} // namespace plugin
+
+/// A generic class implementing the interface between the RPC server provided
+/// by the 'libc' project and 'libomptarget'. If the RPC server is not available
+/// these routines will perform no action.
+struct RPCServerTy {
+public:
+  /// A wrapper around a single instance of the RPC server for a given device.
+  /// This is provided to simplify ownership of the underlying device.
+  struct RPCHandleTy {
+    RPCHandleTy(RPCServerTy &Server, plugin::GenericDeviceTy &Device)
+        : Server(Server), Device(Device) {}
+
+    llvm::Error runServer() { return Server.runServer(Device); }
+
+    llvm::Error deinitDevice() { return Server.deinitDevice(Device); }
+
+  private:
+    RPCServerTy &Server;
+    plugin::GenericDeviceTy &Device;
+  };
+
+  RPCServerTy(uint32_t NumDevices);
+
+  /// Check if this device image is using an RPC server. This checks for the
+  /// presence of an externally visible symbol in the device image that will
+  /// be present whenever RPC code is called.
+  llvm::Expected<bool> isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
+                                        plugin::GenericGlobalHandlerTy &Handler,
+                                        plugin::DeviceImageTy &Image);
+
+  /// Initialize the RPC server for the given device. This will allocate host
+  /// memory for the internal server and copy the data to the client on the
+  /// device. The device must be loaded before this is valid.
+  llvm::Error initDevice(plugin::GenericDeviceTy &Device,
+                         plugin::GenericGlobalHandlerTy &Handler,
+                         plugin::DeviceImageTy &Image);
+
+  /// Gets a reference to this server for a specific device.
+  llvm::Expected<RPCHandleTy *> getDevice(plugin::GenericDeviceTy &Device);
+
+  /// Runs the RPC server associated with the \p Device until the pending work
+  /// is cleared.
+  llvm::Error runServer(plugin::GenericDeviceTy &Device);
+
+  /// Deinitialize the RPC server for the given device. This will free the
+  /// memory associated with the RPC server for this device.
+  llvm::Error deinitDevice(plugin::GenericDeviceTy &Device);
+
+  ~RPCServerTy();
+
+private:
+  llvm::SmallVector<std::unique_ptr<RPCHandleTy>> Handles;
+};
+
+using RPCHandleTy = RPCServerTy::RPCHandleTy;
+
+} // namespace llvm::omp::target
+
+#endif
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
@@ -0,0 +1,173 @@
+//===- RPC.cpp - Interface for remote procedure calls from the GPU --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RPC.h"
+#include "Debug.h"
+#include "PluginInterface.h"
+
+// This header file may be present in-tree or from an LLVM installation. The
+// installed version lives alongside the GPU headers so we do not want to
+// include it directly.
+#if __has_include(<gpu-none-llvm/rpc_server.h>)
+#include <gpu-none-llvm/rpc_server.h>
+#elif defined(LIBOMPTARGET_RPC_SUPPORT)
+#include <rpc_server.h>
+#endif
+
+using namespace llvm;
+using namespace omp;
+using namespace target;
+
+RPCServerTy::RPCServerTy(uint32_t NumDevices) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  // If this fails then something is catastrophically wrong, just exit.
+  if (rpc_status_t Err = rpc_init(NumDevices))
+    FATAL_MESSAGE(1, "Error initializing the RPC server: %d\n", Err);
+  Handles.resize(NumDevices);
+#endif
+}
+
+llvm::Expected<bool>
+RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
+                              plugin::GenericGlobalHandlerTy &Handler,
+                              plugin::DeviceImageTy &Image) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  void *ClientPtr;
+  plugin::GlobalTy Global(rpc_client_symbol_name, sizeof(void *), &ClientPtr);
+  if (auto Err = Handler.readGlobalFromImage(Device, Image, Global)) {
+    llvm::consumeError(std::move(Err));
+    return false;
+  }
+
+  return true;
+#else
+  return false;
+#endif
+}
+
+Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
+                              plugin::GenericGlobalHandlerTy &Handler,
+                              plugin::DeviceImageTy &Image) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  uint32_t DeviceId = Device.getDeviceId();
+  auto Alloc = [](uint64_t Size, void *Data) {
+    plugin::GenericDeviceTy &Device =
+        *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
+    return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST);
+  };
+  // TODO: Allow the device to declare its requested port count.
+  if (rpc_status_t Err = rpc_server_init(DeviceId, RPC_MAXIMUM_PORT_COUNT,
+                                         Device.getWarpSize(), Alloc, &Device))
+    return plugin::Plugin::error(
+        "Failed to initialize RPC server for device %d: %d", DeviceId, Err);
+
+  // Register a custom opcode handler to perform plugin specific allocation.
+  // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
+  auto MallocHandler = [](rpc_port_t Port, void *Data) {
+    rpc_recv_and_send(
+        Port,
+        [](rpc_buffer_t *Buffer, void *Data) {
+          plugin::GenericDeviceTy &Device =
+              *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
+          Buffer->data[0] = reinterpret_cast<uintptr_t>(
+              Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
+        },
+        Data);
+  };
+  if (rpc_status_t Err =
+          rpc_register_callback(DeviceId, RPC_MALLOC, MallocHandler, &Device))
+    return plugin::Plugin::error(
+        "Failed to register RPC malloc handler for device %d: %d\n", DeviceId,
+        Err);
+
+  // Register a custom opcode handler to perform plugin specific deallocation.
+  auto FreeHandler = [](rpc_port_t Port, void *Data) {
+    rpc_recv(
+        Port,
+        [](rpc_buffer_t *Buffer, void *Data) {
+          plugin::GenericDeviceTy &Device =
+              *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
+          Device.free(reinterpret_cast<void *>(Buffer->data[0]),
+                      TARGET_ALLOC_DEVICE);
+        },
+        Data);
+  };
+  if (rpc_status_t Err =
+          rpc_register_callback(DeviceId, RPC_FREE, FreeHandler, &Device))
+    return plugin::Plugin::error(
+        "Failed to register RPC free handler for device %d: %d\n", DeviceId,
+        Err);
+
+  // Get the address of the RPC client from the device.
+  void *ClientPtr;
+  plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *));
+  if (auto Err =
+          Handler.getGlobalMetadataFromDevice(Device, Image, ClientGlobal))
+    return Err;
+
+  if (auto Err = Device.dataRetrieve(&ClientPtr, ClientGlobal.getPtr(),
+                                     sizeof(void *), nullptr))
+    return Err;
+
+  const void *ClientBuffer = rpc_get_client_buffer(DeviceId);
+  if (auto Err = Device.dataSubmit(ClientPtr, ClientBuffer,
+                                   rpc_get_client_size(), nullptr))
+    return Err;
+
+  Handles[DeviceId] = std::make_unique<RPCHandleTy>(*this, Device);
+#endif
+  return Error::success();
+}
+
+llvm::Expected<RPCHandleTy *>
+RPCServerTy::getDevice(plugin::GenericDeviceTy &Device) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  uint32_t DeviceId = Device.getDeviceId();
+  if (!Handles[DeviceId] || !rpc_get_buffer(DeviceId) ||
+      !rpc_get_client_buffer(DeviceId))
+    return plugin::Plugin::error(
+        "Attempt to get an RPC device while not initialized");
+
+  return Handles[DeviceId].get();
+#else
+  return plugin::Plugin::error(
+      "Attempt to get an RPC device while not available");
+#endif
+}
+
+Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  if (rpc_status_t Err = rpc_handle_server(Device.getDeviceId()))
+    return plugin::Plugin::error(
+        "Error while running RPC server on device %d: %d", Device.getDeviceId(),
+        Err);
+#endif
+  return Error::success();
+}
+
+Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  auto Dealloc = [](void *Ptr, void *Data) {
+    plugin::GenericDeviceTy &Device =
+        *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
+    Device.free(Ptr, TARGET_ALLOC_HOST);
+  };
+  if (rpc_status_t Err =
+          rpc_server_shutdown(Device.getDeviceId(), Dealloc, &Device))
+    return plugin::Plugin::error(
+        "Failed to shut down RPC server for device %d: %d",
+        Device.getDeviceId(), Err);
+#endif
+  return Error::success();
+}
+
+RPCServerTy::~RPCServerTy() {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  rpc_shutdown();
+#endif
+}
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -366,6 +366,12 @@
     return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
   }
 
+  /// We want to set up the RPC server for host services to the GPU if it is
+  /// available.
+  bool shouldSetupRPCServer() const override {
+    return libomptargetSupportsRPC();
+  }
+
   /// Get the stream of the asynchronous info sructure or get a new one.
   CUstream getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) {
     CUstream &Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
@@ -464,7 +470,18 @@
   /// Synchronize current thread with the pending operations on the async info.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
-    CUresult Res = cuStreamSynchronize(Stream);
+    CUresult Res;
+    // If we have an RPC server running on this device we will continuously
+    // query it for work rather than blocking.
+    if (!getRPCHandle()) {
+      Res = cuStreamSynchronize(Stream);
+    } else {
+      do {
+        Res = cuStreamQuery(Stream);
+        if (auto Err = getRPCHandle()->runServer())
+          return Err;
+      } while (Res == CUDA_ERROR_NOT_READY);
+    }
 
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
diff --git a/openmp/libomptarget/test/CMakeLists.txt b/openmp/libomptarget/test/CMakeLists.txt
--- a/openmp/libomptarget/test/CMakeLists.txt
+++ b/openmp/libomptarget/test/CMakeLists.txt
@@ -19,6 +19,7 @@
 string(REGEX MATCHALL "([^\ ]+\ |[^\ ]+$)" SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}")
 foreach(CURRENT_TARGET IN LISTS SYSTEM_TARGETS)
   string(STRIP "${CURRENT_TARGET}" CURRENT_TARGET)
+
   add_openmp_testsuite(check-libomptarget-${CURRENT_TARGET}
     "Running libomptarget tests"
     ${CMAKE_CURRENT_BINARY_DIR}/${CURRENT_TARGET}
diff --git a/openmp/libomptarget/test/libc/malloc.c b/openmp/libomptarget/test/libc/malloc.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/libc/malloc.c
@@ -0,0 +1,33 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// REQUIRES: libc
+
+// TODO: This requires async malloc on CUDA which is an 11.2 feature.
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#pragma omp declare target to(malloc)
+#pragma omp declare target to(free)
+
+int main() {
+  unsigned h_x;
+  unsigned *d_x;
+#pragma omp target map(from : d_x)
+  {
+    d_x = malloc(sizeof(unsigned));
+    *d_x = 1;
+  }
+
+#pragma omp target is_device_ptr(d_x) map(from : h_x)
+  { h_x = *d_x; }
+
+#pragma omp target is_device_ptr(d_x)
+  { free(d_x); }
+
+  // CHECK: PASS
+  if (h_x == 1)
+    fputs("PASS\n", stdout);
+}
diff --git a/openmp/libomptarget/test/libc/puts.c b/openmp/libomptarget/test/libc/puts.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/libc/puts.c
@@ -0,0 +1,35 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// REQUIRES: libc
+
+#include <stdio.h>
+
+#pragma omp declare target to(stdout)
+
+int main() {
+// CHECK: PASS
+#pragma omp target
+  { fputs("PASS\n", stdout); }
+
+// CHECK: PASS
+#pragma omp target nowait
+  { fputs("PASS\n", stdout); }
+
+// CHECK: PASS
+#pragma omp target nowait
+  { fputs("PASS\n", stdout); }
+
+#pragma omp taskwait
+
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+#pragma omp target teams num_teams(4)
+#pragma omp parallel num_threads(2)
+  { fputs("PASS\n", stdout); }
+}
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -96,6 +96,9 @@
   config.available_features.add('flang')
   tools.append(ToolSubst('%flang', command=FindTool('flang-new'), unresolved='fatal'))
 
+if config.libomptarget_has_libc:
+  config.available_features.add('libc')
+
 # Determine whether the test system supports unified memory.
 # For CUDA, this is the case with compute capability 70 (Volta) or higher.
 # For all other targets, we currently assume it is.
@@ -213,10 +216,12 @@
             "%libomptarget-run-" + libomptarget_target))
         config.substitutions.append(("%libomptarget-compilexx-" + \
             libomptarget_target, \
-            "%clangxx-" + libomptarget_target + " %s -o %t"))
+            "%clangxx-" + libomptarget_target + " %s -o %t" + \
+            (" -lcgpu" if config.libomptarget_has_libc else "")))
         config.substitutions.append(("%libomptarget-compile-" + \
             libomptarget_target, \
-            "%clang-" + libomptarget_target + " %s -o %t"))
+            "%clang-" + libomptarget_target + " %s -o %t" + \
+            (" -lcgpu" if config.libomptarget_has_libc else "")))
         config.substitutions.append(("%libomptarget-compileoptxx-run-and-check-" + \
             libomptarget_target, \
             "%libomptarget-compileoptxx-and-run-" + libomptarget_target + \
@@ -235,10 +240,12 @@
             "%libomptarget-run-" + libomptarget_target))
         config.substitutions.append(("%libomptarget-compileoptxx-" + \
             libomptarget_target, \
-            "%clangxx-" + libomptarget_target + " -O3 %s -o %t"))
+            "%clangxx-" + libomptarget_target + " -O3 %s -o %t" + \
+            (" -lcgpu" if config.libomptarget_has_libc else "")))
         config.substitutions.append(("%libomptarget-compileopt-" + \
             libomptarget_target, \
-            "%clang-" + libomptarget_target + " -O3 %s -o %t"))
+            "%clang-" + libomptarget_target + " -O3 %s -o %t" + \
+            (" -lcgpu" if config.libomptarget_has_libc else "")))
         config.substitutions.append(("%libomptarget-run-" + \
             libomptarget_target, \
             "%t"))
diff --git a/openmp/libomptarget/test/lit.site.cfg.in b/openmp/libomptarget/test/lit.site.cfg.in
--- a/openmp/libomptarget/test/lit.site.cfg.in
+++ b/openmp/libomptarget/test/lit.site.cfg.in
@@ -22,6 +22,7 @@
 config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
 config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@
 config.llvm_enabled_projects = "@LLVM_ENABLE_PROJECTS@".split(";")
+config.libomptarget_has_libc = ("@LIBOMPTARGET_GPU_LIBC_SUPPORT@" == "TRUE")
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)