diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -204,6 +204,7 @@
 add_subdirectory(FPUtil)
 add_subdirectory(OSUtil)
 add_subdirectory(StringUtil)
+add_subdirectory(RPC)
 
 # Thread support is used by other "File". So, we add the "threads"
 # before "File".
diff --git a/libc/src/__support/OSUtil/CMakeLists.txt b/libc/src/__support/OSUtil/CMakeLists.txt
--- a/libc/src/__support/OSUtil/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/CMakeLists.txt
@@ -8,12 +8,23 @@
   return()
 endif()
 
-add_header_library(
-  osutil
-  HDRS
-    io.h
-    quick_exit.h
-    syscall.h
-  DEPENDS
-    ${target_os_util}
-)
+# OSUtil is header-only except in GPU mode, where it is an object library.
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_header_library(
+    osutil
+    HDRS
+      io.h
+      quick_exit.h
+      syscall.h
+    DEPENDS
+      ${target_os_util}
+  )
+else()
+  add_object_library(
+    osutil
+    ALIAS
+      ${target_os_util}
+    DEPENDS
+      ${target_os_util}
+  )
+endif()
diff --git a/libc/src/__support/OSUtil/gpu/CMakeLists.txt b/libc/src/__support/OSUtil/gpu/CMakeLists.txt
--- a/libc/src/__support/OSUtil/gpu/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/gpu/CMakeLists.txt
@@ -2,8 +2,11 @@
   gpu_util
   SRCS
     quick_exit.cpp
+    io.cpp
   HDRS
     quick_exit.h
+    io.h
   DEPENDS
     libc.src.__support.common
+    libc.src.__support.RPC.rpc_client
 )
diff --git a/libc/src/__support/OSUtil/io.h b/libc/src/__support/OSUtil/gpu/io.h
copy from libc/src/__support/OSUtil/io.h
copy to libc/src/__support/OSUtil/gpu/io.h
--- a/libc/src/__support/OSUtil/io.h
+++ b/libc/src/__support/OSUtil/gpu/io.h
@@ -1,4 +1,4 @@
-//===---------------- Implementation of IO utils ----------------*- C++ -*-===//
+//===-------------- GPU implementation of IO utils --------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,11 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
-#define LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
+#ifndef LLVM_LIBC_SRC_SUPPORT_OSUTIL_GPU_IO_H
+#define LLVM_LIBC_SRC_SUPPORT_OSUTIL_GPU_IO_H
 
-#ifdef __unix__
-#include "linux/io.h"
-#endif
+namespace __llvm_libc {
 
-#endif // LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
+void write_to_stderr(const char *msg);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_OSUTIL_GPU_IO_H
diff --git a/libc/src/__support/OSUtil/gpu/io.cpp b/libc/src/__support/OSUtil/gpu/io.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/OSUtil/gpu/io.cpp
@@ -0,0 +1,38 @@
+//===-------------- GPU implementation of IO utils --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "io.h"
+
+#include "src/__support/RPC/rpc_client.h"
+#include "src/string/string_utils.h"
+
+namespace __llvm_libc {
+
+/// Sends \p msg to the RPC server using the PRINT_TO_STDERR opcode. The
+/// message is split into chunks that fit in the part of an RPC buffer that
+/// follows the opcode slot, and every chunk is NUL-terminated so that the
+/// receiver can treat the payload as a C string.
+void write_to_stderr(const char *msg) {
+  const uint64_t length = internal::string_length(msg);
+  // One payload byte is reserved for the terminator of every chunk.
+  const uint64_t chunk_len = sizeof(rpc::Buffer) - sizeof(uint64_t) - 1;
+  for (uint64_t i = 0; i < length; i += chunk_len) {
+    // Only the bytes remaining past offset i may be read from msg.
+    const uint64_t size = length - i > chunk_len ? chunk_len : length - i;
+    rpc::client.run(
+        [&](rpc::Buffer *buffer) {
+          char *payload = reinterpret_cast<char *>(&buffer->data[1]);
+          buffer->data[0] = rpc::Opcode::PRINT_TO_STDERR;
+          inline_memcpy(payload, &msg[i], size);
+          payload[size] = '\0';
+        },
+        [](rpc::Buffer *) {});
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/__support/OSUtil/gpu/quick_exit.cpp b/libc/src/__support/OSUtil/gpu/quick_exit.cpp
--- a/libc/src/__support/OSUtil/gpu/quick_exit.cpp
+++ b/libc/src/__support/OSUtil/gpu/quick_exit.cpp
@@ -11,11 +11,21 @@
 
 #include "quick_exit.h"
 
+#include "src/__support/RPC/rpc_client.h"
 #include "src/__support/macros/properties/architectures.h"
 
 namespace __llvm_libc {
 
 void quick_exit(int status) {
+  // Send an EXIT request carrying `status` to the RPC server. TODO: Support
+  // asynchronous calls so we don't wait and exit from the GPU immediately.
+  rpc::client.run(
+      [&](rpc::Buffer *buffer) {
+        buffer->data[0] = rpc::Opcode::EXIT;
+        buffer->data[1] = status;
+      },
+      [](rpc::Buffer *) {});
+
 #if defined(LIBC_TARGET_ARCH_IS_NVPTX)
   asm("exit" ::: "memory");
 #elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
diff --git a/libc/src/__support/OSUtil/io.h b/libc/src/__support/OSUtil/io.h
--- a/libc/src/__support/OSUtil/io.h
+++ b/libc/src/__support/OSUtil/io.h
@@ -9,7 +9,11 @@
 #ifndef LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
 #define LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
 
-#ifdef __unix__
+#include "src/__support/macros/properties/architectures.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_GPU)
+#include "gpu/io.h"
+#elif defined(__unix__)
 #include "linux/io.h"
 #endif
 
diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_header_library(
+  rpc
+  HDRS
+    rpc.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.CPP.atomic
+)
+
+add_object_library(
+  rpc_client
+  SRCS
+    rpc_client.cpp
+  HDRS
+    rpc_client.h
+  DEPENDS
+    .rpc
+)
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/RPC/rpc.h
@@ -0,0 +1,140 @@
+//===-- Shared memory RPC client / server interface -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_H
+#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_H
+
+#include "src/__support/CPP/atomic.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc {
+namespace rpc {
+
+/// A list of opcodes that we use to invoke certain actions on the server. We
+/// reserve the values up to and including LIBC_LAST for internal libc usage.
+enum Opcode : uint64_t {
+  NOOP = 0,
+  PRINT_TO_STDERR = 1,
+  EXIT = 2,
+  LIBC_LAST = (1UL << 8) - 1,
+};
+
+/// A fixed size block of eight 64-bit words shared by the client and server.
+struct Buffer {
+  uint64_t data[8];
+};
+
+/// A common process used to synchronize communication between a client and a
+/// server. The process contains an inbox and an outbox used for signaling
+/// ownership of the shared buffer.
+struct Process {
+  cpp::Atomic<uint32_t> *inbox;  // Only loaded by the run() methods below.
+  cpp::Atomic<uint32_t> *outbox; // Loaded and stored by the run() methods.
+  Buffer *buffer;
+
+  /// Point this process at the given mailboxes and shared buffer.
+  void reset(void *inbox, void *outbox, void *buffer) {
+    *this = {
+        reinterpret_cast<cpp::Atomic<uint32_t> *>(inbox),
+        reinterpret_cast<cpp::Atomic<uint32_t> *>(outbox),
+        reinterpret_cast<Buffer *>(buffer),
+    };
+  }
+};
+
+/// The RPC client; run() drives one request through the mailbox protocol.
+struct Client : public Process {
+  template <typename F, typename U> void run(F fill, U use);
+};
+
+/// The RPC server; run() returns false when no request was pending.
+struct Server : public Process {
+  template <typename W, typename C> bool run(W work, C clean);
+};
+
+/// Run the RPC client protocol to communicate with the server. Starting from
+/// idle mailboxes (inbox and outbox both 0), one call blocks until it has:
+///   - Applied \p fill to the shared buffer and written 1 to the outbox.
+///   - Waited until the inbox is 1.
+///   - Applied \p use to the shared buffer and written 0 to the outbox.
+///   - Waited until the inbox is 0.
+template <typename F, typename U> void Client::run(F fill, U use) {
+  bool in = inbox->load(cpp::MemoryOrder::RELAXED);
+  bool out = outbox->load(cpp::MemoryOrder::RELAXED);
+  atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  // Idle: fill the request, then publish it by raising the outbox.
+  if (!in & !out) {
+    fill(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(1, cpp::MemoryOrder::RELEASE);
+    out = 1;
+  }
+  // Request posted: spin until the server raises the inbox.
+  if (!in & out) {
+    while (!in)
+      in = inbox->load(cpp::MemoryOrder::RELAXED);
+    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  }
+  // Reply ready: consume it, then acknowledge by lowering the outbox.
+  if (in & out) {
+    use(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(0, cpp::MemoryOrder::RELEASE);
+    out = 0;
+  }
+  // Acknowledged: spin until the server lowers the inbox again.
+  if (in & !out) {
+    while (in)
+      in = inbox->load(cpp::MemoryOrder::RELAXED);
+    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  }
+}
+
+/// Run the RPC server protocol to communicate with the client. This returns
+/// at once when no request is pending; otherwise it spins until the client
+/// has read the result. We perform the following high level actions:
+///   - Query if the inbox is 1 and exit if there is no work to do.
+///   - Apply \p work to the shared buffer and write 1 to the outbox.
+///   - Wait until the inbox is 0.
+///   - Apply \p clean to the shared buffer and write 0 to the outbox.
+template <typename W, typename C> bool Server::run(W work, C clean) {
+  bool in = inbox->load(cpp::MemoryOrder::RELAXED);
+  bool out = outbox->load(cpp::MemoryOrder::RELAXED);
+  atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  // Both mailboxes are clear: nothing is pending, report that to the caller.
+  if (!in & !out)
+    return false;
+  // Request received: handle it, then publish the result via the outbox.
+  if (in & !out) {
+    work(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(1, cpp::MemoryOrder::RELEASE);
+    out = 1;
+  }
+  // Result posted: spin until the client lowers the inbox.
+  if (in & out) {
+    while (in)
+      in = inbox->load(cpp::MemoryOrder::RELAXED);
+    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  }
+  // Result consumed: clean the buffer, then lower the outbox to finish.
+  if (!in & out) {
+    clean(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(0, cpp::MemoryOrder::RELEASE);
+    out = 0;
+  }
+
+  return true;
+}
+
+} // namespace rpc
+} // namespace __llvm_libc
+
+#endif
diff --git a/libc/src/__support/RPC/rpc_client.h b/libc/src/__support/RPC/rpc_client.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/RPC/rpc_client.h
@@ -0,0 +1,23 @@
+//===-- Shared memory RPC client instantiation ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_CLIENT_H
+#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_CLIENT_H
+
+#include "rpc.h"
+
+namespace __llvm_libc {
+namespace rpc {
+
+/// The libc RPC client instance; declared here and defined elsewhere.
+extern Client client;
+
+} // namespace rpc
+} // namespace __llvm_libc
+
+#endif
diff --git a/libc/src/__support/RPC/rpc_client.cpp b/libc/src/__support/RPC/rpc_client.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/RPC/rpc_client.cpp
@@ -0,0 +1,27 @@
+//===-- Shared memory RPC client instantiation ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "rpc_client.h"
+
+#include "rpc.h"
+
+namespace __llvm_libc {
+namespace rpc {
+
+/// The libc client instance used to communicate with the server. This is the
+/// definition of the object that rpc_client.h declares `extern`. Its mailbox
+/// and buffer pointers are not usable until Process::reset() has been called
+/// on it.
+Client client;
+
+/// Externally visible symbol to signify the usage of an RPC client to
+/// whomever needs to run the server.
+extern "C" [[gnu::visibility("protected")]] const bool __llvm_libc_rpc = false;
+
+} // namespace rpc
+} // namespace __llvm_libc
diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -2,11 +2,12 @@
   crt1
   SRC
     start.cpp
+  DEPENDS
+    libc.src.__support.RPC.rpc_client
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
     -nogpulib # Do not include any GPU vendor libraries.
-    -nostdinc
     -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}
     -emit-llvm # AMDGPU's intermediate object file format is bitcode.
     --target=${LIBC_GPU_TARGET_TRIPLE}
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -6,9 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/RPC/rpc_client.h"
+
 extern "C" int main(int argc, char **argv);
 
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
-_start(int argc, char **argv, int *ret) {
+_start(int argc, char **argv, int *ret, void *in, void *out, void *buffer) {
+  __llvm_libc::rpc::client.reset(in, out, buffer);
+
   __atomic_fetch_or(ret, main(argc, argv), __ATOMIC_RELAXED);
 }
diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
--- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
+++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_executable(amdhsa_loader Loader.cpp)
+add_dependencies(amdhsa_loader libc.src.__support.RPC.rpc)
+
+target_include_directories(amdhsa_loader PRIVATE ${LIBC_SOURCE_DIR})
 target_link_libraries(amdhsa_loader
   PRIVATE
   hsa-runtime64::hsa-runtime64
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -15,6 +15,8 @@
 
 #include "Loader.h"
 
+#include "src/__support/RPC/rpc.h"
+
 #include <hsa/hsa.h>
 #include <hsa/hsa_ext_amd.h>
 
@@ -31,8 +33,35 @@
   int argc;
   void *argv;
   void *ret;
+  void *inbox;
+  void *outbox;
+  void *buffer;
 };
 
+static __llvm_libc::rpc::Server server;
+
+/// Services RPC client requests until Server::run reports none is pending:
+/// PRINT_TO_STDERR writes the payload to stderr, EXIT ends the loader.
+void handle_server() {
+  auto work = [](__llvm_libc::rpc::Buffer *buffer) {
+    switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) {
+    case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR:
+      // Bound the read to the payload; it may lack a NUL terminator.
+      fprintf(stderr, "%.*s",
+              static_cast<int>(sizeof(buffer->data) - sizeof(buffer->data[0])),
+              reinterpret_cast<const char *>(&buffer->data[1]));
+      break;
+    case __llvm_libc::rpc::Opcode::EXIT:
+      exit(static_cast<int>(buffer->data[1]));
+      break;
+    default:
+      break;
+    }
+  };
+  while (server.run(work, [](__llvm_libc::rpc::Buffer *) {}))
+    ;
+}
+
 /// Print the error code and exit if \p code indicates an error.
 static void handle_error(hsa_status_t code) {
   if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
@@ -278,6 +307,26 @@
     handle_error(err);
   hsa_amd_memory_fill(dev_ret, 0, sizeof(int));
 
+  // Allocate finegrained memory for the RPC server and client to share. The
+  // memory is zeroed before use because both sides treat cleared mailboxes
+  // as the idle state of the protocol.
+  using Mailbox = __llvm_libc::cpp::Atomic<uint32_t>;
+  auto allocate_shared = [&](size_t size) {
+    void *ptr = nullptr;
+    if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+            finegrained_pool, size, /*flags=*/0, &ptr))
+      handle_error(err);
+    // The server polls this memory from the host, so the host can clear it
+    // directly before the device is given access.
+    std::memset(ptr, 0, size);
+    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, ptr);
+    return ptr;
+  };
+  // The mailbox type matches the one Process::reset() casts these pointers to.
+  void *server_inbox = allocate_shared(sizeof(Mailbox));
+  void *server_outbox = allocate_shared(sizeof(Mailbox));
+  void *buffer = allocate_shared(sizeof(__llvm_libc::rpc::Buffer));
+
   // Initialie all the arguments (explicit and implicit) to zero, then set the
   // explicit arguments to the values created above.
   std::memset(args, 0, args_size);
@@ -285,6 +334,9 @@
   kernel_args->argc = argc;
   kernel_args->argv = dev_argv;
   kernel_args->ret = dev_ret;
+  kernel_args->inbox = server_outbox; // The client reads the server's outbox.
+  kernel_args->outbox = server_inbox; // The client writes the server's inbox.
+  kernel_args->buffer = buffer;
 
   // Obtain a packet from the queue.
   uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
@@ -316,6 +368,9 @@
           hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
     handle_error(err);
 
+  // Initialize the RPC server's buffer for host-device communication.
+  server.reset(server_inbox, server_outbox, buffer);
+
   // Initialize the packet header and set the doorbell signal to begin execution
   // by the HSA runtime.
   uint16_t header =
@@ -326,11 +381,12 @@
                    __ATOMIC_RELEASE);
   hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
 
-  // Wait until the kernel has completed execution on the device.
-  while (hsa_signal_wait_scacquire(packet->completion_signal,
-                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
-                                   HSA_WAIT_STATE_ACTIVE) != 0)
-    ;
+  // Wait until the kernel has completed execution on the device. Periodically
+  // check the RPC client for work to be performed on the server.
+  while (hsa_signal_wait_scacquire(
+             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
+             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
+    handle_server();
 
   // Create a memory signal and copy the return value back from the device into
   // a new buffer.