diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -2,6 +2,8 @@
   crt1
   SRC
     start.cpp
+  DEPENDS
+    libc.src.__support.RPC.rpc_client
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of crt for amdgpu ----------------------------------===//
+//===-- Implementation of crt for nvptx -----------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/RPC/rpc_client.h"
+
 extern "C" int main(int argc, char **argv, char **envp);
 
 extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
+  __llvm_libc::rpc::client.reset(in, out, buffer);
+
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -15,6 +15,8 @@
 #include "Loader.h"
 
+#include "src/__support/RPC/rpc.h"
+
 #include "cuda.h"
 #include <cstdio>
 #include <cstdlib>
@@ -32,6 +34,30 @@
   void *buffer;
 };
 
+static __llvm_libc::rpc::Server server;
+
+/// Queries the RPC client at least once and performs server-side work if there
+/// are any active requests.
+void handle_server() {
+  while (server.handle(
+      [&](__llvm_libc::rpc::Buffer *buffer) {
+        switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) {
+        case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: {
+          fputs(reinterpret_cast<const char *>(&buffer->data[1]), stderr);
+          break;
+        }
+        case __llvm_libc::rpc::Opcode::EXIT: {
+          exit(buffer->data[1]);
+          break;
+        }
+        default:
+          return;
+        };
+      },
+      [](__llvm_libc::rpc::Buffer *buffer) {}))
+    ;
+}
+
 static void handle_error(CUresult err) {
   if (err == CUDA_SUCCESS)
     return;
@@ -106,8 +132,13 @@
   if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
     handle_error(err);
 
+  void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
+  void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
+  void *buffer = allocator(sizeof(__llvm_libc::rpc::Buffer));
+  if (!server_inbox || !server_outbox || !buffer)
+    handle_error("Failed to allocate memory for the RPC client / server.");
+
   // Set up the arguments to the '_start' kernel on the GPU.
-  // TODO: Setup RPC server implementation;
   uint64_t args_size = sizeof(kernel_args_t);
   kernel_args_t args;
   std::memset(&args, 0, args_size);
@@ -115,10 +146,16 @@
   args.argv = dev_argv;
   args.envp = dev_envp;
   args.ret = reinterpret_cast<void *>(dev_ret);
+  args.inbox = server_outbox;
+  args.outbox = server_inbox;
+  args.buffer = buffer;
   void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                          CU_LAUNCH_PARAM_END};
 
+  // Initialize the RPC server's buffer for host-device communication.
+  server.reset(server_inbox, server_outbox, buffer);
+
   // Call the kernel with the given arguments.
   if (CUresult err =
           cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
@@ -126,9 +163,10 @@
                          /*blockDimZ=*/1, 0, stream, nullptr, args_config))
     handle_error(err);
 
-  // TODO: Query the RPC server periodically while the kernel is running.
+  // Wait until the kernel has completed execution on the device. Periodically
+  // check the RPC client for work to be performed on the server.
   while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
-    ;
+    handle_server();
 
   // Copy the return value back from the kernel and wait.
   int host_ret = 0;
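
Note on the mailbox wiring above: the loader deliberately crosses the two flags when building the kernel arguments (args.inbox = server_outbox, args.outbox = server_inbox), so whatever the device-side client writes to its outbox is exactly what the host-side server reads from its inbox, and vice versa, while both endpoints share the same Buffer. The following is a minimal conceptual sketch of that pairing in plain C++; the names (SharedState, Endpoint, client_view, server_view) and the use of std::atomic are illustrative assumptions, not the actual __llvm_libc::rpc types.

#include <atomic>
#include <cstdint>

// Hypothetical sketch: one shared allocation holds two mailbox flags and an
// argument buffer. Each endpoint sees the same memory with the inbox/outbox
// roles swapped, mirroring how the loader hands server_outbox to the client
// as its inbox and server_inbox as its outbox.
struct SharedState {
  std::atomic<uint32_t> client_to_server{0}; // client outbox == server inbox
  std::atomic<uint32_t> server_to_client{0}; // server outbox == client inbox
  uint64_t buffer[8]{};                      // shared argument payload
};

struct Endpoint {
  std::atomic<uint32_t> *inbox;  // flag this side waits on
  std::atomic<uint32_t> *outbox; // flag this side signals
  uint64_t *buffer;              // shared payload, identical for both sides
};

inline Endpoint client_view(SharedState &s) {
  return {&s.server_to_client, &s.client_to_server, s.buffer};
}

inline Endpoint server_view(SharedState &s) {
  return {&s.client_to_server, &s.server_to_client, s.buffer};
}

Crossing the mailboxes this way lets both sides run the same handshake logic: each endpoint only ever waits on its inbox and signals through its outbox, which is why the device-side client.reset() in _start and the host-side server.reset() in the loader take the same three pointers with inbox and outbox swapped.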