diff --git a/libc/docs/gpu/rpc.rst b/libc/docs/gpu/rpc.rst
--- a/libc/docs/gpu/rpc.rst
+++ b/libc/docs/gpu/rpc.rst
@@ -15,3 +15,10 @@
 require support from the operating system. We instead implement a remote
 procedure call (RPC) interface to allow submitting work from the GPU to a host
 server that forwards it to the host system.
+
+Extensions
+----------
+
+The operation that the RPC server should perform is identified by a 16-bit
+opcode. The first 32768 opcodes are reserved for ``libc``; all other values
+are free for extensions to use.
diff --git a/libc/include/llvm-libc-types/rpc_opcodes_t.h b/libc/include/llvm-libc-types/rpc_opcodes_t.h
--- a/libc/include/llvm-libc-types/rpc_opcodes_t.h
+++ b/libc/include/llvm-libc-types/rpc_opcodes_t.h
@@ -20,11 +20,6 @@
   RPC_MALLOC = 7,
   RPC_FREE = 8,
   RPC_HOST_CALL = 9,
-  // TODO: Move these out of here and handle then with custom handlers in the
-  // loader.
-  RPC_TEST_INCREMENT = 1000,
-  RPC_TEST_INTERFACE = 1001,
-  RPC_TEST_STREAM = 1002,
 } rpc_opcode_t;
 
 #endif // __LLVM_LIBC_TYPES_RPC_OPCODE_H__
diff --git a/libc/include/llvm-libc-types/test_rpc_opcodes_t.h b/libc/include/llvm-libc-types/test_rpc_opcodes_t.h
new file mode 100644
--- /dev/null
+++ b/libc/include/llvm-libc-types/test_rpc_opcodes_t.h
@@ -0,0 +1,21 @@
+//===-- Definition of RPC opcodes used for internal tests -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __LLVM_LIBC_TYPES_TEST_RPC_OPCODE_H__
+#define __LLVM_LIBC_TYPES_TEST_RPC_OPCODE_H__
+
+// We consider the first 32768 opcodes as reserved for libc purposes. We allow
+// extensions to use any other number without conflicting with anything else.
+typedef enum : unsigned short {
+  RPC_TEST_NOOP = 1 << 15, // 32768, the first opcode past the reserved range.
+  RPC_TEST_INCREMENT,      // 32769
+  RPC_TEST_INTERFACE,      // 32770
+  RPC_TEST_STREAM,         // 32771
+} rpc_test_opcode_t;
+
+#endif // __LLVM_LIBC_TYPES_TEST_RPC_OPCODE_H__
diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
--- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "test/IntegrationTest/test.h"
diff --git a/libc/test/integration/startup/gpu/rpc_stream_test.cpp b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
--- a/libc/test/integration/startup/gpu/rpc_stream_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/integer_to_string.h"
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "test/IntegrationTest/test.h"
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -10,6 +10,9 @@
 #define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
 
 #include "utils/gpu/server/rpc_server.h"
+
+#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
@@ -104,4 +107,119 @@
   handle_error("Failure in the RPC server\n");
 }
 
+/// Register the RPC callbacks that service the `libc` integration test opcodes
+/// (increment, interface, and stream) on the device identified by \p device_id.
+inline void register_rpc_callbacks(uint32_t device_id) {
+  // Ping test: add one to the first 64-bit word of the buffer and reply.
+  rpc_register_callback(
+      device_id, static_cast<rpc_opcode_t>(RPC_TEST_INCREMENT),
+      [](rpc_port_t port, void *data) {
+        rpc_recv_and_send(
+            port,
+            [](rpc_buffer_t *buffer, void *data) {
+              reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
+            },
+            data);
+      },
+      nullptr);
+
+  // Interface test: fixed recv/send sequence; every send increments `cnt`.
+  rpc_register_callback(
+      device_id, static_cast<rpc_opcode_t>(RPC_TEST_INTERFACE),
+      [](rpc_port_t port, void *data) {
+        uint64_t cnt = 0;
+        bool end_with_recv = false;
+        rpc_recv(
+            port,
+            [](rpc_buffer_t *buffer, void *data) {
+              *reinterpret_cast<bool *>(data) = buffer->data[0];
+            },
+            &end_with_recv);
+        rpc_recv(
+            port,
+            [](rpc_buffer_t *buffer, void *data) {
+              *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
+            },
+            &cnt);
+        rpc_send(
+            port,
+            [](rpc_buffer_t *buffer, void *data) {
+              uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
+              buffer->data[0] = cnt = cnt + 1;
+            },
+            &cnt);
+        rpc_recv(
+            port,
+            [](rpc_buffer_t *buffer, void *data) {
+              *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
+            },
+            &cnt);
+        rpc_send(
+            port,
+            [](rpc_buffer_t *buffer, void *data) {
+              uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
+              buffer->data[0] = cnt = cnt + 1;
+            },
+            &cnt);
+        rpc_recv(
+            port,
+            [](rpc_buffer_t *buffer, void *data) {
+              *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
+            },
+            &cnt);
+        rpc_recv(
+            port,
+            [](rpc_buffer_t *buffer, void *data) {
+              *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
+            },
+            &cnt);
+        rpc_send(
+            port,
+            [](rpc_buffer_t *buffer, void *data) {
+              uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
+              buffer->data[0] = cnt = cnt + 1;
+            },
+            &cnt);
+        rpc_send(
+            port,
+            [](rpc_buffer_t *buffer, void *data) {
+              uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
+              buffer->data[0] = cnt = cnt + 1;
+            },
+            &cnt);
+        if (end_with_recv)
+          rpc_recv(
+              port,
+              [](rpc_buffer_t *buffer, void *data) {
+                *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
+              },
+              &cnt);
+        else
+          rpc_send(
+              port,
+              [](rpc_buffer_t *buffer, void *data) {
+                uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
+                buffer->data[0] = cnt = cnt + 1;
+              },
+              &cnt);
+      },
+      nullptr);
+
+  // Stream test: receive the streamed buffers, send them back, then free them.
+  rpc_register_callback(
+      device_id, static_cast<rpc_opcode_t>(RPC_TEST_STREAM),
+      [](rpc_port_t port, void *data) {
+        uint64_t sizes[RPC_MAXIMUM_LANE_SIZE] = {0};
+        void *dst[RPC_MAXIMUM_LANE_SIZE] = {nullptr};
+        rpc_recv_n(
+            port, dst, sizes,
+            [](uint64_t size, void *) -> void * { return new char[size]; },
+            nullptr);
+        rpc_send_n(port, dst, sizes);
+        for (uint64_t i = 0; i < RPC_MAXIMUM_LANE_SIZE; ++i)
+          delete[] static_cast<char *>(dst[i]); // Pairs with `new char[]`.
+      },
+      nullptr);
+}
+
 #endif
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -150,6 +150,8 @@
 
   // Register RPC callbacks for the malloc and free functions on HSA.
   uint32_t device_id = 0;
+  register_rpc_callbacks(device_id);
+
   auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
   rpc_register_callback(
       device_id, RPC_MALLOC,
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -177,6 +177,8 @@
 
   // Register RPC callbacks for the malloc and free functions on HSA.
   uint32_t device_id = 0;
+  register_rpc_callbacks(device_id);
+
   rpc_register_callback(
       device_id, RPC_MALLOC,
       [](rpc_port_t port, void *data) {
diff --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/rpc_server.h
--- a/libc/utils/gpu/server/rpc_server.h
+++ b/libc/utils/gpu/server/rpc_server.h
@@ -20,9 +20,12 @@
 /// The maxium number of ports that can be opened for any server.
 const uint64_t RPC_MAXIMUM_PORT_COUNT = 512;
 
+/// The maximum number of parallel lanes that we can support.
+const uint64_t RPC_MAXIMUM_LANE_SIZE = 64;
+
 /// The symbol name associated with the client for use with the LLVM C library
 /// implementation.
-inline const char *rpc_client_symbol_name = "__llvm_libc_rpc_client";
+const char *const rpc_client_symbol_name = "__llvm_libc_rpc_client";
 
 /// status codes.
 typedef enum {
@@ -100,9 +103,19 @@
 /// Use the \p port to send a buffer using the \p callback.
 void rpc_send(rpc_port_t port, rpc_port_callback_ty callback, void *data);
 
+/// Use the \p port to send the bytes in \p src with the lengths in \p size.
+/// The inputs are arrays of at least the configured lane size.
+void rpc_send_n(rpc_port_t port, const void *const *src, uint64_t *size);
+
 /// Use the \p port to recieve a buffer using the \p callback.
 void rpc_recv(rpc_port_t port, rpc_port_callback_ty callback, void *data);
 
+/// Use the \p port to receive bytes into \p dst, storing their lengths in
+/// \p size. The inputs are arrays of at least the configured lane size. The
+/// \p alloc function allocates memory for the received bytes.
+void rpc_recv_n(rpc_port_t port, void **dst, uint64_t *size, rpc_alloc_ty alloc,
+                void *data);
+
 /// Use the \p port to receive and send a buffer using the \p callback.
 void rpc_recv_and_send(rpc_port_t port, rpc_port_callback_ty callback,
                        void *data);
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -26,6 +26,9 @@
 static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT,
               "Incorrect maximum port count");
 
+static_assert(RPC_MAXIMUM_LANE_SIZE == rpc::MAX_LANE_SIZE,
+              "Incorrect maximum lane size");
+
 // The client needs to support different lane sizes for the SIMT model. Because
 // of this we need to select between the possible sizes that the client can use.
 struct Server {
@@ -141,43 +144,6 @@
       });
       break;
     }
-    // TODO: Move handling of these  test cases to the loader implementation.
-    case RPC_TEST_INCREMENT: {
-      port->recv_and_send([](rpc::Buffer *buffer) {
-        reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
-      });
-      break;
-    }
-    case RPC_TEST_INTERFACE: {
-      uint64_t cnt = 0;
-      bool end_with_recv;
-      port->recv([&](rpc::Buffer *buffer) { end_with_recv = buffer->data[0]; });
-      port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-      port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-      port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-      port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-      port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-      port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-      port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-      port->send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-      if (end_with_recv)
-        port->recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-      else
-        port->send(
-            [&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-      break;
-    }
-    case RPC_TEST_STREAM: {
-      uint64_t sizes[rpc::MAX_LANE_SIZE] = {0};
-      void *dst[rpc::MAX_LANE_SIZE] = {nullptr};
-      port->recv_n(dst, sizes, [](uint64_t size) { return new char[size]; });
-      port->send_n(dst, sizes);
-      for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {
-        if (dst[i])
-          delete[] reinterpret_cast<uint8_t *>(dst[i]);
-      }
-      break;
-    }
     case RPC_NOOP: {
       port->recv([](rpc::Buffer *) {});
       break;
@@ -375,6 +341,11 @@
       port);
 }
 
+void rpc_send_n(rpc_port_t ref, const void *const *src, uint64_t *size) {
+  auto handle = get_port(ref);
+  std::visit([=](auto &p) { p->send_n(src, size); }, handle);
+}
+
 void rpc_recv(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
   auto port = get_port(ref);
   std::visit(
@@ -386,6 +357,13 @@
       port);
 }
 
+void rpc_recv_n(rpc_port_t ref, void **dst, uint64_t *size, rpc_alloc_ty alloc,
+                void *data) {
+  auto handle = get_port(ref);
+  auto allocate = [=](uint64_t bytes) { return alloc(bytes, data); };
+  std::visit([=](auto &p) { p->recv_n(dst, size, allocate); }, handle);
+}
+
 void rpc_recv_and_send(rpc_port_t ref, rpc_port_callback_ty callback,
                        void *data) {
   auto port = get_port(ref);