diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -89,6 +89,8 @@
     libc.src.stdio.feof
     libc.src.stdio.ferror
     libc.src.stdio.clearerr
+    libc.src.stdio.printf
+    libc.src.stdio.fprintf
     libc.src.stdio.snprintf
     libc.src.stdio.vsnprintf
     libc.src.stdio.puts
diff --git a/libc/include/llvm-libc-types/rpc_opcodes_t.h b/libc/include/llvm-libc-types/rpc_opcodes_t.h
--- a/libc/include/llvm-libc-types/rpc_opcodes_t.h
+++ b/libc/include/llvm-libc-types/rpc_opcodes_t.h
@@ -25,6 +25,9 @@
   RPC_FEOF = 12,
   RPC_FERROR = 13,
   RPC_CLEARERR = 14,
+  RPC_PRINTF_TO_STDOUT = 15,
+  RPC_PRINTF_TO_STDERR = 16,
+  RPC_PRINTF_TO_STREAM = 17
 } rpc_opcode_t;
 
 #endif // __LLVM_LIBC_TYPES_RPC_OPCODE_H__
diff --git a/libc/src/__support/arg_list.h b/libc/src/__support/arg_list.h
--- a/libc/src/__support/arg_list.h
+++ b/libc/src/__support/arg_list.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_ARG_LIST_H
 #define LLVM_LIBC_SRC___SUPPORT_ARG_LIST_H
 
+#include "src/__support/CPP/type_traits.h"
 #include "src/__support/common.h"
 
 #include <stdarg.h>
@@ -60,6 +61,35 @@
   size_t read_count() const { return arg_counter; }
 };
 
+// Used for the GPU implementation of `printf`. This models a variadic list as a
+// simple array of pointers that are built manually by the implementation.
+class ArrayArgList {
+  size_t idx;  // Index of the next entry of `list` to hand out.
+  void **list; // Array of pointers, one entry per argument; not owned here.
+
+public:
+  LIBC_INLINE ArrayArgList(void **list) : idx(0), list(list) {}
+  // The va_list is ignored (presumably kept for ArgList compatibility); start
+  // from an empty list rather than leaving the members uninitialized.
+  LIBC_INLINE ArrayArgList(va_list) : idx(0), list(nullptr) {}
+  LIBC_INLINE ArrayArgList(const ArrayArgList &other) = default;
+  LIBC_INLINE ~ArrayArgList() = default;
+
+  LIBC_INLINE ArrayArgList &operator=(const ArrayArgList &rhs) = default;
+
+  // Returns the next argument and advances to the following entry. A `char *`
+  // request yields the stored pointer itself (the entry is the string); any
+  // other type is loaded from the storage the entry points to. No bounds
+  // checking is done, so callers must not read more entries than were given.
+  template <class T> LIBC_INLINE T next_var() {
+    void *entry = list[idx++];
+    if constexpr (cpp::is_same_v<T, char *>)
+      return reinterpret_cast<T>(entry);
+    else
+      return *reinterpret_cast<T *>(entry);
+  }
+};
+
 } // namespace internal
 } // namespace __llvm_libc
 
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -26,6 +26,9 @@
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/generic)
 endif()
 
+add_subdirectory(printf_core)
+add_subdirectory(scanf_core)
+
 add_entrypoint_object(
   fflush
   SRCS
@@ -183,59 +186,6 @@
     libc.src.stdio.printf_core.writer
 )
 
-list(APPEND printf_deps
-      libc.src.__support.arg_list
-      libc.src.stdio.printf_core.vfprintf_internal
-)
-
-if(LIBC_CONF_PRINTF_DISABLE_FLOAT)
-  list(APPEND printf_copts "-DLIBC_COPT_PRINTF_DISABLE_FLOAT")
-endif()
-if(LIBC_CONF_PRINTF_DISABLE_INDEX_MODE)
-  list(APPEND printf_copts "-DLIBC_COPT_PRINTF_DISABLE_INDEX_MODE")
-endif()
-if(LIBC_CONF_PRINTF_DISABLE_WRITE_INT)
-  list(APPEND printf_copts "-DLIBC_COPT_PRINTF_DISABLE_WRITE_INT")
-endif()
-if(LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE)
-  list(APPEND printf_copts "-DLIBC_COPT_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE")
-endif()
-
-if(LLVM_LIBC_FULL_BUILD)
-  list(APPEND printf_deps
-      libc.src.__support.File.file
-      libc.src.__support.File.platform_file
-      libc.src.__support.File.platform_stdout
-  )
-else()
-  list(APPEND printf_copts "-DLIBC_COPT_PRINTF_USE_SYSTEM_FILE")
-endif()
-
-add_entrypoint_object(
-  printf
-  SRCS
-    printf.cpp
-  HDRS
-    printf.h
-  DEPENDS
-    ${printf_deps}
-  COMPILE_OPTIONS
-    ${printf_copts}
-)
-
-add_entrypoint_object(
-  fprintf
-  SRCS
-    fprintf.cpp
-  HDRS
-    fprintf.h
-  DEPENDS
-    libc.src.__support.arg_list
-    libc.src.stdio.printf_core.vfprintf_internal
-  COMPILE_OPTIONS
-    ${printf_copts}
-)
-
 add_entrypoint_object(
   vsprintf
   SRCS
@@ -258,34 +208,6 @@
     libc.src.stdio.printf_core.writer
 )
 
-add_entrypoint_object(
-  vprintf
-  SRCS
-    vprintf.cpp
-  HDRS
-    vprintf.h
-  DEPENDS
-    ${printf_deps}
-  COMPILE_OPTIONS
-    ${printf_copts}
-)
-
-add_entrypoint_object(
-  vfprintf
-  SRCS
-    vfprintf.cpp
-  HDRS
-    vfprintf.h
-  DEPENDS
-    libc.src.__support.arg_list
-    libc.src.stdio.printf_core.vfprintf_internal
-  COMPILE_OPTIONS
-    ${printf_copts}
-)
-
-add_subdirectory(printf_core)
-add_subdirectory(scanf_core)
-
 add_entrypoint_object(
   ftell
   SRCS
@@ -334,3 +256,7 @@
 add_stdio_entrypoint_object(stdin)
 add_stdio_entrypoint_object(stdout)
 add_stdio_entrypoint_object(stderr)
+add_stdio_entrypoint_object(printf)
+add_stdio_entrypoint_object(fprintf)
+add_stdio_entrypoint_object(vprintf)
+add_stdio_entrypoint_object(vfprintf)
diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt
--- a/libc/src/stdio/generic/CMakeLists.txt
+++ b/libc/src/stdio/generic/CMakeLists.txt
@@ -338,3 +338,81 @@
     libc.src.__support.File.file
     libc.src.__support.File.platform_stderr
 )
+
+list(APPEND printf_deps
+      libc.src.__support.arg_list
+      libc.src.stdio.printf_core.vfprintf_internal
+)
+
+if(LIBC_CONF_PRINTF_DISABLE_FLOAT)
+  list(APPEND printf_copts "-DLIBC_COPT_PRINTF_DISABLE_FLOAT")
+endif()
+if(LIBC_CONF_PRINTF_DISABLE_INDEX_MODE)
+  list(APPEND printf_copts "-DLIBC_COPT_PRINTF_DISABLE_INDEX_MODE")
+endif()
+if(LIBC_CONF_PRINTF_DISABLE_WRITE_INT)
+  list(APPEND printf_copts "-DLIBC_COPT_PRINTF_DISABLE_WRITE_INT")
+endif()
+if(LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE)
+  list(APPEND printf_copts "-DLIBC_COPT_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE")
+endif()
+
+if(LLVM_LIBC_FULL_BUILD)
+  list(APPEND printf_deps
+      libc.src.__support.File.file
+      libc.src.__support.File.platform_file
+      libc.src.__support.File.platform_stdout
+  )
+else()
+  list(APPEND printf_copts "-DLIBC_COPT_PRINTF_USE_SYSTEM_FILE")
+endif()
+
+add_entrypoint_object(
+  printf
+  SRCS
+    printf.cpp
+  HDRS
+    ../printf.h
+  DEPENDS
+    ${printf_deps}
+  COMPILE_OPTIONS
+    ${printf_copts}
+)
+
+add_entrypoint_object(
+  fprintf
+  SRCS
+    fprintf.cpp
+  HDRS
+    ../fprintf.h
+  DEPENDS
+    libc.src.__support.arg_list
+    libc.src.stdio.printf_core.vfprintf_internal
+  COMPILE_OPTIONS
+    ${printf_copts}
+)
+
+add_entrypoint_object(
+  vprintf
+  SRCS
+    vprintf.cpp
+  HDRS
+    ../vprintf.h
+  DEPENDS
+    ${printf_deps}
+  COMPILE_OPTIONS
+    ${printf_copts}
+)
+
+add_entrypoint_object(
+  vfprintf
+  SRCS
+    vfprintf.cpp
+  HDRS
+    ../vfprintf.h
+  DEPENDS
+    libc.src.__support.arg_list
+    libc.src.stdio.printf_core.vfprintf_internal
+  COMPILE_OPTIONS
+    ${printf_copts}
+)
diff --git a/libc/src/stdio/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp
rename from libc/src/stdio/fprintf.cpp
rename to libc/src/stdio/generic/fprintf.cpp
diff --git a/libc/src/stdio/printf.cpp b/libc/src/stdio/generic/printf.cpp
rename from libc/src/stdio/printf.cpp
rename to libc/src/stdio/generic/printf.cpp
diff --git a/libc/src/stdio/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp
rename from libc/src/stdio/vfprintf.cpp
rename to libc/src/stdio/generic/vfprintf.cpp
diff --git a/libc/src/stdio/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp
rename from libc/src/stdio/vprintf.cpp
rename to libc/src/stdio/generic/vprintf.cpp
diff --git a/libc/src/stdio/gpu/CMakeLists.txt b/libc/src/stdio/gpu/CMakeLists.txt
--- a/libc/src/stdio/gpu/CMakeLists.txt
+++ b/libc/src/stdio/gpu/CMakeLists.txt
@@ -1,3 +1,33 @@
+add_entrypoint_object(
+  stdin
+  SRCS
+    stdin.cpp
+  HDRS
+    ../stdin.h
+  DEPENDS
+    libc.include.stdio
+)
+
+add_entrypoint_object(
+  stdout
+  SRCS
+    stdout.cpp
+  HDRS
+    ../stdout.h
+  DEPENDS
+    libc.include.stdio
+)
+
+add_entrypoint_object(
+  stderr
+  SRCS
+    stderr.cpp
+  HDRS
+    ../stderr.h
+  DEPENDS
+    libc.include.stdio
+)
+
 add_header_library(
   gpu_file
   HDRS
@@ -10,6 +40,24 @@
     .stderr
 )
 
+add_header_library(
+  gpu_parser
+  HDRS
+    parser.h
+  DEPENDS
+    libc.src.__support.arg_list
+)
+
+add_header_library(
+  gpu_printf_impl
+  HDRS
+    printf_impl.h
+  DEPENDS
+    libc.src.__support.arg_list
+    .gpu_parser
+    .gpu_file
+)
+
 add_entrypoint_object(
   feof
   SRCS
@@ -19,6 +67,9 @@
   DEPENDS
     libc.include.stdio
     libc.src.__support.RPC.rpc_client
+    .stdin
+    .stdout
+    .stderr
 )
 
 add_entrypoint_object(
@@ -30,6 +81,9 @@
   DEPENDS
     libc.include.stdio
     libc.src.__support.RPC.rpc_client
+    .stdin
+    .stdout
+    .stderr
 )
 
 add_entrypoint_object(
@@ -41,6 +95,9 @@
   DEPENDS
     libc.include.stdio
     libc.src.__support.RPC.rpc_client
+    .stdin
+    .stdout
+    .stderr
 )
 
 add_entrypoint_object(
@@ -219,31 +276,27 @@
 )
 
 add_entrypoint_object(
-  stdin
-  SRCS
-    stdin.cpp
-  HDRS
-    ../stdin.h
-  DEPENDS
-    libc.include.stdio
-)
-
-add_entrypoint_object(
-  stdout
+  printf
   SRCS
-    stdout.cpp
+    printf.cpp
   HDRS
-    ../stdout.h
+    ../printf.h
   DEPENDS
     libc.include.stdio
+    libc.src.__support.arg_list
+    libc.src.__support.RPC.rpc_client
+    .gpu_printf_impl
 )
 
 add_entrypoint_object(
-  stderr
+  fprintf
   SRCS
-    stderr.cpp
+    fprintf.cpp
   HDRS
-    ../stderr.h
+    ../fprintf.h
   DEPENDS
     libc.include.stdio
+    libc.src.__support.arg_list
+    libc.src.__support.RPC.rpc_client
+    .gpu_printf_impl
 )
diff --git a/libc/src/stdio/fprintf.cpp b/libc/src/stdio/gpu/fprintf.cpp
rename from libc/src/stdio/fprintf.cpp
rename to libc/src/stdio/gpu/fprintf.cpp
--- a/libc/src/stdio/fprintf.cpp
+++ b/libc/src/stdio/gpu/fprintf.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of fprintf -------------------------------*- C++ -*-===//
+//===-- GPU Implementation of fprintf -------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,12 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/stdio/fprintf.h"
+#include "src/stdio/gpu/printf_impl.h"
 
-#include "src/__support/File/file.h"
-#include "src/__support/arg_list.h"
-#include "src/stdio/printf_core/vfprintf_internal.h"
-
-#include <stdarg.h>
 #include <stdio.h>
 
 namespace __llvm_libc {
@@ -22,12 +18,10 @@
                     ...)) {
   va_list vlist;
   va_start(vlist, format);
-  internal::ArgList args(vlist); // This holder class allows for easier copying
-                                 // and pointer semantics, as well as handling
-                                 // destruction automatically.
+  internal::ArgList args(vlist);
   va_end(vlist);
-  int ret_val = printf_core::vfprintf_internal(stream, format, args);
-  return ret_val;
+
+  return static_cast<int>(printf_common(stream, format, args));
 }
 
 } // namespace __llvm_libc
diff --git a/libc/src/stdio/gpu/parser.h b/libc/src/stdio/gpu/parser.h
new file mode 100644
--- /dev/null
+++ b/libc/src/stdio/gpu/parser.h
@@ -0,0 +1,236 @@
+//===--------------- Printf format parsing for the GPU --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_GPU_PARSER_H
+#define LLVM_LIBC_SRC_STDIO_GPU_PARSER_H
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/arg_list.h"
+#include "src/__support/ctype_utils.h"
+#include "src/string/string_utils.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace __llvm_libc {
+
+namespace gpu {
+
+// These sizes need to be compatible to simplify parsing the lengths.
+static_assert(sizeof(uintptr_t) == sizeof(long) &&
+                  sizeof(uintptr_t) == sizeof(long long) &&
+                  sizeof(uintptr_t) == sizeof(intmax_t) &&
+                  sizeof(uintptr_t) == sizeof(size_t) &&
+                  sizeof(uintptr_t) == sizeof(ptrdiff_t),
+              "Invalid lengths for target");
+
+// Whether an integer conversion reads a 32-bit or a 64-bit argument.
+enum class LengthModifier { none = 0, l = 1 };
+
+// Tracks which '*' size argument the previous call returned, so that the next
+// call resumes in the middle of the same specifier.
+enum class SizeArgument { finished = 0, width = 1, precision = 2 };
+
+// A single step of the parse: at most one argument taken from the list.
+struct Specifier {
+  uintptr_t raw_value; // The argument's bits, or the address of a string.
+  bool is_string;      // `raw_value` is a pointer to a C string.
+  bool is_empty;       // No argument was read during this step.
+};
+
+// A reduced printf format parser. It does no formatting; it only walks the
+// format string and pulls the matching arguments out of `args`, one per call
+// to `get_next_specifier`.
+template <typename ArgProvider> struct MicroParser {
+  LIBC_INLINE MicroParser(const char *format, ArgProvider args)
+      : format(format), args(args) {}
+
+  // Returns true for the characters accepted as printf flags.
+  LIBC_INLINE static constexpr bool is_flag(char c) {
+    switch (c) {
+    case ' ':
+    case '-':
+    case '+':
+    case '#':
+    case '0':
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  // Advances through the format string and returns the next argument it
+  // consumes. A '*' width or precision is returned as its own step; the
+  // conversion it belongs to is finished by the following call. The result has
+  // `is_empty` set when the step read no argument.
+  LIBC_INLINE Specifier get_next_specifier() {
+    Specifier specifier{};
+    // Skip any characters until we reach a control character or the end. This
+    // is bypassed when resuming after a '*' argument.
+    if (size_pos == SizeArgument::finished)
+      while (format[cur_pos] != '\0' && format[cur_pos] != '%')
+        ++cur_pos;
+
+    // Step over the '%' (or the '*' handled by the previous call).
+    if (format[cur_pos] != '\0')
+      cur_pos++;
+
+    // Skip all characters that aren't related to the length or type.
+    if (size_pos == SizeArgument::finished) {
+      while (format[cur_pos] != '\0' && is_flag(format[cur_pos]))
+        ++cur_pos;
+
+      if (format[cur_pos] == '*') {
+        specifier.raw_value =
+            static_cast<uintptr_t>(args.template next_var<uint32_t>());
+        size_pos = SizeArgument::width;
+        return specifier;
+      }
+
+      while (format[cur_pos] != '\0' && internal::isdigit(format[cur_pos]))
+        ++cur_pos;
+    }
+
+    if (format[cur_pos] == '.' && size_pos != SizeArgument::precision) {
+      ++cur_pos;
+
+      if (format[cur_pos] == '*') {
+        specifier.raw_value =
+            static_cast<uintptr_t>(args.template next_var<uint32_t>());
+        size_pos = SizeArgument::precision;
+        return specifier;
+      }
+
+      while (format[cur_pos] != '\0' && internal::isdigit(format[cur_pos]))
+        ++cur_pos;
+    }
+
+    LengthModifier lm = parse_length_modifier();
+
+    // We use the type and length modifier to access the variadic argument
+    // appropriately. All arguments are promoted to a simple integer.
+    switch (format[cur_pos]) {
+    case 'c':
+      specifier.raw_value =
+          static_cast<uintptr_t>(args.template next_var<uint32_t>());
+      break;
+    case 'd':
+    case 'i':
+    case 'o':
+    case 'x':
+    case 'X':
+    case 'u':
+      if (lm == LengthModifier::none)
+        specifier.raw_value =
+            static_cast<uintptr_t>(args.template next_var<uint32_t>());
+      else
+        specifier.raw_value =
+            static_cast<uintptr_t>(args.template next_var<uint64_t>());
+      break;
+    case 'f':
+    case 'F':
+    case 'e':
+    case 'E':
+    case 'a':
+    case 'A':
+    case 'g':
+    case 'G':
+      specifier.raw_value =
+          cpp::bit_cast<uintptr_t>(args.template next_var<double>());
+      break;
+    case 'p':
+      specifier.raw_value =
+          reinterpret_cast<uintptr_t>(args.template next_var<void *>());
+      break;
+    case 's':
+      // Strings require special handling as they cannot simply be promoted.
+      specifier.raw_value =
+          reinterpret_cast<uintptr_t>(args.template next_var<void *>());
+      specifier.is_string = true;
+      break;
+    default:
+      // This was a malformed input or a '%' literal. A terminating '%' has to
+      // be consumed here, otherwise the next call would treat it as the start
+      // of a new specifier and "%%c" would read an argument for the 'c'.
+      specifier.is_empty = true;
+      if (format[cur_pos] == '%')
+        ++cur_pos;
+      break;
+    }
+
+    size_pos = SizeArgument::finished;
+    return specifier;
+  }
+
+  // Returns true once the end of the format string has been reached and `cur`
+  // carried no argument.
+  LIBC_INLINE bool end(const Specifier &cur) const {
+    return format[cur_pos] == '\0' && cur.is_empty;
+  }
+
+  // Returns the number of bytes that represent `cur`: nothing for an empty
+  // step, the string including its terminator, or one pointer-sized value.
+  LIBC_INLINE size_t get_size(const Specifier &cur) const {
+    if (cur.is_empty)
+      return 0;
+    else if (cur.is_string)
+      return internal::string_length(reinterpret_cast<char *>(cur.raw_value)) +
+             1;
+    else
+      return sizeof(uintptr_t);
+  }
+
+  // Returns the address of the bytes described by `get_size`. For non-string
+  // values this points into `cur` itself, so `cur` must outlive its use.
+  LIBC_INLINE const void *get_pointer(const Specifier &cur) const {
+    if (cur.is_string)
+      return reinterpret_cast<void *>(cur.raw_value);
+    else
+      return &cur.raw_value;
+  }
+
+private:
+  // Consumes an optional length modifier and reports whether the following
+  // integer conversion takes a 64-bit argument.
+  LIBC_INLINE LengthModifier parse_length_modifier() {
+    // We are only concerned with whether or not the length specifier is larger
+    // than a regular integer.
+    switch (format[cur_pos]) {
+    case 'l':
+      if (format[cur_pos + 1] == 'l')
+        ++cur_pos;
+      [[fallthrough]];
+    case 't':
+    case 'j':
+    case 'z':
+      ++cur_pos;
+      return LengthModifier::l;
+    case 'h':
+      if (format[cur_pos + 1] == 'h')
+        ++cur_pos;
+      [[fallthrough]];
+    case 'q':
+    case 'L':
+      ++cur_pos;
+      return LengthModifier::none;
+    default:
+      return LengthModifier::none;
+    }
+  }
+
+  const char *__restrict const format;
+  ArgProvider args;
+  uint32_t cur_pos = 0;
+  SizeArgument size_pos = SizeArgument::finished;
+};
+
+} // namespace gpu
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDIO_GPU_PARSER_H
diff --git a/libc/src/stdio/gpu/printf.cpp b/libc/src/stdio/gpu/printf.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/stdio/gpu/printf.cpp
@@ -0,0 +1,25 @@
+//===-- GPU Implementation of printf --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/printf.h"
+#include "src/stdio/gpu/printf_impl.h"
+
+#include <stdio.h>
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) {
+  va_list vlist;
+  va_start(vlist, format);
+  internal::ArgList args(vlist);
+  va_end(vlist);
+  // Forward the wrapped argument list to the shared RPC-based implementation.
+  return static_cast<int>(printf_common(stdout, format, args));
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/stdio/gpu/printf_impl.h b/libc/src/stdio/gpu/printf_impl.h
new file mode 100644
--- /dev/null
+++ b/libc/src/stdio/gpu/printf_impl.h
@@ -0,0 +1,71 @@
+//===-- RPC implementation of printf for the GPU ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_GPU_PRINTF_IMPL_H
+#define LLVM_LIBC_SRC_STDIO_GPU_PRINTF_IMPL_H
+
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/arg_list.h"
+#include "src/stdio/gpu/parser.h"
+
+#include <stdio.h>
+
+namespace __llvm_libc {
+
+// Sends the format string and every argument it consumes to the RPC server
+// using `opcode`, then returns the value the server replies with. `stream` is
+// only transmitted for RPC_PRINTF_TO_STREAM; the other opcodes name the stream
+// implicitly.
+template <unsigned opcode>
+LIBC_INLINE uint64_t printf_impl(::FILE *__restrict stream,
+                                 const char *__restrict format,
+                                 internal::ArgList &args) {
+  rpc::Client::Port port = rpc::client.open<opcode>();
+  if constexpr (opcode == RPC_PRINTF_TO_STREAM)
+    port.send([&](rpc::Buffer *buffer) {
+      buffer->data[0] = reinterpret_cast<uintptr_t>(stream);
+    });
+
+  // Send the format string, including its null terminator.
+  port.send_n(format, internal::string_length(format) + 1);
+
+  // Send one packet per step of the parser. The ballot keeps the loop going
+  // while any lane in `mask` still has work left; lanes that are already done
+  // contribute zero-sized packets.
+  uint64_t mask = gpu::get_lane_mask();
+  gpu::MicroParser<internal::ArgList> parser(format, args);
+  for (gpu::Specifier cur = parser.get_next_specifier();
+       gpu::ballot(mask, !parser.end(cur)); cur = parser.get_next_specifier())
+    port.send_n(parser.get_pointer(cur), parser.get_size(cur));
+
+  // Wait for the server's reply before releasing the port.
+  uint64_t ret = 0;
+  port.recv([&](rpc::Buffer *buffer) {
+    ret = reinterpret_cast<uint64_t *>(buffer->data)[0];
+  });
+  port.close();
+
+  return ret;
+}
+
+// Selects the opcode that matches `stream` and forwards the request to
+// `printf_impl`. Anything other than stdout or stderr is sent as a raw stream.
+LIBC_INLINE uint64_t printf_common(::FILE *__restrict stream,
+                                   const char *__restrict format,
+                                   internal::ArgList &args) {
+  if (stream == stdout)
+    return printf_impl<RPC_PRINTF_TO_STDOUT>(stdout, format, args);
+  else if (stream == stderr)
+    return printf_impl<RPC_PRINTF_TO_STDERR>(stderr, format, args);
+  else
+    return printf_impl<RPC_PRINTF_TO_STREAM>(stream, format, args);
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDIO_GPU_PRINTF_IMPL_H
diff --git a/libc/test/integration/startup/gpu/CMakeLists.txt b/libc/test/integration/startup/gpu/CMakeLists.txt
--- a/libc/test/integration/startup/gpu/CMakeLists.txt
+++ b/libc/test/integration/startup/gpu/CMakeLists.txt
@@ -1,6 +1,9 @@
 add_custom_target(libc-startup-tests)
 add_dependencies(libc-integration-tests libc-startup-tests)
 
+# Create an output directory for any temporary test files.
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/testdata)
+
 add_integration_test(
   startup_args_test
   SUITE libc-startup-tests
@@ -53,3 +56,22 @@
    --threads 32
    --blocks 8
 )
+
+add_integration_test(
+  startup_rpc_printf_test
+  SUITE libc-startup-tests
+  SRCS
+   rpc_printf_test.cpp
+  DEPENDS
+    libc.include.stdio
+    libc.src.__support.RPC.rpc_client
+    libc.src.__support.GPU.utils
+    libc.src.stdio.stdout
+    libc.src.stdio.stderr
+    libc.src.stdio.fprintf
+    libc.src.stdio.fopen
+    libc.src.stdio.fclose
+  LOADER_ARGS
+   --threads 32
+   --blocks 4
+)
diff --git a/libc/test/integration/startup/gpu/rpc_printf_test.cpp b/libc/test/integration/startup/gpu/rpc_printf_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/integration/startup/gpu/rpc_printf_test.cpp
@@ -0,0 +1,83 @@
+//===-- RPC test to check args to printf ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/GPU/utils.h"
+#include "src/stdio/fopen.h"
+#include "src/stdio/fprintf.h"
+#include "test/IntegrationTest/test.h"
+#include <stdint.h>
+
+using namespace __llvm_libc;
+
+FILE *file = nullptr;
+
+// NVPTX requires that the constructor contains the full signature.
+[[gnu::constructor]] void init(int, char **, char **) {
+  file = __llvm_libc::fopen("testdata/test_data.txt", "w");
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  ASSERT_TRUE(file && "failed to open file");
+
+  int written = 0;
+  written = __llvm_libc::fprintf(file, "A simple string\n");
+  ASSERT_EQ(written, 16);
+
+  written = __llvm_libc::fprintf(file, "%s", "A simple string\n");
+  ASSERT_EQ(written, 16);
+
+  // Check printing a different value with each thread.
+  written = __llvm_libc::fprintf(file, "%8ld\n", gpu::get_thread_id());
+  ASSERT_EQ(written, 9);
+
+  // Check a literal '%' with a bunch of stuff in between.
+  written = __llvm_libc::fprintf(file, "%00000%%c", 'c');
+  ASSERT_EQ(written, 2);
+
+  // Check floating point precision and printing.
+  written = __llvm_libc::fprintf(file, "%d%c%.1f\n", 1, 'c', 1.0);
+  ASSERT_EQ(written, 6);
+
+  // Check various length modifiers.
+  written = __llvm_libc::fprintf(file, "%hhd%hd%d%ld%lld%jd%zd%td%.f", '\x1', 1,
+                                 1, 1l, 1ll, 1ll, 1ll, 1ll, 1.1);
+  ASSERT_EQ(written, 9);
+
+  // Check that the server properly handles a divergent number of arguments.
+  const char *str = gpu::get_thread_id() % 2 ? "%s" : "%20ld\n";
+  written = __llvm_libc::fprintf(file, str, "string\n");
+  ASSERT_EQ(written, gpu::get_thread_id() % 2 ? 7 : 21);
+
+  const char *arg = gpu::get_thread_id() % 2 ? "string\n" : "%s";
+  written = __llvm_libc::fprintf(file, arg, "string\n");
+  ASSERT_EQ(written, 7);
+
+  // Check that we correctly ignore some malformed input.
+  written = __llvm_libc::fprintf(file, "%()d %d %d %d", 1, 2, 3);
+  ASSERT_EQ(written, 10);
+  written = __llvm_libc::fprintf(file, "%1.1.d %d", 1);
+  ASSERT_EQ(written, 8);
+
+  // Check that we handle variable widths correctly. "%*.*f" consumes a width,
+  written = __llvm_libc::fprintf(file, "%*d%*.*f", 5, 100, 10, 5, 1.0);
+  ASSERT_EQ(written, 15); // a precision, and the double being printed.
+
+  // Check the values of some sign modifiers.
+  written = __llvm_libc::fprintf(file, "%-10f%++10f% 10f", 1.0, 1.0, 1.0);
+  ASSERT_EQ(written, 30);
+
+  // Check for extremely abused variable width arguments
+  written = __llvm_libc::fprintf(file, "%**d", 1, 2);
+  ASSERT_EQ(written, 4);
+  written = __llvm_libc::fprintf(file, "%**d%6d", 1, 1);
+  ASSERT_EQ(written, 10);
+  written = __llvm_libc::fprintf(file, "%**.**f", 1, 1, 1.0);
+  ASSERT_EQ(written, 7);
+
+  return 0;
+}
diff --git a/libc/test/src/__support/arg_list_test.cpp b/libc/test/src/__support/arg_list_test.cpp
--- a/libc/test/src/__support/arg_list_test.cpp
+++ b/libc/test/src/__support/arg_list_test.cpp
@@ -114,11 +114,16 @@
   return s.c + s.s + s.i + s.l + s.f + s.d + last;
 }
 
+// FIXME: The NVPTX backend doesn't handle by-val struct types correctly.
+#ifndef LIBC_TARGET_ARCH_IS_NVPTX
+
 TEST(LlvmLibcArgListTest, TestStructTypes) {
   S s{'\x1', 2, 3, 4l, 5.0f, 6.0};
   ASSERT_EQ(check_struct_type(0, s, 1), 22l);
 }
 
+#endif
+
 // Test vector extensions from clang.
 #if LIBC_HAS_ATTRIBUTE(ext_vector_type)
 
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -163,7 +163,7 @@
  set(fprintf_test_copts "-DLIBC_COPT_PRINTF_USE_SYSTEM_FILE")
 endif()
 
-add_libc_unittest(
+add_libc_test(
   fprintf_test
   SUITE
     libc_stdio_unittests
@@ -185,6 +185,8 @@
     printf_test.cpp
   DEPENDS
     libc.src.stdio.printf
+    libc.src.stdio.stdout
+    libc.src.stdio.stderr
 )
 
 add_fp_unittest(
diff --git a/libc/utils/gpu/server/CMakeLists.txt b/libc/utils/gpu/server/CMakeLists.txt
--- a/libc/utils/gpu/server/CMakeLists.txt
+++ b/libc/utils/gpu/server/CMakeLists.txt
@@ -1,4 +1,8 @@
-add_library(llvmlibc_rpc_server STATIC rpc_server.cpp)
+add_library(llvmlibc_rpc_server
+  STATIC
+  ${LIBC_SOURCE_DIR}/src/stdio/printf_core/writer.cpp
+  ${LIBC_SOURCE_DIR}/src/stdio/printf_core/converter.cpp
+  rpc_server.cpp)
 
 # Include the RPC implemenation from libc.
 target_include_directories(llvmlibc_rpc_server PRIVATE ${LIBC_SOURCE_DIR})
@@ -11,6 +15,11 @@
 target_compile_definitions(llvmlibc_rpc_server PUBLIC
                            LIBC_NAMESPACE=${LIBC_NAMESPACE})
 
+target_compile_definitions(llvmlibc_rpc_server
+                           PRIVATE
+                           LIBC_COPT_PRINTF_DISABLE_WRITE_INT
+                           LIBC_COPT_PRINTF_DISABLE_INDEX_MODE)
+
 # Install the server and associated header.
 install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/rpc_server.h
         DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm/
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -8,6 +8,12 @@
 
 #include "rpc_server.h"
 
+#include "src/__support/arg_list.h"
+#include "src/stdio/gpu/parser.h"
+#include "src/stdio/printf_core/converter.h"
+#include "src/stdio/printf_core/parser.h"
+#include "src/stdio/printf_core/writer.h"
+
 #include "src/__support/RPC/rpc.h"
 #include "src/stdio/gpu/file.h"
 #include <atomic>
@@ -27,6 +33,106 @@
 static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT,
               "Incorrect maximum port count");
 
+// Handles a printf request: gathers the format string and arguments each lane
+// sent, formats them here on the server, and replies with a per-lane result.
+template <uint32_t lane_size>
+static void handle_printf(typename rpc::Server<lane_size>::Port &port) {
+  uint64_t total_sizes[lane_size] = {0};
+  uint64_t sizes[lane_size] = {0};
+  void *format[lane_size] = {nullptr};
+  FILE *files[lane_size] = {nullptr};
+
+  // Get the appropriate output stream to use.
+  if (port.get_opcode() == RPC_PRINTF_TO_STREAM)
+    port.recv([&](rpc::Buffer *buffer, uint32_t id) {
+      files[id] = reinterpret_cast<FILE *>(buffer->data[0]);
+    });
+  else if (port.get_opcode() == RPC_PRINTF_TO_STDOUT)
+    std::fill(files, files + lane_size, stdout);
+  else
+    std::fill(files, files + lane_size, stderr);
+
+  // Receive the format string from the client.
+  port.recv_n(format, sizes, [&](uint64_t size) { return new char[size]; });
+
+  for (uint32_t lane = 0; lane < lane_size; ++lane)
+    total_sizes[lane] += rpc::align_up(sizes[lane], sizeof(uintptr_t));
+
+  // Parse the formatting string using the same parser the client uses. This
+  // tells us exactly how many packets we need to be receiving from the client.
+  uint32_t num_specifiers = 0;
+  for (uint32_t lane = 0; lane < lane_size; ++lane) {
+    if (!format[lane])
+      continue;
+
+    internal::MockArgList args;
+    uint32_t count = 0;
+    gpu::MicroParser<internal::MockArgList> parser(
+        reinterpret_cast<const char *>(format[lane]), args);
+    for (gpu::Specifier cur = parser.get_next_specifier(); !parser.end(cur);
+         cur = parser.get_next_specifier())
+      count++;
+    num_specifiers = std::max(num_specifiers, count);
+  }
+
+  // Receive all the arguments from the client and allocate storage for them.
+  std::vector<void *> all_args[lane_size];
+  for (uint32_t i = 0; i < num_specifiers; ++i) {
+    void *args[lane_size] = {nullptr};
+    port.recv_n(args, sizes, [&](uint64_t size) { return new char[size]; });
+
+    for (uint32_t lane = 0; lane < lane_size; ++lane) {
+      if (sizes[lane] > 0)
+        all_args[lane].push_back(args[lane]);
+      total_sizes[lane] += rpc::align_up(sizes[lane], sizeof(uintptr_t));
+    }
+  }
+
+  int results[lane_size] = {0};
+  for (uint32_t lane = 0; lane < lane_size; ++lane) {
+    if (!format[lane])
+      continue;
+
+    // We assume that twice the input length will be enough to fit the string.
+    uint64_t buffer_size = std::max<uint64_t>(256, 2 * total_sizes[lane]);
+    std::unique_ptr<char[]> buffer(new char[buffer_size]);
+    printf_core::WriteBuffer wb(buffer.get(), buffer_size);
+    printf_core::Writer writer(&wb);
+
+    internal::ArrayArgList args(all_args[lane].data());
+    printf_core::Parser<internal::ArrayArgList> parser(
+        reinterpret_cast<const char *>(format[lane]), args);
+
+    // Format the string using the arguments we copied from the client.
+    for (printf_core::FormatSection cur_section = parser.get_next_section();
+         !cur_section.raw_string.empty();
+         cur_section = parser.get_next_section()) {
+      if (cur_section.has_conv)
+        convert(&writer, cur_section);
+      else
+        writer.write(cur_section.raw_string);
+    }
+    // Clamp in case the writer reports more characters than fit in the buffer.
+    uint64_t written = writer.get_chars_written();
+    results[lane] = static_cast<int>(fwrite(
+        buffer.get(), 1, std::min(written, buffer_size), files[lane]));
+    if (static_cast<uint64_t>(results[lane]) != written)
+      results[lane] = -1;
+  }
+
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t id) { buffer->data[0] = results[id]; });
+
+  for (uint32_t lane = 0; lane < lane_size; ++lane) {
+    if (!format[lane])
+      continue;
+
+    delete[] reinterpret_cast<char *>(format[lane]);
+    for (void *ptr : all_args[lane])
+      delete[] reinterpret_cast<char *>(ptr);
+  }
+}
+
 // The client needs to support different lane sizes for the SIMT model. Because
 // of this we need to select between the possible sizes that the client can use.
 struct Server {
@@ -160,10 +266,14 @@
       });
       break;
     }
-    case RPC_NOOP: {
+    case RPC_PRINTF_TO_STDOUT:
+    case RPC_PRINTF_TO_STDERR:
+    case RPC_PRINTF_TO_STREAM:
+      handle_printf<lane_size>(*port);
+      break;
+    case RPC_NOOP:
       port->recv([](rpc::Buffer *) {});
       break;
-    }
     default: {
       auto handler =
           callbacks.find(static_cast<rpc_opcode_t>(port->get_opcode()));