diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -8,6 +8,13 @@
   message(STATUS "Skipping HSA loader for gpu target, no HSA was detected")
 endif()
 
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+  add_subdirectory(nvptx)
+else()
+  message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+endif()
+
 # Add a custom target to be used for testing.
 if(TARGET amdhsa_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
   add_custom_target(libc.utils.gpu.loader)
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -6,9 +6,46 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
+#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
+
+#include <cstring>
 #include <cstddef>
 
 /// Generic interface to load the \p image and launch execution of the _start
 /// kernel on the target device. Copies \p argc and \p argv to the device.
 /// Returns the final value of the `main` function on the device.
 int load(int argc, char **argv, char **envp, void *image, size_t size);
+
+/// Copy the system's argument vector to GPU memory allocated using \p alloc.
+template <typename Allocator>
+void *copy_argument_vector(int argc, char **argv, Allocator alloc) {
+  void *dev_argv = alloc(argc * sizeof(char *));
+  if (dev_argv == nullptr)
+    return nullptr;
+
+  for (int i = 0; i < argc; ++i) {
+    size_t size = strlen(argv[i]) + 1;
+    void *dev_str = alloc(size);
+    if (dev_str == nullptr)
+      return nullptr;
+
+    // Load the host memory buffer with the pointer values of the newly
+    // allocated strings.
+    std::memcpy(dev_str, argv[i], size);
+    static_cast<void **>(dev_argv)[i] = dev_str;
+  }
+  return dev_argv;
+}
+
+/// Copy the system's environment to GPU memory allocated using \p alloc.
+template <typename Allocator>
+void *copy_environment(char **envp, Allocator alloc) {
+  int envc = 0;
+  for (char **env = envp; *env != 0; ++env)
+    ++envc;
+
+  return copy_argument_vector(envc, envp, alloc);
+}
+
+#endif
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -75,6 +75,11 @@
   exit(EXIT_FAILURE);
 }
 
+static void handle_error(const char *msg) {
+  fprintf(stderr, "%s\n", msg);
+  exit(EXIT_FAILURE);
+}
+
 /// Generic interface for iterating using the HSA callbacks.
 template <typename elem_ty, typename func_ty, typename callback_ty>
 hsa_status_t iterate(func_ty func, callback_ty cb) {
@@ -279,50 +284,23 @@
 
   // Allocate fine-grained memory on the host to hold the pointer array for the
   // copied argv and allow the GPU agent to access it.
-  void *dev_argv;
-  if (hsa_status_t err =
-          hsa_amd_memory_pool_allocate(finegrained_pool, argc * sizeof(char *),
-                                       /*flags=*/0, &dev_argv))
-    handle_error(err);
-  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_argv);
-
-  // Copy each string in the argument vector to global memory on the device.
-  for (int i = 0; i < argc; ++i) {
-    size_t size = strlen(argv[i]) + 1;
-    void *dev_str;
+  auto allocator = [&](uint64_t size) -> void * {
+    void *dev_ptr = nullptr;
     if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
-                                                        /*flags=*/0, &dev_str))
+                                                        /*flags=*/0, &dev_ptr))
       handle_error(err);
-    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_str);
-    // Load the host memory buffer with the pointer values of the newly
-    // allocated strings.
-    std::memcpy(dev_str, argv[i], size);
-    static_cast<void **>(dev_argv)[i] = dev_str;
-  }
+    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
+    return dev_ptr;
+  };
+  void *dev_argv = copy_argument_vector(argc, argv, allocator);
+  if (!dev_argv)
+    handle_error("Failed to allocate device argv");
 
   // Allocate fine-grained memory on the host to hold the pointer array for the
   // copied environment array and allow the GPU agent to access it.
-  int envc = 0;
-  for (char **env = envp; *env != 0; ++env)
-    ++envc;
-  void *dev_envp;
-  if (hsa_status_t err =
-          hsa_amd_memory_pool_allocate(finegrained_pool, envc * sizeof(char *),
-                                       /*flags=*/0, &dev_envp))
-    handle_error(err);
-  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_envp);
-  for (int i = 0; i < envc; ++i) {
-    size_t size = strlen(envp[i]) + 1;
-    void *dev_str;
-    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
-                                                        /*flags=*/0, &dev_str))
-      handle_error(err);
-    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_str);
-    // Load the host memory buffer with the pointer values of the newly
-    // allocated strings.
-    std::memcpy(dev_str, envp[i], size);
-    static_cast<void **>(dev_envp)[i] = dev_str;
-  }
+  void *dev_envp = copy_environment(envp, allocator);
+  if (!dev_envp)
+    handle_error("Failed to allocate device environment");
 
   // Allocate space for the return pointer and initialize it to zero.
   void *dev_ret;
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(nvptx_loader Loader.cpp)
+add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
+
+target_include_directories(nvptx_loader PRIVATE ${LIBC_SOURCE_DIR})
+target_link_libraries(nvptx_loader
+  PRIVATE
+  gpu_loader
+  CUDA::cuda_driver
+)
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -0,0 +1,147 @@
+//===-- Loader Implementation for NVPTX devices --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple loader to run images supporting the NVPTX
+// architecture. The file launches the '_start' kernel, which should be
+// provided by the device application start code and will ultimately call the
+// 'main' function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Loader.h"
+
+#include "cuda.h"
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+/// The arguments to the '_start' kernel.
+struct kernel_args_t {
+  int argc;
+  void *argv;
+  void *envp;
+  void *ret;
+  void *inbox;
+  void *outbox;
+  void *buffer;
+};
+
+static void handle_error(CUresult err) {
+  if (err == CUDA_SUCCESS)
+    return;
+
+  const char *err_str = nullptr;
+  CUresult result = cuGetErrorString(err, &err_str);
+  if (result != CUDA_SUCCESS)
+    fprintf(stderr, "Unknown Error\n");
+  else
+    fprintf(stderr, "%s\n", err_str);
+  exit(1);
+}
+
+static void handle_error(const char *msg) {
+  fprintf(stderr, "%s\n", msg);
+  exit(EXIT_FAILURE);
+}
+
+int load(int argc, char **argv, char **envp, void *image, size_t size) {
+  if (CUresult err = cuInit(0))
+    handle_error(err);
+
+  // Obtain the first device found on the system.
+  CUdevice device;
+  if (CUresult err = cuDeviceGet(&device, 0))
+    handle_error(err);
+
+  // Initialize the CUDA context and claim it for this execution.
+  CUcontext context;
+  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
+    handle_error(err);
+  if (CUresult err = cuCtxSetCurrent(context))
+    handle_error(err);
+
+  // Initialize a non-blocking CUDA stream to execute the kernel.
+  CUstream stream;
+  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
+    handle_error(err);
+
+  // Load the image into a CUDA module.
+  CUmodule binary;
+  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
+    handle_error(err);
+
+  // Look up the '_start' kernel in the loaded module.
+  CUfunction function;
+  if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
+    handle_error(err);
+
+  // Allocate pinned memory on the host to hold the pointer array for the
+  // copied argv and allow the GPU device to access it.
+  auto allocator = [&](uint64_t size) -> void * {
+    void *dev_ptr;
+    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
+      handle_error(err);
+    return dev_ptr;
+  };
+  void *dev_argv = copy_argument_vector(argc, argv, allocator);
+  if (!dev_argv)
+    handle_error("Failed to allocate device argv");
+
+  // Allocate pinned memory on the host to hold the pointer array for the
+  // copied environment array and allow the GPU device to access it.
+  void *dev_envp = copy_environment(envp, allocator);
+  if (!dev_envp)
+    handle_error("Failed to allocate device environment");
+
+  // Allocate space for the return pointer and initialize it to zero.
+  CUdeviceptr dev_ret;
+  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
+    handle_error(err);
+  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
+    handle_error(err);
+
+  // Set up the arguments to the '_start' kernel on the GPU.
+  // TODO: Set up the RPC server implementation.
+  uint64_t args_size = sizeof(kernel_args_t);
+  kernel_args_t args;
+  std::memset(&args, 0, args_size);
+  args.argc = argc;
+  args.argv = dev_argv;
+  args.envp = dev_envp;
+  args.ret = reinterpret_cast<void *>(dev_ret);
+  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
+                         CU_LAUNCH_PARAM_END};
+
+  // Call the kernel with the given arguments.
+  if (CUresult err =
+          cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
+                         /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1,
+                         /*blockDimZ=*/1, 0, stream, nullptr, args_config))
+    handle_error(err);
+
+  // TODO: Query the RPC server periodically while the kernel is running.
+  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
+    ;
+
+  // Copy the return value back from the kernel and wait.
+  int host_ret = 0;
+  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
+    handle_error(err);
+
+  if (CUresult err = cuStreamSynchronize(stream))
+    handle_error(err);
+
+  // Destroy the context and the loaded binary.
+  if (CUresult err = cuModuleUnload(binary))
+    handle_error(err);
+  if (CUresult err = cuDevicePrimaryCtxRelease(device))
+    handle_error(err);
+  return host_ret;
+}
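
A minimal host-only sketch (not part of the patch) of how the shared Loader.h helpers are driven: any callable that returns writable memory for a requested size can serve as the allocator, which is what lets the AMD and NVPTX loaders reuse the same argv/envp copying logic. Plain malloc stands in here for hsa_amd_memory_pool_allocate or cuMemAllocHost, purely for illustration.

#include "Loader.h"

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main(int argc, char **argv, char **envp) {
  // The real loaders pass lambdas wrapping their runtime's pinned-memory
  // allocation; malloc is only a stand-in to exercise the templates.
  auto allocator = [](uint64_t size) -> void * { return std::malloc(size); };

  void *copied_argv = copy_argument_vector(argc, argv, allocator);
  void *copied_envp = copy_environment(envp, allocator);
  if (!copied_argv || !copied_envp)
    return EXIT_FAILURE;

  // The copied table holds pointers to freshly allocated copies of each
  // string, so the first entry should print the same text as argv[0].
  std::printf("%s\n", static_cast<char **>(copied_argv)[0]);
  return EXIT_SUCCESS;
}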