diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -8,6 +8,13 @@
   message(STATUS "Skipping HSA loader for gpu target, no HSA was detected")
 endif()
 
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+  add_subdirectory(nvptx)
+else()
+  message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+endif()
+
 # Add a custom target to be used for testing.
 if(TARGET amdhsa_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
   add_custom_target(libc.utils.gpu.loader)
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(nvptx_loader Loader.cpp)
+add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
+
+target_include_directories(nvptx_loader PRIVATE ${LIBC_SOURCE_DIR})
+target_link_libraries(nvptx_loader
+  PRIVATE
+  gpu_loader
+  CUDA::cuda_driver
+)
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -0,0 +1,159 @@
+//===-- Loader Implementation for NVPTX devices --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple loader to run images supporting the NVPTX
+// architecture. The file launches the '_start' kernel which should be provided
+// by the device application start code and will ultimately call the 'main'
+// function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "cuda.h"
+// NOTE(review): the three include targets were garbled in the patch; these
+// are the standard headers the code below requires (fprintf, exit, strlen,
+// memcpy/memset, uint64_t) — confirm against the original submission.
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+/// The arguments to the '_start' kernel. This layout must match what the
+/// device-side start code expects to unpack from the launch parameter buffer.
+struct kernel_args_t {
+  int argc;
+  void *argv;
+  void *envp;
+  void *ret;
+  void *inbox;
+  void *outbox;
+  void *buffer;
+};
+
+/// Print the CUDA driver's error string for \p err to stderr and exit the
+/// process; returns silently when \p err is CUDA_SUCCESS.
+static void handle_error(CUresult err) {
+  if (err == CUDA_SUCCESS)
+    return;
+
+  const char *err_str = nullptr;
+  CUresult result = cuGetErrorString(err, &err_str);
+  if (result != CUDA_SUCCESS)
+    fprintf(stderr, "Unknown Error\n");
+  else
+    fprintf(stderr, "%s\n", err_str);
+  exit(1);
+}
+
+/// Load \p image (a linked NVPTX executable of \p size bytes) onto the first
+/// CUDA device, copy \p argv / \p envp into device-accessible pinned memory,
+/// launch its '_start' kernel, and return the program's exit code.
+int load(int argc, char **argv, char **envp, void *image, size_t size) {
+  if (CUresult err = cuInit(0))
+    handle_error(err);
+
+  // Obtain the first device found on the system.
+  CUdevice device;
+  if (CUresult err = cuDeviceGet(&device, 0))
+    handle_error(err);
+
+  // Initialize the CUDA context and claim it for this execution.
+  CUcontext context;
+  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
+    handle_error(err);
+  if (CUresult err = cuCtxSetCurrent(context))
+    handle_error(err);
+
+  // Initialize a non-blocking CUDA stream to execute the kernel.
+  CUstream stream;
+  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
+    handle_error(err);
+
+  // Load the image into a CUDA module.
+  CUmodule binary;
+  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
+    handle_error(err);
+
+  // Look up the '_start' kernel in the loaded module.
+  CUfunction function;
+  if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
+    handle_error(err);
+
+  // Allocate pinned memory on the host to hold the pointer array for the
+  // copied argv and allow the GPU device to access it.
+  void *dev_argv;
+  if (CUresult err = cuMemAllocHost(&dev_argv, sizeof(char *) * argc))
+    handle_error(err);
+
+  // Copy each string in the argument vector to pinned memory on the host
+  // that the device can access.
+  for (int i = 0; i < argc; ++i) {
+    // 'str_size' avoids shadowing the function's 'size' parameter.
+    size_t str_size = strlen(argv[i]) + 1;
+    void *dev_str;
+    if (CUresult err = cuMemAllocHost(&dev_str, str_size))
+      handle_error(err);
+    // Load the host memory buffer with the pointer values of the newly
+    // allocated strings.
+    std::memcpy(dev_str, argv[i], str_size);
+    static_cast<void **>(dev_argv)[i] = dev_str;
+  }
+
+  // Allocate pinned memory on the host to hold the pointer array for the
+  // copied environment array and allow the GPU device to access it.
+  int envc = 0;
+  for (char **env = envp; *env != 0; ++env)
+    ++envc;
+  void *dev_envp;
+  if (CUresult err = cuMemAllocHost(&dev_envp, sizeof(char *) * envc))
+    handle_error(err);
+
+  for (int i = 0; i < envc; ++i) {
+    size_t str_size = strlen(envp[i]) + 1;
+    void *dev_str;
+    if (CUresult err = cuMemAllocHost(&dev_str, str_size))
+      handle_error(err);
+    // Load the host memory buffer with the pointer values of the newly
+    // allocated strings.
+    std::memcpy(dev_str, envp[i], str_size);
+    static_cast<void **>(dev_envp)[i] = dev_str;
+  }
+
+  // Allocate space for the return pointer and initialize it to zero.
+  CUdeviceptr dev_ret;
+  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
+    handle_error(err);
+  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
+    handle_error(err);
+
+  // Set up the arguments to the '_start' kernel on the GPU.
+  // TODO: Setup RPC server implementation;
+  uint64_t args_size = sizeof(kernel_args_t);
+  kernel_args_t args;
+  std::memset(&args, 0, args_size);
+  args.argc = argc;
+  args.argv = dev_argv;
+  // Pass the copied environment, not argv a second time.
+  args.envp = dev_envp;
+  args.ret = reinterpret_cast<void *>(dev_ret);
+  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
+                         CU_LAUNCH_PARAM_END};
+
+  // Call the kernel with the given arguments.
+  if (CUresult err =
+          cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
+                         /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1,
+                         /*blockDimZ=*/1, 0, stream, nullptr, args_config))
+    handle_error(err);
+
+  // TODO: Query the RPC server periodically while the kernel is running.
+  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
+    ;
+
+  // Copy the return value back from the kernel and wait.
+  int host_ret = 0;
+  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
+    handle_error(err);
+
+  if (CUresult err = cuStreamSynchronize(stream))
+    handle_error(err);
+
+  // Destroy the context and the loaded binary.
+  if (CUresult err = cuModuleUnload(binary))
+    handle_error(err);
+  if (CUresult err = cuDevicePrimaryCtxRelease(device))
+    handle_error(err);
+  return host_ret;
+}