diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -35,9 +35,11 @@
   NOOP = 0,
   PRINT_TO_STDERR = 1,
   EXIT = 2,
-  TEST_INCREMENT = 3,
-  TEST_INTERFACE = 4,
-  TEST_STREAM = 5,
+  MALLOC = 3,
+  FREE = 4,
+  TEST_INCREMENT = 5,
+  TEST_INTERFACE = 6,
+  TEST_STREAM = 7,
 };
 
 /// A fixed size channel used to communicate between the RPC client and server.
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -287,10 +287,23 @@
     DEPENDS
       ${SCUDO_DEPS}
   )
+elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_entrypoint_external(
+    calloc
+  )
+  add_entrypoint_external(
+    realloc
+  )
+  add_entrypoint_external(
+    aligned_alloc
+  )
 else()
   add_entrypoint_external(
     malloc
   )
+  add_entrypoint_external(
+    free
+  )
   add_entrypoint_external(
     calloc
   )
@@ -300,9 +313,6 @@
   add_entrypoint_external(
     aligned_alloc
   )
-  add_entrypoint_external(
-    free
-  )
 endif()
 
 if(NOT LLVM_LIBC_FULL_BUILD)
@@ -356,3 +366,19 @@
   DEPENDS
     .${LIBC_TARGET_OS}.abort
 )
+
+if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_entrypoint_object(
+    malloc
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.malloc
+  )
+
+  add_entrypoint_object(
+    free
+    ALIAS
+    DEPENDS
+      .${LIBC_TARGET_OS}.free
+  )
+endif()
diff --git a/libc/src/stdlib/free.h b/libc/src/stdlib/free.h
new file mode 100644
--- /dev/null
+++ b/libc/src/stdlib/free.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for free --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdlib.h>
+
+#ifndef LLVM_LIBC_SRC_STDLIB_FREE_H
+#define LLVM_LIBC_SRC_STDLIB_FREE_H
+
+namespace __llvm_libc {
+
+void free(void *ptr);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDLIB_LDIV_H
diff --git a/libc/src/stdlib/gpu/CMakeLists.txt b/libc/src/stdlib/gpu/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/src/stdlib/gpu/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_entrypoint_object(
+  malloc
+  SRCS
+    malloc.cpp
+  HDRS
+    ../malloc.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.RPC.rpc_client
+)
+
+add_entrypoint_object(
+  free
+  SRCS
+    free.cpp
+  HDRS
+    ../free.h
+  DEPENDS
+    libc.include.stdlib
+    libc.src.__support.RPC.rpc_client
+)
diff --git a/libc/src/stdlib/gpu/free.cpp b/libc/src/stdlib/gpu/free.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/stdlib/gpu/free.cpp
@@ -0,0 +1,23 @@
+//===-- GPU Implementation of free ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/free.h"
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(void, free, (void *ptr)) {
+  rpc::Client::Port port = rpc::client.open<rpc::FREE>();
+  port.send([=](rpc::Buffer *buffer) {
+    buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
+  });
+  port.close();
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/stdlib/gpu/malloc.cpp b/libc/src/stdlib/gpu/malloc.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/stdlib/gpu/malloc.cpp
@@ -0,0 +1,26 @@
+//===-- GPU Implementation of malloc --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/malloc.h"
+#include "src/__support/RPC/rpc_client.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(void *, malloc, (size_t size)) {
+  void *ptr = nullptr;
+  rpc::Client::Port port = rpc::client.open<rpc::MALLOC>();
+  port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = size; },
+                     [&](rpc::Buffer *buffer) {
+                       ptr = reinterpret_cast<void *>(buffer->data[0]);
+                     });
+  port.close();
+  return ptr;
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/stdlib/malloc.h b/libc/src/stdlib/malloc.h
new file mode 100644
--- /dev/null
+++ b/libc/src/stdlib/malloc.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for malloc ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdlib.h>
+
+#ifndef LLVM_LIBC_SRC_STDLIB_MALLOC_H
+#define LLVM_LIBC_SRC_STDLIB_MALLOC_H
+
+namespace __llvm_libc {
+
+void *malloc(size_t size);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STDLIB_LDIV_H
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -320,4 +320,19 @@
       libc.src.signal.raise
   )
 
+  # Only the GPU has an in-tree 'malloc' implementation.
+  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+    add_libc_test(
+      malloc_test
+      HERMETIC_TEST_ONLY
+      SUITE
+        libc-stdlib-tests
+      SRCS
+        malloc_test.cpp
+      DEPENDS
+        libc.include.stdlib
+        libc.src.stdlib.malloc
+        libc.src.stdlib.free
+    )
+  endif()
 endif()
diff --git a/libc/test/src/stdlib/malloc_test.cpp b/libc/test/src/stdlib/malloc_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/src/stdlib/malloc_test.cpp
@@ -0,0 +1,19 @@
+//===-- Unittests for malloc ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcMallocTest, Allocate) {
+  int *ptr = reinterpret_cast<int *>(__llvm_libc::malloc(sizeof(int)));
+  EXPECT_NE(reinterpret_cast<void *>(ptr), static_cast<void *>(nullptr));
+  *ptr = 1;
+  EXPECT_EQ(*ptr, 1);
+  __llvm_libc::free(ptr);
+}
diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -14,10 +14,17 @@
 find_package(CUDAToolkit QUIET)
 # The CUDA loader requires LLVM to traverse the ELF image for symbols.
 find_package(LLVM QUIET)
-if(CUDAToolkit_FOUND AND LLVM_FOUND)
+if(CUDAToolkit_FOUND AND LLVM_FOUND AND
+   ${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11.2")
   add_subdirectory(nvptx)
 else()
-  message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+  if(${CUDAToolkit_VERSION} VERSION_LESS "11.2")
+    message(WARNING 
+      "Skipping CUDA loader for gpu target, CUDA must be version 11.2 or later.
+       Found CUDA Version ${CUDAToolkit_VERSION}")
+  else()
+    message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
+  endif()
 endif()
 
 # Add a custom target to be used for testing.
diff --git a/libc/utils/gpu/loader/Server.h b/libc/utils/gpu/loader/Server.h
--- a/libc/utils/gpu/loader/Server.h
+++ b/libc/utils/gpu/loader/Server.h
@@ -21,7 +21,8 @@
 
 /// Queries the RPC client at least once and performs server-side work if there
 /// are any active requests.
-void handle_server() {
+template <typename Alloc, typename Dealloc>
+void handle_server(Alloc allocator, Dealloc deallocator) {
   using namespace __llvm_libc;
 
   // Continue servicing the client until there is no work left and we return.
@@ -50,6 +51,19 @@
       });
       break;
     }
+    case rpc::Opcode::MALLOC: {
+      port->recv_and_send([&](rpc::Buffer *buffer) {
+        buffer->data[0] =
+            reinterpret_cast<uintptr_t>(allocator(buffer->data[0]));
+      });
+      break;
+    }
+    case rpc::Opcode::FREE: {
+      port->recv([&](rpc::Buffer *buffer) {
+        deallocator(reinterpret_cast<void *>(buffer->data[0]));
+      });
+      break;
+    }
     case rpc::Opcode::TEST_INCREMENT: {
       port->recv_and_send([](rpc::Buffer *buffer) {
         reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -134,6 +134,7 @@
 template <typename args_t>
 hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                            hsa_amd_memory_pool_t kernargs_pool,
+                           hsa_amd_memory_pool_t coarsegrained_pool,
                            hsa_queue_t *queue, const LaunchParameters &params,
                            const char *kernel_name, args_t kernel_args) {
   // Look up the '_start' kernel in the loaded executable.
@@ -142,6 +143,21 @@
           executable, kernel_name, &dev_agent, &symbol))
     return err;
 
+  auto allocator = [&](uint64_t size) -> void * {
+    void *dev_ptr = nullptr;
+    if (hsa_status_t err =
+            hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
+                                         /*flags=*/0, &dev_ptr))
+      handle_error(err);
+    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
+    return dev_ptr;
+  };
+
+  auto deallocator = [](void *ptr) -> void {
+    if (hsa_status_t err = hsa_amd_memory_pool_free(ptr))
+      handle_error(err);
+  };
+
   // Retrieve different properties of the kernel symbol used for launch.
   uint64_t kernel;
   uint32_t args_size;
@@ -219,11 +235,11 @@
   while (hsa_signal_wait_scacquire(
              packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
              /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
-    handle_server();
+    handle_server(allocator, deallocator);
 
   // Handle the server one more time in case the kernel exited with a pending
   // send still in flight.
-  handle_server();
+  handle_server(allocator, deallocator);
 
   // Destroy the resources acquired to launch the kernel and return.
   if (hsa_status_t err = hsa_amd_memory_pool_free(args))
@@ -366,14 +382,15 @@
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
   begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};
-  if (hsa_status_t err =
-          launch_kernel(dev_agent, executable, kernargs_pool, queue,
-                        single_threaded_params, "_begin.kd", init_args))
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          single_threaded_params, "_begin.kd", init_args))
     handle_error(err);
 
   start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
-  if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
-                                       queue, params, "_start.kd", args))
+  if (hsa_status_t err =
+          launch_kernel(dev_agent, executable, kernargs_pool,
+                        coarsegrained_pool, queue, params, "_start.kd", args))
     handle_error(err);
 
   // Create a memory signal and copy the return value back from the device into
@@ -402,9 +419,9 @@
   int ret = *static_cast<int *>(host_ret);
 
   end_args_t fini_args = {ret};
-  if (hsa_status_t err =
-          launch_kernel(dev_agent, executable, kernargs_pool, queue,
-                        single_threaded_params, "_end.kd", fini_args))
+  if (hsa_status_t err = launch_kernel(
+          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
+          single_threaded_params, "_end.kd", fini_args))
     handle_error(err);
 
   // Free the memory allocated for the device.
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -174,6 +174,29 @@
                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                          CU_LAUNCH_PARAM_END};
 
+  // Initialize a non-blocking CUDA stream to allocate memory if needed. This
+  // needs to be done on a separate stream or else it will deadlock with the
+  // executing kernel.
+  CUstream memory_stream;
+  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
+    handle_error(err);
+
+  auto allocator = [&](uint64_t size) -> void * {
+    CUdeviceptr dev_ptr;
+    if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
+      handle_error(err);
+
+    // Wait until the memory allocation is complete.
+    while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
+      ;
+    return reinterpret_cast<void *>(dev_ptr);
+  };
+  auto deallocator = [&](void *ptr) -> void {
+    if (CUresult err =
+            cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), memory_stream))
+      handle_error(err);
+  };
+
   // Call the kernel with the given arguments.
   if (CUresult err = cuLaunchKernel(
           function, params.num_blocks_x, params.num_blocks_y,
@@ -184,11 +207,11 @@
   // Wait until the kernel has completed execution on the device. Periodically
   // check the RPC client for work to be performed on the server.
   while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
-    handle_server();
+    handle_server(allocator, deallocator);
 
   // Handle the server one more time in case the kernel exited with a pending
   // send still in flight.
-  handle_server();
+  handle_server(allocator, deallocator);
 
   return CUDA_SUCCESS;
 }