diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -158,7 +158,7 @@
 
 /// Returns a fixed-frequency timestamp. The actual frequency is dependent on
 /// the card and can only be queried via the driver.
-LIBC_INLINE uint64_t fixed_frequrency_clock() {
+LIBC_INLINE uint64_t fixed_frequency_clock() {
   if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl))
     return __builtin_amdgcn_s_sendmsg_rtnl(0x83);
   else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memrealtime))
diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
--- a/libc/src/__support/GPU/generic/utils.h
+++ b/libc/src/__support/GPU/generic/utils.h
@@ -69,7 +69,7 @@
 
 LIBC_INLINE uint64_t processor_clock() { return 0; }
 
-LIBC_INLINE uint64_t fixed_frequrency_clock() { return 0; }
+LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }
 
 } // namespace gpu
 } // namespace __llvm_libc
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -142,7 +142,7 @@
 }
 
 /// Returns a global fixed-frequency timer at nanosecond frequency.
-LIBC_INLINE uint64_t fixed_frequrency_clock() {
+LIBC_INLINE uint64_t fixed_frequency_clock() {
   uint64_t nsecs;
   LIBC_INLINE_ASM("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
   return nsecs;
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -15,6 +15,12 @@
 
 namespace __llvm_libc {
 
+// The AMDGPU architecture provides a fixed frequency clock used for obtaining
+// real time. However, the frequency of this clock varies between cards and can
+// only be obtained via the driver. The loader will set this so we can use it.
+extern "C" [[gnu::visibility("protected")]] uint64_t
+    [[clang::address_space(4)]] __llvm_libc_clock_freq = 0;
+
 extern "C" uintptr_t __init_array_start[];
 extern "C" uintptr_t __init_array_end[];
 extern "C" uintptr_t __fini_array_start[];
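Note on how the pieces above fit together: the device reports raw ticks via fixed_frequency_clock(), and __llvm_libc_clock_freq holds the tick rate that the loader writes in before launch. A minimal sketch of the composition, assuming only those two names from the diffs above (the helper ticks_to_usec and its scaling are illustrative, not the library's actual clock() definition):

  // Hypothetical device-side helper: convert raw ticks to microseconds.
  // Assumes the loader has already populated __llvm_libc_clock_freq.
  extern "C" uint64_t __llvm_libc_clock_freq;

  static inline uint64_t ticks_to_usec(uint64_t ticks) {
    return (ticks * 1000000) / __llvm_libc_clock_freq;
  }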
diff --git a/libc/test/UnitTest/LibcTest.cpp b/libc/test/UnitTest/LibcTest.cpp
--- a/libc/test/UnitTest/LibcTest.cpp
+++ b/libc/test/UnitTest/LibcTest.cpp
@@ -15,6 +15,16 @@
 
 #if __STDC_HOSTED__
 #include <time.h>
+#elif defined(LIBC_TARGET_ARCH_IS_GPU)
+#include "src/__support/GPU/utils.h"
+static long clock() { return __llvm_libc::gpu::fixed_frequency_clock(); }
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
+uint64_t CLOCKS_PER_SEC = 1000000000UL;
+#else
+// The AMDGPU loader needs to initialize this at runtime by querying the driver.
+extern "C" [[gnu::visibility("protected")]] uint64_t __llvm_libc_clock_freq;
+uint64_t CLOCKS_PER_SEC = __llvm_libc_clock_freq;
+#endif
 #else
 static long clock() { return 0; }
 #define CLOCKS_PER_SEC 1
@@ -136,14 +146,22 @@
     break;
   case RunContext::RunResult::Pass:
     tlog << GREEN << "[       OK ] " << RESET << TestName;
-#if __STDC_HOSTED__
+#if __STDC_HOSTED__ || defined(LIBC_TARGET_ARCH_IS_GPU)
     tlog << " (took ";
     if (start_time > end_time) {
       tlog << "unknown - try rerunning)\n";
     } else {
       const auto duration = end_time - start_time;
-      const uint64_t duration_ms = duration * 1000 / CLOCKS_PER_SEC;
-      tlog << duration_ms << " ms)\n";
+      const uint64_t duration_ms = (duration * 1000) / CLOCKS_PER_SEC;
+      const uint64_t duration_us = (duration * 1000 * 1000) / CLOCKS_PER_SEC;
+      const uint64_t duration_ns =
+          (duration * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
+      if (duration_ms != 0)
+        tlog << duration_ms << " ms)\n";
+      else if (duration_us != 0)
+        tlog << duration_us << " us)\n";
+      else
+        tlog << duration_ns << " ns)\n";
     }
 #else
     tlog << '\n';
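The widened guard means GPU targets now report elapsed time as well, and the cascading check prints the largest unit with a nonzero count. For example, at the NVPTX rate of CLOCKS_PER_SEC = 1000000000, a duration of 4200 ticks gives duration_ms == 0 and duration_us == 4, so the log reads "4 us". A standalone sketch of the same selection logic (format_duration is a hypothetical name, not part of the patch):

  #include <cstdint>
  #include <string>

  // Mirrors the unit selection above: fall from ms to us to ns until a
  // nonzero count is found. Long durations take the ms branch before the
  // nanosecond product could overflow uint64_t at realistic clock rates.
  std::string format_duration(uint64_t duration, uint64_t clocks_per_sec) {
    const uint64_t ms = (duration * 1000) / clocks_per_sec;
    if (ms != 0)
      return std::to_string(ms) + " ms";
    const uint64_t us = (duration * 1000 * 1000) / clocks_per_sec;
    if (us != 0)
      return std::to_string(us) + " us";
    return std::to_string((duration * 1000 * 1000 * 1000) / clocks_per_sec) +
           " ns";
  }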
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -264,6 +264,30 @@
   return HSA_STATUS_SUCCESS;
 }
 
+/// Copies data from the source agent to the destination agent. The source
+/// memory must first be pinned explicitly or allocated via HSA.
+static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
+                               const void *src, hsa_agent_t src_agent,
+                               uint64_t size) {
+  // Create a memory signal to copy information between the host and device.
+  hsa_signal_t memory_signal;
+  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
+    return err;
+
+  if (hsa_status_t err = hsa_amd_memory_async_copy(
+          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
+    return err;
+
+  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
+                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
+    ;
+
+  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
+    return err;
+
+  return HSA_STATUS_SUCCESS;
+}
+
 int load(int argc, char **argv, char **envp, void *image, size_t size,
          const LaunchParameters &params) {
   // Initialize the HSA runtime used to communicate with the device.
@@ -388,6 +412,34 @@
                                              wavefront_size, rpc_alloc, &tuple))
     handle_error(err);
 
+  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
+  void *host_clock_freq;
+  if (hsa_status_t err =
+          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
+                                       /*flags=*/0, &host_clock_freq))
+    handle_error(err);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);
+
+  if (hsa_status_t err = hsa_agent_get_info(
+          dev_agent,
+          static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
+          host_clock_freq))
+    handle_error(err);
+
+  hsa_executable_symbol_t freq_sym;
+  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
+          executable, "__llvm_libc_clock_freq", &dev_agent, &freq_sym))
+    handle_error(err);
+
+  void *freq_addr;
+  if (hsa_status_t err = hsa_executable_symbol_get_info(
+          freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &freq_addr))
+    handle_error(err);
+
+  if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
+                                    host_agent, sizeof(uint64_t)))
+    handle_error(err);
+
   // Obtain a queue with the minimum (power of two) size, used to send commands
   // to the HSA runtime and launch execution on the device.
   uint64_t queue_size;
@@ -414,12 +466,6 @@
                                            coarsegrained_pool, queue, params,
                                            "_start.kd", args))
     handle_error(err);
 
-  // Create a memory signal and copy the return value back from the device into
-  // a new buffer.
-  hsa_signal_t memory_signal;
-  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
-    handle_error(err);
-
   void *host_ret;
   if (hsa_status_t err =
           hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
@@ -428,14 +474,9 @@
   hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);
 
   if (hsa_status_t err =
-          hsa_amd_memory_async_copy(host_ret, host_agent, dev_ret, dev_agent,
-                                    sizeof(int), 0, nullptr, memory_signal))
+          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
     handle_error(err);
 
-  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
-                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
-    ;
-
   // Save the return value and perform basic clean-up.
   int ret = *static_cast<int *>(host_ret);
@@ -458,8 +499,6 @@
   if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
     handle_error(err);
 
-  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
-    handle_error(err);
   if (hsa_status_t err = hsa_queue_destroy(queue))
     handle_error(err);
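With the copy path factored into hsa_memcpy, both transfer directions share one create/copy/wait/destroy sequence. A hedged usage sketch for the host-to-device direction, reusing finegrained_pool, dev_agent, and host_agent from the loader above with a hypothetical device buffer device_dst (error checks elided for brevity):

  // Stage a host value in fine-grained memory the device may access, then
  // copy it over with the helper, as the loader does for the clock frequency.
  uint64_t value = 42;
  void *staging;
  hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                               /*flags=*/0, &staging);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, staging);
  *static_cast<uint64_t *>(staging) = value;
  hsa_memcpy(device_dst, dev_agent, staging, host_agent, sizeof(uint64_t));
  hsa_amd_memory_pool_free(staging);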