diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -142,9 +142,7 @@
 
 /// Returns the current value of the GPU's processor clock.
 LIBC_INLINE uint64_t processor_clock() {
-  uint64_t timestamp;
-  LIBC_INLINE_ASM("mov.u64  %0, %%clock64;" : "=l"(timestamp));
-  return timestamp;
+  return __nvvm_read_ptx_sreg_clock64(); // Replaces the inline-asm read.
 }
 
 /// Returns a global fixed-frequency timer at nanosecond frequency.
diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt
--- a/libc/utils/gpu/CMakeLists.txt
+++ b/libc/utils/gpu/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(server)
 add_subdirectory(loader)
+add_subdirectory(timing) # GPU latency/overhead timing headers.
diff --git a/libc/utils/gpu/timing/CMakeLists.txt b/libc/utils/gpu/timing/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/CMakeLists.txt
@@ -0,0 +1,16 @@
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) # Nothing is added for non-GPU builds.
+  return()
+endif()
+
+foreach(target nvptx amdgpu) # One subdirectory per GPU target listed here.
+  add_subdirectory(${target})
+  list(APPEND target_gpu_timing libc.utils.gpu.timing.${target}.${target}_timing)
+endforeach()
+
+add_header_library( # Umbrella header depending on every per-target library.
+  timing
+  HDRS
+    timing.h
+  DEPENDS
+    ${target_gpu_timing}
+)
diff --git a/libc/utils/gpu/timing/amdgpu/CMakeLists.txt b/libc/utils/gpu/timing/amdgpu/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/amdgpu/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_header_library(
+  amdgpu_timing
+  HDRS
+    timing.h
+  DEPENDS
+    libc.src.__support.common # NOTE(review): timing.h also includes GPU/utils.h and macros/{attributes,config}.h; confirm whether they need DEPENDS entries.
+)
diff --git a/libc/utils/gpu/timing/amdgpu/timing.h b/libc/utils/gpu/timing/amdgpu/timing.h
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/amdgpu/timing.h
@@ -0,0 +1,73 @@
+//===------------- AMDGPU implementation of timing utils --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc {
+
+// Returns the constant overhead, in clock cycles, of the profiling region
+// itself. Subtracting it from a measured latency isolates the cost of the
+// function under test. The value can vary with system load.
+[[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
+  __builtin_amdgcn_s_waitcnt(0); // Wait on pending results before timing.
+  uint64_t start = gpu::processor_clock();
+  uint32_t result = 0; // Dummy value standing in for a function result.
+  asm volatile("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm volatile("" ::"s"(start)); // Empty asm that consumes `start`.
+  uint64_t stop = gpu::processor_clock();
+  return stop - start;
+}
+
+// Profile a simple function and obtain its latency in clock cycles on the
+// system. This function cannot be inlined or else it will disturb the very
+// delicate balance of hard-coded dependencies.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+  // Round-trip the input through a volatile of its own type T so the compiler
+  // cannot constant propagate it; the value reaches `f` unconverted.
+  volatile T storage = t;
+  T arg = storage;
+  asm volatile("" ::"s"(arg));
+
+  // The AMDGPU architecture needs to wait on pending results.
+  __builtin_amdgcn_s_waitcnt(0);
+  // Get the current timestamp from the clock.
+  uint64_t start = gpu::processor_clock();
+
+  // This forces the compiler to load the input argument and run the clock cycle
+  // counter before the profiling region.
+  asm volatile("" ::"s"(arg), "s"(start));
+
+  // Run the function under test and keep its value for the no-op below.
+  auto result = f(arg);
+
+  // This inline assembly performs a no-op which forces the result to both be
+  // used and prevents us from exiting this region before it's complete.
+  asm volatile("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+
+  // Obtain the current timestamp after running the calculation and force
+  // ordering.
+  uint64_t stop = gpu::processor_clock();
+  asm volatile("" ::"s"(stop));
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+
+  // Return the time elapsed, in clock cycles.
+  return stop - start;
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
diff --git a/libc/utils/gpu/timing/nvptx/CMakeLists.txt b/libc/utils/gpu/timing/nvptx/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/nvptx/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_header_library(
+  nvptx_timing
+  HDRS
+    timing.h
+  DEPENDS
+    libc.src.__support.common # NOTE(review): timing.h also includes GPU/utils.h and macros/{attributes,config}.h; confirm whether they need DEPENDS entries.
+)
diff --git a/libc/utils/gpu/timing/nvptx/timing.h b/libc/utils/gpu/timing/nvptx/timing.h
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/nvptx/timing.h
@@ -0,0 +1,82 @@
+//===------------- NVPTX implementation of timing utils ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc {
+
+// Returns the constant overhead, in clock cycles, of the profiling region
+// itself. Subtracting it from a measured latency isolates the cost of the
+// function under test. The value can vary with system load.
+[[gnu::noinline]] static uint64_t overhead() {
+  volatile uint32_t x = 1; // Volatile so the load cannot be folded away.
+  uint32_t y = x;
+  gpu::sync_threads();
+  uint64_t start = gpu::processor_clock();
+  asm volatile("" ::"r"(y), "l"(start)); // 64-bit `start` takes "l".
+  uint32_t result = y;
+  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  uint64_t stop = gpu::processor_clock();
+  gpu::sync_threads();
+  volatile auto storage = result; // Volatile store so `result` stays used.
+  return stop - start;
+}
+
+// Profile a simple function and obtain its latency in clock cycles on the
+// system. This function cannot be inlined or else it will disturb the very
+// delicate balance of hard-coded dependencies.
+//
+// FIXME: This does not work in general on NVPTX because of further
+// optimizations ptxas performs. The only way to get consistent results is to
+// pass an extra "SHELL:-Xcuda-ptxas -O0" to CMake's compiler flag. This
+// negatively impacts performance but it is at least stable.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+  // We need to store the input somewhere to guarantee that the compiler will
+  // not constant propagate it and remove the profiling region.
+  volatile T storage = t;
+  T arg = storage;
+  asm volatile("" ::"r"(arg));
+
+  // Get the current timestamp from the clock.
+  gpu::sync_threads();
+  uint64_t start = gpu::processor_clock();
+
+  // Force the input load and the clock read to happen before the profiling
+  // region. `start` is 64 bits wide, so it takes the "l" constraint.
+  asm volatile("" ::"r"(arg), "l"(start));
+
+  // Run the function under test and keep its value for the no-op below.
+  auto result = f(arg);
+
+  // This inline assembly performs a no-op which forces the result to both be
+  // used and prevents us from exiting this region before it's complete.
+  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+  // Obtain the current timestamp after running the calculation and force
+  // ordering; the 64-bit `stop` likewise uses the "l" constraint.
+  uint64_t stop = gpu::processor_clock();
+  gpu::sync_threads();
+  asm volatile("" ::"l"(stop));
+  volatile auto output = result; // Stored in the result's own type.
+
+  // Return the time elapsed, in clock cycles.
+  return stop - start;
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
diff --git a/libc/utils/gpu/timing/timing.h b/libc/utils/gpu/timing/timing.h
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/timing.h
@@ -0,0 +1,22 @@
+//===------------- Implementation of GPU timing utils -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_H
+#define LLVM_LIBC_UTILS_GPU_TIMING_H
+
+#include "src/__support/macros/properties/architectures.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#include "amdgpu/timing.h"
+#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
+#include "nvptx/timing.h"
+#else
+#error "unsupported platform"
+#endif
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_H