diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -14,11 +14,114 @@
 #include <stdint.h>
 
 namespace __llvm_libc {
+namespace gpu {
 
+/// The number of threads that execute in lock-step in a lane.
+constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;
+
+/// Returns the number of workgroups in the 'x' dimension of the grid.
+LIBC_INLINE uint32_t get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+/// Returns the number of workgroups in the 'y' dimension of the grid.
+LIBC_INLINE uint32_t get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+/// Returns the number of workgroups in the 'z' dimension of the grid.
+LIBC_INLINE uint32_t get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+/// Returns the 'x' dimension of the current AMD workgroup's id.
 LIBC_INLINE uint32_t get_block_id_x() {
   return __builtin_amdgcn_workgroup_id_x();
 }
 
+/// Returns the 'y' dimension of the current AMD workgroup's id.
+LIBC_INLINE uint32_t get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+/// Returns the 'z' dimension of the current AMD workgroup's id.
+LIBC_INLINE uint32_t get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+/// Returns the absolute id of the AMD workgroup.
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+/// Returns the number of workitems in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+/// Returns the number of workitems in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+/// Returns the number of workitems in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+/// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+LIBC_INLINE uint32_t get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+/// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+LIBC_INLINE uint32_t get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+/// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+LIBC_INLINE uint32_t get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+/// Returns the absolute id of the thread in the current AMD workgroup.
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
+/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware.
+LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+
+/// Returns the id of the thread inside of an AMD wavefront executing together.
+[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
+  if (LANE_SIZE == 64)
+    return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+  else
+    return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
+}
+
+/// Returns the bit-mask of active threads in the current wavefront.
+[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
+  return __builtin_amdgcn_read_exec();
+}
+
+/// Copies the value from the first active thread in the wavefront to the rest.
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
+  return __builtin_amdgcn_readfirstlane(x);
+}
+
+/// Waits for all the threads in the block to converge and issues a fence.
+[[clang::convergent]] LIBC_INLINE void sync_threads() {
+  __builtin_amdgcn_s_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+}
+
+/// Waits for all threads in the wavefront to converge; this is a no-op on AMDGPU.
+[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {}
+
+} // namespace gpu
 } // namespace __llvm_libc
 
 #endif
diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
--- a/libc/src/__support/GPU/generic/utils.h
+++ b/libc/src/__support/GPU/generic/utils.h
@@ -14,9 +14,51 @@
 #include <stdint.h>
 
 namespace __llvm_libc {
+namespace gpu {
+
+constexpr const uint64_t LANE_SIZE = 1;
+
+LIBC_INLINE uint32_t get_num_blocks_x() { return 1; }
+
+LIBC_INLINE uint32_t get_num_blocks_y() { return 1; }
+
+LIBC_INLINE uint32_t get_num_blocks_z() { return 1; }
 
 LIBC_INLINE uint32_t get_block_id_x() { return 0; }
 
+LIBC_INLINE uint32_t get_block_id_y() { return 0; }
+
+LIBC_INLINE uint32_t get_block_id_z() { return 0; }
+
+LIBC_INLINE uint64_t get_block_id() { return 0; }
+
+LIBC_INLINE uint32_t get_num_threads_x() { return 1; }
+
+LIBC_INLINE uint32_t get_num_threads_y() { return 1; }
+
+LIBC_INLINE uint32_t get_num_threads_z() { return 1; }
+
+LIBC_INLINE uint32_t get_thread_id_x() { return 0; }
+
+LIBC_INLINE uint32_t get_thread_id_y() { return 0; }
+
+LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
+
+LIBC_INLINE uint64_t get_thread_id() { return 0; }
+
+LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+
+LIBC_INLINE uint32_t get_lane_id() { return 0; }
+
+LIBC_INLINE uint64_t get_lane_mask() { return 1; }
+
+LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }
+
+LIBC_INLINE void sync_threads() {}
+
+LIBC_INLINE void sync_lane(uint64_t) {}
+
+} // namespace gpu
 } // namespace __llvm_libc
 
 #endif
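Aside, not part of the patch: the absolute-id helpers in the AMDGPU and NVPTX headers linearize a 3-D coordinate in row-major order. A minimal standalone sketch with made-up dimensions, just to make the formula concrete (the helper name here is hypothetical):

#include <stdint.h>

// Mirrors get_block_id() / get_thread_id(): id = x + nx * y + nx * ny * z.
constexpr uint64_t linear_id(uint64_t x, uint64_t y, uint64_t z, uint64_t nx,
                             uint64_t ny) {
  return x + nx * y + nx * ny * z;
}

// For a (4, 2, 2) grid, coordinate (1, 1, 1) maps to 1 + 4*1 + 4*2*1 == 13.
static_assert(linear_id(1, 1, 1, 4, 2) == 13, "row-major linearization");
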
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -14,9 +14,113 @@
 #include <stdint.h>
 
 namespace __llvm_libc {
+namespace gpu {
 
+/// The number of threads that execute in lock-step in a warp.
+constexpr const uint64_t LANE_SIZE = 32;
+
+/// Returns the number of CUDA blocks in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_blocks_x() {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+/// Returns the number of CUDA blocks in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_blocks_y() {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+/// Returns the number of CUDA blocks in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_blocks_z() {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+/// Returns the 'x' dimension of the current CUDA block's id.
 LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }
 
+/// Returns the 'y' dimension of the current CUDA block's id.
+LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); }
+
+/// Returns the 'z' dimension of the current CUDA block's id.
+LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); }
+
+/// Returns the absolute id of the CUDA block.
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+/// Returns the number of CUDA threads in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_threads_x() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+/// Returns the number of CUDA threads in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_threads_y() {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+/// Returns the number of CUDA threads in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_threads_z() {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+/// Returns the 'x' dimension id of the thread in the current CUDA block.
+LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }
+
+/// Returns the 'y' dimension id of the thread in the current CUDA block.
+LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); }
+
+/// Returns the 'z' dimension id of the thread in the current CUDA block.
+LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); }
+
+/// Returns the absolute id of the thread in the current CUDA block.
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
+/// Returns the size of a CUDA warp.
+LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+
+/// Returns the id of the thread inside of a CUDA warp executing together.
+[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
+  return get_thread_id() & (get_lane_size() - 1);
+}
+
+/// Returns the bit-mask of active threads in the current warp.
+[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
+  uint32_t mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
+}
+
+/// Copies the value from the first active thread in the warp to the rest.
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
+  // NOTE: This is not sufficient in all cases on Volta hardware or later. The
+  // lane mask returned here is not always the true lane mask used by the
+  // intrinsics in cases of incidental or enforced divergence by the user.
+  uint64_t lane_mask = get_lane_mask();
+  uint32_t id = __builtin_ffsl(lane_mask) - 1;
+#if __CUDA_ARCH__ >= 600
+  return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
+#else
+  return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
+#endif
+}
+
+/// Waits for all the threads in the block to converge and issues a fence.
+[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
+
+/// Waits for all threads in the warp to reconverge for independent scheduling.
+[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
+#if __CUDA_ARCH__ >= 700
+  __nvvm_bar_warp_sync(mask);
+#else
+  (void)mask;
+#endif
+}
+
+} // namespace gpu
 } // namespace __llvm_libc
 
 #endif
diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
--- a/libc/src/__support/RPC/CMakeLists.txt
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -20,5 +20,6 @@
   HDRS
     rpc_client.h
   DEPENDS
+    libc.src.__support.GPU.utils
     .rpc
 )
diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -4,6 +4,7 @@
     start.cpp
   DEPENDS
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.GPU.utils
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -6,16 +6,38 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 
-static __llvm_libc::cpp::Atomic<uint32_t> lock;
-
 extern "C" int main(int argc, char **argv, char **envp);
 
+namespace __llvm_libc {
+
+static cpp::Atomic<uint32_t> lock = 0;
+
+static cpp::Atomic<uint32_t> init = 0;
+
+void init_rpc(void *in, void *out, void *buffer) {
+  // Only a single thread should update the RPC data.
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    rpc::client.reset(&lock, in, out, buffer);
+    init.store(1, cpp::MemoryOrder::RELAXED);
+  }
+
+  // Wait until the previous thread signals that the data has been written.
+  while (!init.load(cpp::MemoryOrder::RELAXED))
+    rpc::sleep_briefly();
+
+  // Wait for the threads in the block to converge and fence the write.
+  gpu::sync_threads();
+}
+
+} // namespace __llvm_libc
+
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::rpc::client.reset(&lock, in, out, buffer);
+  __llvm_libc::init_rpc(in, out, buffer);
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -5,6 +5,7 @@
     start.cpp
   DEPENDS
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.GPU.utils
  COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
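Aside, not part of the patch: the init_rpc routines below implement single-writer initialization. One designated thread publishes the client state, every other thread spins on a flag, and the block-wide barrier plus fence in gpu::sync_threads() orders the published writes for all readers. A minimal host-side sketch of the same shape, with hypothetical names and std::atomic release/acquire standing in for the GPU's relaxed flag plus block barrier:

#include <atomic>

static std::atomic<int> ready{0};
static int shared_state = 0; // stands in for the RPC client state

// Hypothetical helper: 'is_leader' plays the role of thread 0 in block 0.
void init_once(bool is_leader) {
  if (is_leader) {
    shared_state = 42;                         // publish the shared data
    ready.store(1, std::memory_order_release); // then raise the flag
  }
  // Everyone else spins until the leader has published.
  while (!ready.load(std::memory_order_acquire))
    ;
}
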
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -6,16 +6,38 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 
-static __llvm_libc::cpp::Atomic<uint32_t> lock;
-
 extern "C" int main(int argc, char **argv, char **envp);
 
-extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
+namespace __llvm_libc {
+
+static cpp::Atomic<uint32_t> lock = 0;
+
+static cpp::Atomic<uint32_t> init = 0;
+
+void init_rpc(void *in, void *out, void *buffer) {
+  // Only a single thread should update the RPC data.
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    rpc::client.reset(&lock, in, out, buffer);
+    init.store(1, cpp::MemoryOrder::RELAXED);
+  }
+
+  // Wait until the previous thread signals that the data has been written.
+  while (!init.load(cpp::MemoryOrder::RELAXED))
+    rpc::sleep_briefly();
+
+  // Wait for the threads in the block to converge and fence the write.
+  gpu::sync_threads();
+}
+
+} // namespace __llvm_libc
+
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::rpc::client.reset(&lock, in, out, buffer);
+  __llvm_libc::init_rpc(in, out, buffer);
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -13,7 +13,7 @@
 using namespace __llvm_libc;
 
 static void test_add_simple() {
-  uint32_t num_additions = 1000 + 10 * get_block_id_x();
+  uint32_t num_additions = 1000 + 10 * gpu::get_block_id_x();
   uint64_t cnt = 0;
   for (uint32_t i = 0; i < num_additions; ++i) {
     rpc::Port port = rpc::client.open(rpc::TEST_INCREMENT);