diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -14,11 +14,114 @@
 #include <stdint.h>
 
 namespace __llvm_libc {
+namespace gpu {
 
+/// The number of threads that execute in lock-step in a lane.
+constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;
+
+/// Returns the number of workgroups in the 'x' dimension of the grid.
+LIBC_INLINE uint32_t get_num_blocks_x() {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+/// Returns the number of workgroups in the 'y' dimension of the grid.
+LIBC_INLINE uint32_t get_num_blocks_y() {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+/// Returns the number of workgroups in the 'z' dimension of the grid.
+LIBC_INLINE uint32_t get_num_blocks_z() {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+/// Returns the 'x' dimension of the current AMD workgroup's id.
 LIBC_INLINE uint32_t get_block_id_x() {
   return __builtin_amdgcn_workgroup_id_x();
 }
 
+/// Returns the 'y' dimension of the current AMD workgroup's id.
+LIBC_INLINE uint32_t get_block_id_y() {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+/// Returns the 'z' dimension of the current AMD workgroup's id.
+LIBC_INLINE uint32_t get_block_id_z() {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+/// Returns the absolute id of the AMD workgroup.
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+/// Returns the number of workitems in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_threads_x() {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+/// Returns the number of workitems in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_threads_y() {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+/// Returns the number of workitems in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_threads_z() {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+/// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+LIBC_INLINE uint32_t get_thread_id_x() {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+/// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+LIBC_INLINE uint32_t get_thread_id_y() {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+/// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+LIBC_INLINE uint32_t get_thread_id_z() {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+/// Returns the absolute id of the thread in the current AMD workgroup.
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
+/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware.
+LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+
+/// Returns the id of the thread inside of an AMD wavefront executing together.
+[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
+  if (LANE_SIZE == 64)
+    return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+  else
+    return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
+}
+
+/// Returns the bit-mask of active threads in the current wavefront.
+[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
+  return __builtin_amdgcn_read_exec();
+}
+
+/// Copies the value from the first active thread in the wavefront to the rest.
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
+  return __builtin_amdgcn_readfirstlane(x);
+}
+
+/// Waits for all the threads in the block to converge and issues a fence.
+[[clang::convergent]] LIBC_INLINE void sync_threads() {
+  __builtin_amdgcn_s_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+}
+
+/// Waits for all threads in the wavefront to converge; this is a no-op on AMDGPU.
+[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {}
+
+} // namespace gpu
 } // namespace __llvm_libc
 
 #endif
diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
--- a/libc/src/__support/GPU/generic/utils.h
+++ b/libc/src/__support/GPU/generic/utils.h
@@ -14,9 +14,51 @@
 #include <stdint.h>
 
 namespace __llvm_libc {
+namespace gpu {
+
+constexpr const uint64_t LANE_SIZE = 1;
+
+LIBC_INLINE uint32_t get_num_blocks_x() { return 1; }
+
+LIBC_INLINE uint32_t get_num_blocks_y() { return 1; }
+
+LIBC_INLINE uint32_t get_num_blocks_z() { return 1; }
 
 LIBC_INLINE uint32_t get_block_id_x() { return 0; }
 
+LIBC_INLINE uint32_t get_block_id_y() { return 0; }
+
+LIBC_INLINE uint32_t get_block_id_z() { return 0; }
+
+LIBC_INLINE uint64_t get_block_id() { return 0; }
+
+LIBC_INLINE uint32_t get_num_threads_x() { return 1; }
+
+LIBC_INLINE uint32_t get_num_threads_y() { return 1; }
+
+LIBC_INLINE uint32_t get_num_threads_z() { return 1; }
+
+LIBC_INLINE uint32_t get_thread_id_x() { return 0; }
+
+LIBC_INLINE uint32_t get_thread_id_y() { return 0; }
+
+LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
+
+LIBC_INLINE uint64_t get_thread_id() { return 0; }
+
+LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+
+LIBC_INLINE uint32_t get_lane_id() { return 0; }
+
+LIBC_INLINE uint64_t get_lane_mask() { return 1; }
+
+LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }
+
+LIBC_INLINE void sync_threads() {}
+
+LIBC_INLINE void sync_lane(uint64_t) {}
+
+} // namespace gpu
 } // namespace __llvm_libc
 
 #endif
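Aside, not part of the patch: the absolute-id helpers in the AMDGPU and NVPTX headers linearize a 3-D coordinate in row-major order. A minimal standalone sketch with made-up dimensions, just to make the formula concrete (the helper name here is hypothetical):

#include <stdint.h>

// Mirrors get_block_id() / get_thread_id(): id = x + nx * y + nx * ny * z.
constexpr uint64_t linear_id(uint64_t x, uint64_t y, uint64_t z, uint64_t nx,
                             uint64_t ny) {
  return x + nx * y + nx * ny * z;
}

// For a (4, 2, 2) grid, coordinate (1, 1, 1) maps to 1 + 4*1 + 4*2*1 == 13.
static_assert(linear_id(1, 1, 1, 4, 2) == 13, "row-major linearization");
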
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -14,9 +14,113 @@
 #include <stdint.h>
 
 namespace __llvm_libc {
+namespace gpu {
 
+/// The number of threads that execute in lock-step in a warp.
+constexpr const uint64_t LANE_SIZE = 32;
+
+/// Returns the number of CUDA blocks in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_blocks_x() {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+/// Returns the number of CUDA blocks in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_blocks_y() {
+  return __nvvm_read_ptx_sreg_nctaid_y();
+}
+
+/// Returns the number of CUDA blocks in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_blocks_z() {
+  return __nvvm_read_ptx_sreg_nctaid_z();
+}
+
+/// Returns the 'x' dimension of the current CUDA block's id.
 LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }
 
+/// Returns the 'y' dimension of the current CUDA block's id.
+LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); }
+
+/// Returns the 'z' dimension of the current CUDA block's id.
+LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); }
+
+/// Returns the absolute id of the CUDA block.
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+/// Returns the number of CUDA threads in the 'x' dimension.
+LIBC_INLINE uint32_t get_num_threads_x() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+/// Returns the number of CUDA threads in the 'y' dimension.
+LIBC_INLINE uint32_t get_num_threads_y() {
+  return __nvvm_read_ptx_sreg_ntid_y();
+}
+
+/// Returns the number of CUDA threads in the 'z' dimension.
+LIBC_INLINE uint32_t get_num_threads_z() {
+  return __nvvm_read_ptx_sreg_ntid_z();
+}
+
+/// Returns the 'x' dimension id of the thread in the current CUDA block.
+LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }
+
+/// Returns the 'y' dimension id of the thread in the current CUDA block.
+LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); }
+
+/// Returns the 'z' dimension id of the thread in the current CUDA block.
+LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); }
+
+/// Returns the absolute id of the thread in the current CUDA block.
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
+/// Returns the size of a CUDA warp.
+LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+
+/// Returns the id of the thread inside of a CUDA warp executing together.
+[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
+  return get_thread_id() & (get_lane_size() - 1);
+}
+
+/// Returns the bit-mask of active threads in the current warp.
+[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
+  uint32_t mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
+}
+
+/// Copies the value from the first active thread in the warp to the rest.
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
+  // NOTE: This is not sufficient in all cases on Volta hardware or later. The
+  // lane mask returned here is not always the true lane mask used by the
+  // intrinsics in cases of incidental or enforced divergence by the user.
+  uint64_t lane_mask = get_lane_mask();
+  uint32_t id = __builtin_ffsl(lane_mask) - 1;
+#if __CUDA_ARCH__ >= 600
+  return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
+#else
+  return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
+#endif
+}
+
+/// Waits for all the threads in the block to converge and issues a fence.
+[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
+
+/// Waits for all threads in the warp to reconverge for independent scheduling.
+[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
+#if __CUDA_ARCH__ >= 700
+  __nvvm_bar_warp_sync(mask);
+#else
+  (void)mask;
+#endif
+}
+
+} // namespace gpu
 } // namespace __llvm_libc
 
 #endif
diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
--- a/libc/src/__support/RPC/CMakeLists.txt
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -20,5 +20,6 @@
   HDRS
     rpc_client.h
   DEPENDS
+    libc.src.__support.GPU.utils
     .rpc
 )
diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -4,6 +4,7 @@
     start.cpp
   DEPENDS
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.GPU.utils
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -6,16 +6,38 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 
-static __llvm_libc::cpp::Atomic<uint32_t> lock;
-
 extern "C" int main(int argc, char **argv, char **envp);
 
+namespace __llvm_libc {
+
+static cpp::Atomic<uint32_t> lock = 0;
+
+static cpp::Atomic<uint32_t> init = 0;
+
+void init_rpc(void *in, void *out, void *buffer) {
+  // Only a single thread should update the RPC data.
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    rpc::client.reset(&lock, in, out, buffer);
+    init.store(1, cpp::MemoryOrder::RELAXED);
+  }
+
+  // Wait until the previous thread signals that the data has been written.
+  while (!init.load(cpp::MemoryOrder::RELAXED))
+    rpc::sleep_briefly();
+
+  // Wait for the threads in the block to converge and fence the write.
+  gpu::sync_threads();
+}
+
+} // namespace __llvm_libc
+
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::rpc::client.reset(&lock, in, out, buffer);
+  __llvm_libc::init_rpc(in, out, buffer);
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -5,6 +5,7 @@
     start.cpp
   DEPENDS
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.GPU.utils
  COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
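Aside, not part of the patch: the init_rpc routines below implement single-writer initialization. One designated thread publishes the client state, every other thread spins on a flag, and the block-wide barrier plus fence in gpu::sync_threads() orders the published writes for all readers. A minimal host-side sketch of the same shape, with hypothetical names and std::atomic release/acquire standing in for the GPU's relaxed flag plus block barrier:

#include <atomic>

static std::atomic<int> ready{0};
static int shared_state = 0; // stands in for the RPC client state

// Hypothetical helper: 'is_leader' plays the role of thread 0 in block 0.
void init_once(bool is_leader) {
  if (is_leader) {
    shared_state = 42;                         // publish the shared data
    ready.store(1, std::memory_order_release); // then raise the flag
  }
  // Everyone else spins until the leader has published.
  while (!ready.load(std::memory_order_acquire))
    ;
}
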
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -6,16 +6,38 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 
-static __llvm_libc::cpp::Atomic<uint32_t> lock;
-
 extern "C" int main(int argc, char **argv, char **envp);
 
-extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
+namespace __llvm_libc {
+
+static cpp::Atomic<uint32_t> lock = 0;
+
+static cpp::Atomic<uint32_t> init = 0;
+
+void init_rpc(void *in, void *out, void *buffer) {
+  // Only a single thread should update the RPC data.
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    rpc::client.reset(&lock, in, out, buffer);
+    init.store(1, cpp::MemoryOrder::RELAXED);
+  }
+
+  // Wait until the previous thread signals that the data has been written.
+  while (!init.load(cpp::MemoryOrder::RELAXED))
+    rpc::sleep_briefly();
+
+  // Wait for the threads in the block to converge and fence the write.
+  gpu::sync_threads();
+}
+
+} // namespace __llvm_libc
+
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::rpc::client.reset(&lock, in, out, buffer);
+  __llvm_libc::init_rpc(in, out, buffer);
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -13,7 +13,7 @@
 using namespace __llvm_libc;
 
 static void test_add_simple() {
-  uint32_t num_additions = 1000 + 10 * get_block_id_x();
+  uint32_t num_additions = 1000 + 10 * gpu::get_block_id_x();
   uint64_t cnt = 0;
   for (uint32_t i = 0; i < num_additions; ++i) {
     rpc::Port port = rpc::client.open(rpc::TEST_INCREMENT);