diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -142,9 +142,9 @@
 
 /// Returns the current value of the GPU's processor clock.
 LIBC_INLINE uint64_t processor_clock() {
-  uint64_t timestamp;
-  LIBC_INLINE_ASM("mov.u64 %0, %%clock64;" : "=l"(timestamp));
-  return timestamp;
+  // Use the NVVM builtin for the clock64 special register in place of the
+  // hand-written inline PTX `mov` that this function used previously.
+  return __nvvm_read_ptx_sreg_clock64();
 }
 
 /// Returns a global fixed-frequency timer at nanosecond frequency.
diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt
--- a/libc/utils/gpu/CMakeLists.txt
+++ b/libc/utils/gpu/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(server)
 add_subdirectory(loader)
+add_subdirectory(timing)
diff --git a/libc/utils/gpu/timing/CMakeLists.txt b/libc/utils/gpu/timing/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Skip this directory entirely unless the libc target architecture is a GPU.
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  return()
+endif()
+
+# Add each GPU target's subdirectory and record its timing header library.
+foreach(target nvptx amdgpu)
+  add_subdirectory(${target})
+  list(APPEND target_gpu_timing libc.utils.gpu.timing.${target}.${target}_timing)
+endforeach()
+
+# Umbrella header library that depends on every per-target library above.
+add_header_library(
+  timing
+  HDRS
+    timing.h
+  DEPENDS
+    ${target_gpu_timing}
+)
diff --git a/libc/utils/gpu/timing/amdgpu/CMakeLists.txt b/libc/utils/gpu/timing/amdgpu/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/amdgpu/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_header_library(
+  amdgpu_timing
+  HDRS
+    timing.h
+  DEPENDS
+    libc.src.__support.common
+)
diff --git a/libc/utils/gpu/timing/amdgpu/timing.h b/libc/utils/gpu/timing/amdgpu/timing.h
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/amdgpu/timing.h
@@ -0,0 +1,73 @@
+//===------------- AMDGPU implementation of timing utils --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc {
+
+// Returns the overhead associated with calling the profiling region. This
+// allows us to subtract the constant-time overhead from the latency to
+// obtain a true result. This can vary with system load.
+[[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
+  __builtin_amdgcn_s_waitcnt(0);
+  uint64_t start = gpu::processor_clock();
+  uint32_t result = 0;
+  asm volatile("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+  asm volatile("" ::"s"(start));
+  uint64_t stop = gpu::processor_clock();
+  return stop - start;
+}
+
+// Profile a simple function and obtain its latency in clock cycles on the
+// system. This function cannot be inlined or else it will disturb the very
+// delicate balance of hard-coded dependencies.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+  // Round-trip the input through a volatile of its own type T so the compiler
+  // cannot constant propagate it and remove the profiling region.
+  volatile T storage = t;
+  T arg = storage;
+  asm volatile("" ::"s"(arg));
+
+  // The AMDGPU architecture needs to wait on pending results.
+  __builtin_amdgcn_s_waitcnt(0);
+  // Get the current timestamp from the clock.
+  uint64_t start = gpu::processor_clock();
+
+  // This forces the compiler to load the input argument and run the clock cycle
+  // counter before the profiling region.
+  asm volatile("" ::"s"(arg), "s"(start));
+
+  // Run the function under test and capture its value.
+  auto result = f(arg);
+
+  // This inline assembly performs a no-op which forces the result to both be
+  // used and prevents us from exiting this region before it's complete.
+  asm volatile("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+
+  // Obtain the current timestamp after running the calculation and force
+  // ordering.
+  uint64_t stop = gpu::processor_clock();
+  asm volatile("" ::"s"(stop));
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+
+  // Return the time elapsed.
+  return stop - start;
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
diff --git a/libc/utils/gpu/timing/nvptx/CMakeLists.txt b/libc/utils/gpu/timing/nvptx/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/nvptx/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_header_library(
+  nvptx_timing
+  HDRS
+    timing.h
+  DEPENDS
+    libc.src.__support.common
+)
diff --git a/libc/utils/gpu/timing/nvptx/timing.h b/libc/utils/gpu/timing/nvptx/timing.h
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/nvptx/timing.h
@@ -0,0 +1,82 @@
+//===------------- NVPTX implementation of timing utils ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc {
+
+// Returns the overhead associated with calling the profiling region. This
+// allows us to subtract the constant-time overhead from the latency to
+// obtain a true result. This can vary with system load.
+[[gnu::noinline]] static uint64_t overhead() {
+  volatile uint32_t x = 1;
+  uint32_t y = x;
+  gpu::sync_threads();
+  uint64_t start = gpu::processor_clock();
+  asm volatile("" ::"r"(y), "l"(start));
+  uint32_t result = y;
+  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+  uint64_t stop = gpu::processor_clock();
+  gpu::sync_threads();
+  volatile auto storage = result;
+  return stop - start;
+}
+
+// Profile a simple function and obtain its latency in clock cycles on the
+// system. This function cannot be inlined or else it will disturb the very
+// delicate balance of hard-coded dependencies.
+//
+// FIXME: This does not work in general on NVPTX because of further
+// optimizations ptxas performs. The only way to get consistent results is to
+// pass an extra "SHELL:-Xcuda-ptxas -O0" to CMake's compiler flag. This
+// negatively impacts performance but it is at least stable.
+template <typename F, typename T>
+[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
+  // We need to store the input somewhere to guarantee that the compiler will
+  // not constant propagate it and remove the profiling region.
+  volatile T storage = t;
+  T arg = storage;
+  asm volatile("" ::"r"(arg));
+
+  // Get the current timestamp from the clock.
+  gpu::sync_threads();
+  uint64_t start = gpu::processor_clock();
+
+  // This forces the compiler to load the input argument and run the clock cycle
+  // counter before the profiling region. The timestamp is 64 bits wide.
+  asm volatile("" ::"r"(arg), "l"(start));
+
+  // Run the function under test and capture its value.
+  auto result = f(arg);
+
+  // This inline assembly performs a no-op which forces the result to both be
+  // used and prevents us from exiting this region before it's complete.
+  asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+
+  // Obtain the current timestamp after running the calculation and force
+  // ordering.
+  uint64_t stop = gpu::processor_clock();
+  gpu::sync_threads();
+  asm volatile("" ::"l"(stop));
+  volatile T output = result;
+
+  // Return the time elapsed.
+  return stop - start;
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
diff --git a/libc/utils/gpu/timing/timing.h b/libc/utils/gpu/timing/timing.h
new file mode 100644
--- /dev/null
+++ b/libc/utils/gpu/timing/timing.h
@@ -0,0 +1,22 @@
+//===------------- Implementation of GPU timing utils -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_GPU_TIMING_H
+#define LLVM_LIBC_UTILS_GPU_TIMING_H
+
+#include "src/__support/macros/properties/architectures.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#include "amdgpu/timing.h"
+#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
+#include "nvptx/timing.h"
+#else
+#error "unsupported platform"
+#endif
+
+#endif // LLVM_LIBC_UTILS_GPU_TIMING_H