diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -158,7 +158,7 @@
 
 /// Returns a fixed-frequency timestamp. The actual frequency is dependent on
 /// the card and can only be queried via the driver.
-LIBC_INLINE uint64_t fixed_frequrency_clock() {
+LIBC_INLINE uint64_t fixed_frequency_clock() {
   if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl))
     return __builtin_amdgcn_s_sendmsg_rtnl(0x83);
   else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memrealtime))
diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
--- a/libc/src/__support/GPU/generic/utils.h
+++ b/libc/src/__support/GPU/generic/utils.h
@@ -69,7 +69,7 @@
 
 LIBC_INLINE uint64_t processor_clock() { return 0; }
 
-LIBC_INLINE uint64_t fixed_frequrency_clock() { return 0; }
+LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }
 
 } // namespace gpu
 } // namespace __llvm_libc
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -142,7 +142,7 @@
 }
 
 /// Returns a global fixed-frequency timer at nanosecond frequency.
-LIBC_INLINE uint64_t fixed_frequrency_clock() {
+LIBC_INLINE uint64_t fixed_frequency_clock() {
   uint64_t nsecs;
   LIBC_INLINE_ASM("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
   return nsecs;
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -15,6 +15,12 @@
 
 namespace __llvm_libc {
 
+// The AMDGPU architecture provides a fixed frequency clock used for obtaining
+// real time. However, the frequency of this clock varies between cards and can
+// only be obtained via the driver. The loader will set this so we can use it.
+extern "C" [[gnu::visibility("protected")]] uint64_t
+    [[clang::address_space(4)]] __llvm_libc_clock_freq = 0;
+
 extern "C" uintptr_t __init_array_start[];
 extern "C" uintptr_t __init_array_end[];
 extern "C" uintptr_t __fini_array_start[];
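Note on how the pieces above fit together: the device reports raw ticks via fixed_frequency_clock(), and __llvm_libc_clock_freq holds the tick rate that the loader writes in before launch. A minimal sketch of the composition, assuming only those two names from the diffs above (the helper ticks_to_usec and its scaling are illustrative, not the library's actual clock() definition):

  // Hypothetical device-side helper: convert raw ticks to microseconds.
  // Assumes the loader has already populated __llvm_libc_clock_freq.
  extern "C" uint64_t __llvm_libc_clock_freq;

  static inline uint64_t ticks_to_usec(uint64_t ticks) {
    return (ticks * 1000000) / __llvm_libc_clock_freq;
  }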
diff --git a/libc/test/UnitTest/LibcTest.cpp b/libc/test/UnitTest/LibcTest.cpp
--- a/libc/test/UnitTest/LibcTest.cpp
+++ b/libc/test/UnitTest/LibcTest.cpp
@@ -15,6 +15,16 @@
 
 #if __STDC_HOSTED__
 #include <time.h>
+#elif defined(LIBC_TARGET_ARCH_IS_GPU)
+#include "src/__support/GPU/utils.h"
+static long clock() { return __llvm_libc::gpu::fixed_frequency_clock(); }
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
+uint64_t CLOCKS_PER_SEC = 1000000000UL;
+#else
+// The AMDGPU loader needs to initialize this at runtime by querying the driver.
+extern "C" [[gnu::visibility("protected")]] uint64_t __llvm_libc_clock_freq;
+uint64_t CLOCKS_PER_SEC = __llvm_libc_clock_freq;
+#endif
 #else
 static long clock() { return 0; }
 #define CLOCKS_PER_SEC 1
@@ -136,14 +146,22 @@
     break;
   case RunContext::RunResult::Pass:
     tlog << GREEN << "[       OK ] " << RESET << TestName;
-#if __STDC_HOSTED__
+#if __STDC_HOSTED__ || defined(LIBC_TARGET_ARCH_IS_GPU)
     tlog << " (took ";
     if (start_time > end_time) {
       tlog << "unknown - try rerunning)\n";
     } else {
       const auto duration = end_time - start_time;
-      const uint64_t duration_ms = duration * 1000 / CLOCKS_PER_SEC;
-      tlog << duration_ms << " ms)\n";
+      const uint64_t duration_ms = (duration * 1000) / CLOCKS_PER_SEC;
+      const uint64_t duration_us = (duration * 1000 * 1000) / CLOCKS_PER_SEC;
+      const uint64_t duration_ns =
+          (duration * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
+      if (duration_ms != 0)
+        tlog << duration_ms << " ms)\n";
+      else if (duration_us != 0)
+        tlog << duration_us << " us)\n";
+      else
+        tlog << duration_ns << " ns)\n";
     }
 #else
     tlog << '\n';
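The widened guard means GPU targets now report elapsed time as well, and the cascading check prints the largest unit with a nonzero count. For example, at the NVPTX rate of CLOCKS_PER_SEC = 1000000000, a duration of 4200 ticks gives duration_ms == 0 and duration_us == 4, so the log reads "4 us". A standalone sketch of the same selection logic (format_duration is a hypothetical name, not part of the patch):

  #include <cstdint>
  #include <string>

  // Mirrors the unit selection above: fall from ms to us to ns until a
  // nonzero count is found. Long durations take the ms branch before the
  // nanosecond product could overflow uint64_t at realistic clock rates.
  std::string format_duration(uint64_t duration, uint64_t clocks_per_sec) {
    const uint64_t ms = (duration * 1000) / clocks_per_sec;
    if (ms != 0)
      return std::to_string(ms) + " ms";
    const uint64_t us = (duration * 1000 * 1000) / clocks_per_sec;
    if (us != 0)
      return std::to_string(us) + " us";
    return std::to_string((duration * 1000 * 1000 * 1000) / clocks_per_sec) +
           " ns";
  }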
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -264,6 +264,30 @@
   return HSA_STATUS_SUCCESS;
 }
 
+/// Copies data from the source agent to the destination agent. The source
+/// memory must first be pinned explicitly or allocated via HSA.
+static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
+                               const void *src, hsa_agent_t src_agent,
+                               uint64_t size) {
+  // Create a memory signal to copy information between the host and device.
+  hsa_signal_t memory_signal;
+  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
+    return err;
+
+  if (hsa_status_t err = hsa_amd_memory_async_copy(
+          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
+    return err;
+
+  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
+                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
+    ;
+
+  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
+    return err;
+
+  return HSA_STATUS_SUCCESS;
+}
+
 int load(int argc, char **argv, char **envp, void *image, size_t size,
          const LaunchParameters &params) {
   // Initialize the HSA runtime used to communicate with the device.
@@ -388,6 +412,34 @@
                                              wavefront_size, rpc_alloc, &tuple))
     handle_error(err);
 
+  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
+  void *host_clock_freq;
+  if (hsa_status_t err =
+          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
+                                       /*flags=*/0, &host_clock_freq))
+    handle_error(err);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);
+
+  if (hsa_status_t err = hsa_agent_get_info(
+          dev_agent,
+          static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
+          host_clock_freq))
+    handle_error(err);
+
+  hsa_executable_symbol_t freq_sym;
+  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
+          executable, "__llvm_libc_clock_freq", &dev_agent, &freq_sym))
+    handle_error(err);
+
+  void *freq_addr;
+  if (hsa_status_t err = hsa_executable_symbol_get_info(
+          freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &freq_addr))
+    handle_error(err);
+
+  if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
+                                    host_agent, sizeof(uint64_t)))
+    handle_error(err);
+
   // Obtain a queue with the minimum (power of two) size, used to send commands
   // to the HSA runtime and launch execution on the device.
   uint64_t queue_size;
@@ -414,12 +466,6 @@
                                            coarsegrained_pool, queue, params,
                                            "_start.kd", args))
     handle_error(err);
 
-  // Create a memory signal and copy the return value back from the device into
-  // a new buffer.
-  hsa_signal_t memory_signal;
-  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
-    handle_error(err);
-
   void *host_ret;
   if (hsa_status_t err =
           hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
@@ -428,14 +474,9 @@
   hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);
 
   if (hsa_status_t err =
-          hsa_amd_memory_async_copy(host_ret, host_agent, dev_ret, dev_agent,
-                                    sizeof(int), 0, nullptr, memory_signal))
+          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
     handle_error(err);
 
-  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
-                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
-    ;
-
   // Save the return value and perform basic clean-up.
   int ret = *static_cast<int *>(host_ret);
@@ -458,8 +499,6 @@
   if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
     handle_error(err);
 
-  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
-    handle_error(err);
   if (hsa_status_t err = hsa_queue_destroy(queue))
     handle_error(err);
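With the copy path factored into hsa_memcpy, both transfer directions share one create/copy/wait/destroy sequence. A hedged usage sketch for the host-to-device direction, reusing finegrained_pool, dev_agent, and host_agent from the loader above with a hypothetical device buffer device_dst (error checks elided for brevity):

  // Stage a host value in fine-grained memory the device may access, then
  // copy it over with the helper, as the loader does for the clock frequency.
  uint64_t value = 42;
  void *staging;
  hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                               /*flags=*/0, &staging);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, staging);
  *static_cast<uint64_t *>(staging) = value;
  hsa_memcpy(device_dst, dev_agent, staging, host_agent, sizeof(uint64_t));
  hsa_amd_memory_pool_free(staging);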