diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1160,6 +1160,8 @@ * ``LIBOMPTARGET_AMDGPU_TEAMS_PER_CU`` * ``LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES`` * ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS`` +* ``LIBOMPTARGET_AMDGPU_KERNEL_BUSYWAIT`` +* ``LIBOMPTARGET_AMDGPU_DATA_BUSYWAIT`` The environment variables ``LIBOMPTARGET_SHARED_MEMORY_SIZE``, ``LIBOMPTARGET_STACK_SIZE`` and ``LIBOMPTARGET_HEAP_SIZE`` are described in @@ -1238,6 +1240,22 @@ streams. More HSA signals will be created dynamically throughout the execution if needed. The default value is ``64``. +LIBOMPTARGET_AMDGPU_KERNEL_BUSYWAIT +""""""""""""""""""""""""""""""""""" + +This environment variable controls the timeout hint in microseconds for the +HSA wait state within the AMDGPU plugin in kernel launches. A value larger +than ``0`` may reduce the latency within the HSA runtime when waiting for +the signal to complete. The default value is ``0``. + +LIBOMPTARGET_AMDGPU_DATA_BUSYWAIT +""""""""""""""""""""""""""""""""""" + +This environment variable controls the timeout hint in microseconds for the +HSA wait state within the AMDGPU plugin for data transfers. A value larger +than ``0`` may reduce the latency within the HSA runtime when waiting for +the signal to complete. The default value is ``0``. + .. _remote_offloading_plugin: Remote Offloading Plugin: diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -511,8 +511,14 @@ } /// Wait until the signal gets a zero value. - Error wait() const { - // TODO: Is it better to use busy waiting or blocking the thread? 
+ Error wait(const uint64_t ActiveTimeout = 0) const { + if (ActiveTimeout) { + hsa_signal_value_t Got = 1; + Got = hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, + ActiveTimeout, HSA_WAIT_STATE_ACTIVE); + if (Got == 0) + return Plugin::success(); + } while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) ; @@ -890,6 +896,9 @@ /// Return the last valid slot on the stream. uint32_t last() const { return size() - 1; } + /// Timeout hint for HSA actively waiting for signal value to change + uint64_t KernelBusyWaitTics; + /// Consume one slot from the stream. Since the stream uses signals on demand /// and releases them once the slot is no longer used, the function requires /// an idle signal for the new consumed slot. @@ -1247,7 +1256,7 @@ return Plugin::success(); // Wait until all previous operations on the stream have completed. - if (auto Err = Slots[last()].Signal->wait()) + if (auto Err = Slots[last()].Signal->wait(KernelBusyWaitTics)) return Err; // Reset the stream and perform all pending post actions. 
@@ -1555,6 +1564,8 @@ 1 * 1024 * 1024), // 1MB OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS", 64), + OMPX_KernelBusyWait("LIBOMPTARGET_AMDGPU_KERNEL_BUSYWAIT", 0), + OMPX_DataBusyWait("LIBOMPTARGET_AMDGPU_DATA_BUSYWAIT", 0), AMDGPUStreamManager(*this), AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice), Queues() {} @@ -1679,6 +1690,9 @@ return Plugin::success(); } + uint64_t getDataBusyWaitTics() const { return OMPX_DataBusyWait; } + uint64_t getKernelBusyWaitTics() const { return OMPX_KernelBusyWait; } + Expected> doJITPostProcessing(std::unique_ptr MB) const override { @@ -1941,7 +1955,7 @@ Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) return Err; - if (auto Err = Signal.wait()) + if (auto Err = Signal.wait(getDataBusyWaitTics())) return Err; if (auto Err = Signal.deinit()) @@ -1998,7 +2012,7 @@ Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) return Err; - if (auto Err = Signal.wait()) + if (auto Err = Signal.wait(getDataBusyWaitTics())) return Err; if (auto Err = Signal.deinit()) @@ -2173,6 +2187,12 @@ /// will be created. UInt32Envar OMPX_InitialNumSignals; + /// Environment variables to set the time to wait in active state before + /// switching to blocked state. The default 0 goes directly to blocked state. + /// The unit for these variables is microseconds. + UInt32Envar OMPX_KernelBusyWait; + UInt32Envar OMPX_DataBusyWait; + /// Stream manager for AMDGPU streams. AMDGPUStreamManagerTy AMDGPUStreamManager; @@ -2267,7 +2287,8 @@ : Agent(Device.getAgent()), Queue(Device.getNextQueue()), SignalManager(Device.getSignalManager()), // Initialize the std::deque with some empty positions. - Slots(32), NextSlot(0), SyncCycle(0) {} + Slots(32), NextSlot(0), SyncCycle(0), + KernelBusyWaitTics(Device.getKernelBusyWaitTics()) {} /// Class implementing the AMDGPU-specific functionalities of the global /// handler.