diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1160,6 +1160,7 @@ * ``LIBOMPTARGET_AMDGPU_TEAMS_PER_CU`` * ``LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES`` * ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS`` +* ``LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT`` The environment variables ``LIBOMPTARGET_SHARED_MEMORY_SIZE``, ``LIBOMPTARGET_STACK_SIZE`` and ``LIBOMPTARGET_HEAP_SIZE`` are described in @@ -1238,6 +1239,14 @@ streams. More HSA signals will be created dynamically throughout the execution if needed. The default value is ``64``. +LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT +""""""""""""""""""""""""""""""""""" + +This environment variable controls the timeout hint in microseconds for the +HSA wait state within the AMDGPU plugin. For the duration of this value +the HSA runtime may busy wait. This can reduce overall latency. +The default value is ``2000000``. + .. _remote_offloading_plugin: Remote Offloading Plugin: diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -511,8 +511,14 @@ } /// Wait until the signal gets a zero value. - Error wait() const { - // TODO: Is it better to use busy waiting or blocking the thread? + Error wait(const uint64_t ActiveTimeout = 0) const { + if (ActiveTimeout) { + hsa_signal_value_t Got = 1; + Got = hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, + ActiveTimeout, HSA_WAIT_STATE_ACTIVE); + if (Got == 0) + return Plugin::success(); + } while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) ; @@ -890,6 +896,9 @@ /// Return the last valid slot on the stream. 
uint32_t last() const { return size() - 1; } + /// Timeout hint for HSA actively waiting for signal value to change + const uint64_t StreamBusyWaitMicroseconds; + /// Consume one slot from the stream. Since the stream uses signals on demand /// and releases them once the slot is no longer used, the function requires /// an idle signal for the new consumed slot. @@ -1247,7 +1256,7 @@ return Plugin::success(); // Wait until all previous operations on the stream have completed. - if (auto Err = Slots[last()].Signal->wait()) + if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds)) return Err; // Reset the stream and perform all pending post actions. @@ -1555,6 +1564,7 @@ 1 * 1024 * 1024), // 1MB OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS", 64), + OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000), AMDGPUStreamManager(*this), AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice), Queues() {} @@ -1679,6 +1689,10 @@ return Plugin::success(); } + const uint64_t getStreamBusyWaitMicroseconds() const { + return OMPX_StreamBusyWait; + } + Expected> doJITPostProcessing(std::unique_ptr MB) const override { @@ -1941,7 +1955,7 @@ Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) return Err; - if (auto Err = Signal.wait()) + if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) return Err; if (auto Err = Signal.deinit()) @@ -1998,7 +2012,7 @@ Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) return Err; - if (auto Err = Signal.wait()) + if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) return Err; if (auto Err = Signal.deinit()) @@ -2173,6 +2187,12 @@ /// will be created. UInt32Envar OMPX_InitialNumSignals; + /// Environment variables to set the time to wait in active state before + /// switching to blocked state. The default 2000000 busywaits for 2 seconds + /// before going into a blocking HSA wait state. 
The unit for this variable + /// is microseconds. + UInt32Envar OMPX_StreamBusyWait; + /// Stream manager for AMDGPU streams. AMDGPUStreamManagerTy AMDGPUStreamManager; @@ -2267,7 +2287,8 @@ : Agent(Device.getAgent()), Queue(Device.getNextQueue()), SignalManager(Device.getSignalManager()), // Initialize the std::deque with some empty positions. - Slots(32), NextSlot(0), SyncCycle(0) {} + Slots(32), NextSlot(0), SyncCycle(0), + StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {} /// Class implementing the AMDGPU-specific functionalities of the global /// handler.