diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -511,8 +511,14 @@ } /// Wait until the signal gets a zero value. - Error wait() const { - // TODO: Is it better to use busy waiting or blocking the thread? + Error wait(const uint64_t ActiveTimeout = 0) const { + if (ActiveTimeout) { // Non-zero: try an active (busy) wait first. + hsa_signal_value_t Got = 1; + Got = hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, + ActiveTimeout, HSA_WAIT_STATE_ACTIVE); + if (Got == 0) // Signal reached zero; skip the blocking wait. + return Plugin::success(); + } while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) ; @@ -1236,6 +1242,8 @@ return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"); } + uint64_t KernelBusyWaitTics; // Active-wait timeout, set from AMDGPUDeviceTy + /// Synchronize with the stream. The current thread waits until all operations /// are finalized and it performs the pending post actions (i.e., releasing /// intermediate buffers). @@ -1247,7 +1255,7 @@ return Plugin::success(); // Wait until all previous operations on the stream have completed. - if (auto Err = Slots[last()].Signal->wait()) + if (auto Err = Slots[last()].Signal->wait(KernelBusyWaitTics)) return Err; // Reset the stream and perform all pending post actions. @@ -1555,12 +1563,17 @@ 1 * 1024 * 1024), // 1MB OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS", 64), + OMPX_KernelBusyWait("LIBOMPTARGET_AMDGPU_KERNEL_BUSYWAIT", 0), + OMPX_DataBusyWait("LIBOMPTARGET_AMDGPU_DATA_BUSYWAIT", 0), AMDGPUStreamManager(*this), AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice), Queues() {} ~AMDGPUDeviceTy() {} + uint64_t KernelBusyWaitTics; // Assigned from OMPX_KernelBusyWait. + uint64_t DataBusyWaitTics; // Assigned from OMPX_DataBusyWait. + /// Initialize the device, its resources and get its properties. 
Error initImpl(GenericPluginTy &Plugin) override { // First setup all the memory pools. @@ -1622,6 +1635,9 @@ const uint32_t NumQueues = std::min(OMPX_NumQueues.get(), MaxQueues); const uint32_t QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize); + KernelBusyWaitTics = OMPX_KernelBusyWait; + DataBusyWaitTics = OMPX_DataBusyWait; + // Construct and initialize each device queue. Queues = std::vector(NumQueues); for (AMDGPUQueueTy &Queue : Queues) @@ -1679,6 +1695,9 @@ return Plugin::success(); } + uint64_t getDataBusyWaitTics() const { return DataBusyWaitTics; } + uint64_t getKernelBusyWaitTics() const { return KernelBusyWaitTics; } + Expected> doJITPostProcessing(std::unique_ptr MB) const override { @@ -1941,7 +1960,7 @@ Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) return Err; - if (auto Err = Signal.wait()) + if (auto Err = Signal.wait(getDataBusyWaitTics())) return Err; if (auto Err = Signal.deinit()) @@ -1998,7 +2017,7 @@ Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) return Err; - if (auto Err = Signal.wait()) + if (auto Err = Signal.wait(getDataBusyWaitTics())) return Err; if (auto Err = Signal.deinit()) @@ -2173,6 +2192,11 @@ /// will be created. UInt32Envar OMPX_InitialNumSignals; + /// Environment variables to set the time to wait in active state before + /// switching to blocked state. The default 0 goes directly to blocked state. + UInt32Envar OMPX_KernelBusyWait; // Applied when synchronizing a stream. + UInt32Envar OMPX_DataBusyWait; // Applied after async memory copies. + /// Stream manager for AMDGPU streams. AMDGPUStreamManagerTy AMDGPUStreamManager; @@ -2267,7 +2291,8 @@ : Agent(Device.getAgent()), Queue(Device.getNextQueue()), SignalManager(Device.getSignalManager()), // Initialize the std::deque with some empty positions. - Slots(32), NextSlot(0), SyncCycle(0) {} + Slots(32), NextSlot(0), SyncCycle(0), + KernelBusyWaitTics(Device.getKernelBusyWaitTics()) {} /// Class implementing the AMDGPU-specific functionalities of the global /// handler.