diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -1175,6 +1175,7 @@
 * ``LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS``
 * ``LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES``
 * ``LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE``
+* ``LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING``
 * ``LIBOMPTARGET_AMDGPU_TEAMS_PER_CU``
 * ``LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES``
 * ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS``
@@ -1231,6 +1232,17 @@
 It is also the number of AQL packets that can be pushed into each queue without
 waiting the driver to process them. The default value is ``512``.
 
+LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING
+"""""""""""""""""""""""""""""""""""""""""""
+
+This environment variable controls whether idle HSA queues are preferentially
+assigned to streams, for example when a stream is requested for a kernel
+launch. Should all queues be busy, a new queue is initialized and returned,
+until the configured maximum is reached; beyond that, the least utilized queue
+is selected. If tracking is disabled, a new HSA queue is initialized each time
+a stream is requested, regardless of utilization, and queues are selected in
+round robin fashion. The default value is ``true``.
+
 .. _libomptarget_amdgpu_teams_per_cu:
 
 LIBOMPTARGET_AMDGPU_TEAMS_PER_CU
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -594,13 +594,16 @@
     return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
   }
 
-  /// Returns if this queue is considered busy
-  bool isBusy() const { return NumUsers > 0; }
+  /// Returns the number of streams this queue is currently assigned to.
+  uint32_t getUserCount() const { return NumUsers; }
 
-  /// Decrement user count of the queue object
+  /// Returns if the underlying HSA queue is initialized.
+  bool isInitialized() { return Queue != nullptr; }
+
+  /// Decrement user count of the queue object.
   void removeUser() { --NumUsers; }
 
-  /// Increase user count of the queue object
+  /// Increase user count of the queue object.
   void addUser() { ++NumUsers; }
 
   /// Push a kernel launch to the queue. The kernel launch requires an output
@@ -784,8 +787,9 @@
   /// atomic operations. We can further investigate it if this is a bottleneck.
   std::mutex Mutex;
 
-  /// Indicates that the queue is busy when > 0
-  int NumUsers;
+  /// The number of streams this queue is currently assigned to. A queue is
+  /// considered idle when this is zero, otherwise busy.
+  uint32_t NumUsers;
 };
 
 /// Struct that implements a stream of asynchronous operations for AMDGPU
@@ -1451,7 +1455,9 @@
   using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
 
   AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
-      : GenericDeviceResourceManagerTy(Device), NextQueue(0), Agent(HSAAgent) {}
+      : GenericDeviceResourceManagerTy(Device),
+        OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
+        NextQueue(0), Agent(HSAAgent) {}
 
   Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
     Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
@@ -1493,35 +1499,39 @@
 private:
   /// Search for and assign an prefereably idle queue to the given Stream. If
-  /// there is no queue without current users, resort to round robin selection.
+  /// there is no queue without current users, choose the queue with the lowest
+  /// user count. If tracking is disabled, use round robin selection.
   inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
-    uint32_t StartIndex = NextQueue % MaxNumQueues;
-    AMDGPUQueueTy *Q = nullptr;
-
-    for (int i = 0; i < MaxNumQueues; ++i) {
-      Q = &Queues[StartIndex++];
-      if (StartIndex == MaxNumQueues)
-        StartIndex = 0;
-
-      if (Q->isBusy())
-        continue;
-      else {
-        if (auto Err = Q->init(Agent, QueueSize))
-          return Err;
-
-        Q->addUser();
-        Stream->Queue = Q;
-        return Plugin::success();
+    // Start from zero when tracking utilization, otherwise use round robin.
+    uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
+
+    if (OMPX_QueueTracking) {
+      // Find the least used queue.
+      for (uint32_t I = 0; I < MaxNumQueues; ++I) {
+        // Early exit when an initialized queue is idle.
+        if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
+          Index = I;
+          break;
+        }
+
+        // Update the least used queue.
+        if (Queues[Index].getUserCount() > Queues[I].getUserCount())
+          Index = I;
       }
     }
 
-    // All queues busy: Round robin (StartIndex has the initial value again)
-    Queues[StartIndex].addUser();
-    Stream->Queue = &Queues[StartIndex];
-    ++NextQueue;
+    // Make sure the queue is initialized, then add a user and assign it.
+    if (auto Err = Queues[Index].init(Agent, QueueSize))
+      return Err;
+    Queues[Index].addUser();
+    Stream->Queue = &Queues[Index];
+
     return Plugin::success();
   }
 
+  /// Envar for controlling the tracking of busy HSA queues.
+  BoolEnvar OMPX_QueueTracking;
+
   /// The next queue index to use for round robin selection.
   uint32_t NextQueue;
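
For reference, the selection policy added above can be illustrated outside the plugin. The following is a minimal standalone sketch, not part of the patch: Queue, selectQueueIndex, and QueueTracking are hypothetical stand-ins for the plugin's AMDGPUQueueTy, assignNextQueue, and OMPX_QueueTracking, and the lazy HSA queue creation is reduced to a boolean flag.

// Standalone sketch of the queue selection policy; names are simplified
// stand-ins, not the plugin's actual types.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Queue {
  bool Initialized = false; // Stands in for lazy HSA queue creation.
  uint32_t NumUsers = 0;    // Streams currently assigned to this queue.
};

// Pick the queue a new stream should use. With tracking enabled, prefer an
// initialized idle queue, otherwise the queue with the lowest user count
// (an uninitialized queue has zero users, so fresh queues are created until
// the pool is exhausted). With tracking disabled, use plain round robin.
uint32_t selectQueueIndex(std::vector<Queue> &Queues, bool QueueTracking,
                          uint32_t &NextQueue) {
  uint32_t Index =
      QueueTracking ? 0 : NextQueue++ % static_cast<uint32_t>(Queues.size());

  if (QueueTracking) {
    for (uint32_t I = 0; I < Queues.size(); ++I) {
      if (Queues[I].Initialized && Queues[I].NumUsers == 0) {
        Index = I; // Early exit: an initialized, idle queue is ideal.
        break;
      }
      if (Queues[I].NumUsers < Queues[Index].NumUsers)
        Index = I; // Track the least used queue seen so far.
    }
  }

  Queues[Index].Initialized = true; // Stands in for the lazy init() call.
  ++Queues[Index].NumUsers;
  return Index;
}

int main() {
  std::vector<Queue> Queues(2);
  uint32_t NextQueue = 0;

  // With tracking, the second stream avoids the queue the first one uses.
  uint32_t A = selectQueueIndex(Queues, /*QueueTracking=*/true, NextQueue);
  uint32_t B = selectQueueIndex(Queues, /*QueueTracking=*/true, NextQueue);
  assert(A != B);

  // Once stream A finishes, its queue is idle again and is preferred next.
  --Queues[A].NumUsers;
  uint32_t C = selectQueueIndex(Queues, /*QueueTracking=*/true, NextQueue);
  assert(C == A);

  std::printf("A=%u B=%u C=%u\n", A, B, C);
  return 0;
}

In this sketch, as in the patch, an uninitialized queue counts as having zero users, so with tracking enabled the pool still grows up to the configured maximum before busy queues start to be shared.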