diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1175,6 +1175,7 @@ * ``LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS`` * ``LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES`` * ``LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE`` +* ``LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING`` * ``LIBOMPTARGET_AMDGPU_TEAMS_PER_CU`` * ``LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES`` * ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS`` @@ -1231,6 +1232,17 @@ It is also the number of AQL packets that can be pushed into each queue without waiting the driver to process them. The default value is ``512``. +LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING +""""""""""""""""""""""""""""""""""""""""""" + +This environment variable controls whether idle HSA queues will be preferentially +assigned to streams, for example when they are requested for a kernel launch. +Should all queues be considered busy, a new queue is initialized and returned, +until we reach the set maximum. Otherwise, we will resort to round robin +selection. If this is disabled, each time a stream is requested a new HSA queue +will be initialized, regardless of queue utilization. The default value is +``true``. + .. 
_libomptarget_amdgpu_teams_per_cu: LIBOMPTARGET_AMDGPU_TEAMS_PER_CU diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1461,12 +1461,15 @@ using ResourcePoolTy = GenericDeviceResourceManagerTy; AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) - : GenericDeviceResourceManagerTy(Device), NextQueue(0), Agent(HSAAgent) {} + : GenericDeviceResourceManagerTy(Device), + OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true), + NextQueue(0), Agent(HSAAgent) {} Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) { Queues = std::vector(NumHSAQueues); QueueSize = HSAQueueSize; MaxNumQueues = NumHSAQueues; + OMPX_QueueTracking = OMPX_QueueTracking.get(); // Initialize one queue eagerly if (auto Err = Queues.front().init(Agent, QueueSize)) return Err; @@ -1508,14 +1511,15 @@ uint32_t StartIndex = NextQueue % MaxNumQueues; AMDGPUQueueTy *Q = nullptr; - for (int i = 0; i < MaxNumQueues; ++i) { - Q = &Queues[StartIndex++]; - if (StartIndex == MaxNumQueues) - StartIndex = 0; + if (OMPX_QueueTracking || NextQueue < MaxNumQueues) + for (int i = 0; i < MaxNumQueues; ++i) { + Q = &Queues[StartIndex++]; + if (StartIndex == MaxNumQueues) + StartIndex = 0; + + if (OMPX_QueueTracking && Q->isBusy()) + continue; - if (Q->isBusy()) - continue; - else { if (auto Err = Q->init(Agent, QueueSize)) return Err; @@ -1523,7 +1527,6 @@ Stream->Queue = Q; return Plugin::success(); } - } // All queues busy: Round robin (StartIndex has the initial value again) Queues[StartIndex].addUser(); @@ -1532,6 +1535,9 @@ return Plugin::success(); } + /// Envar for controlling the tracking of busy HSA queues. + BoolEnvar OMPX_QueueTracking; + /// The next queue index to use for round robin selection. uint32_t NextQueue;