diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1175,6 +1175,7 @@ * ``LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS`` * ``LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES`` * ``LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE`` +* ``LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING`` * ``LIBOMPTARGET_AMDGPU_TEAMS_PER_CU`` * ``LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES`` * ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS`` @@ -1231,6 +1232,17 @@ It is also the number of AQL packets that can be pushed into each queue without waiting the driver to process them. The default value is ``512``. +LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING +""""""""""""""""""""""""""""""""""""""""""" + +This environment variable controls whether idle HSA queues will be preferentially +assigned to streams, for example when they are requested for a kernel launch. +Should all queues be considered busy, a new queue is initialized and returned, +until we reach the set maximum. Otherwise, we will resort to round robin +selection. If this is disabled, each time a stream is requested a new HSA queue +will be initialized, regardless of queue utilization. The default value is +``true``. + .. 
_libomptarget_amdgpu_teams_per_cu: LIBOMPTARGET_AMDGPU_TEAMS_PER_CU diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1461,12 +1461,15 @@ using ResourcePoolTy = GenericDeviceResourceManagerTy; AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent) - : GenericDeviceResourceManagerTy(Device), NextQueue(0), Agent(HSAAgent) {} + : GenericDeviceResourceManagerTy(Device), + OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true), + NextQueue(0), Agent(HSAAgent) {} Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) { Queues = std::vector(NumHSAQueues); QueueSize = HSAQueueSize; MaxNumQueues = NumHSAQueues; + OMPX_QueueTracking = OMPX_QueueTracking.get(); // Initialize one queue eagerly if (auto Err = Queues.front().init(Agent, QueueSize)) return Err; @@ -1508,14 +1511,15 @@ uint32_t StartIndex = NextQueue % MaxNumQueues; AMDGPUQueueTy *Q = nullptr; - for (int i = 0; i < MaxNumQueues; ++i) { - Q = &Queues[StartIndex++]; - if (StartIndex == MaxNumQueues) - StartIndex = 0; + if (OMPX_QueueTracking || NextQueue < MaxNumQueues) + for (int i = 0; i < MaxNumQueues; ++i) { + Q = &Queues[StartIndex++]; + if (StartIndex == MaxNumQueues) + StartIndex = 0; + + if (OMPX_QueueTracking && Q->isBusy()) + continue; - if (Q->isBusy()) - continue; - else { if (auto Err = Q->init(Agent, QueueSize)) return Err; @@ -1523,7 +1527,6 @@ Stream->Queue = Q; return Plugin::success(); } - } // All queues busy: Round robin (StartIndex has the initial value again) Queues[StartIndex].addUser(); @@ -1532,6 +1535,9 @@ return Plugin::success(); } + /// Envar for controlling the tracking of busy HSA queues. + BoolEnvar OMPX_QueueTracking; + /// The next queue index to use for round robin selection. uint32_t NextQueue;