diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -349,7 +349,7 @@
   // cleanup without risking running outside of the lifetime of HSA
   const hsa_status_t S;
 
-  bool success() { return S == HSA_STATUS_SUCCESS; }
+  bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
   HSALifetime() : S(hsa_init()) {}
 
   ~HSALifetime() {
@@ -363,9 +363,63 @@
   }
 };
 
+// Handle scheduling of multiple hsa_queue's per device to
+// multiple threads (one scheduler per device)
+class HSAQueueScheduler {
+public:
+  HSAQueueScheduler() : current(0) {}
+
+  HSAQueueScheduler(const HSAQueueScheduler &) = delete;
+
+  HSAQueueScheduler(HSAQueueScheduler &&q) {
+    current = q.current.load();
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      HSAQueues[i] = q.HSAQueues[i];
+      q.HSAQueues[i] = nullptr;
+    }
+  }
+
+  // \return false if any HSA queue creation fails
+  bool CreateQueues(hsa_agent_t HSAAgent, uint32_t queue_size) {
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      hsa_queue_t *Q = nullptr;
+      hsa_status_t rc =
+          hsa_queue_create(HSAAgent, queue_size, HSA_QUEUE_TYPE_MULTI,
+                           callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
+      if (rc != HSA_STATUS_SUCCESS) {
+        DP("Failed to create HSA queue %d\n", i);
+        return false;
+      }
+      HSAQueues[i] = Q;
+    }
+    return true;
+  }
+
+  ~HSAQueueScheduler() {
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      if (HSAQueues[i]) {
+        hsa_status_t err = hsa_queue_destroy(HSAQueues[i]);
+        if (err != HSA_STATUS_SUCCESS)
+          DP("Error destroying HSA queue");
+      }
+    }
+  }
+
+  // \return next queue to use for device
+  hsa_queue_t *Next() {
+    return HSAQueues[(current.fetch_add(1, std::memory_order_relaxed)) %
+                     NUM_QUEUES_PER_DEVICE];
+  }
+
+private:
+  // Number of queues per device
+  enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
+  hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
+  std::atomic<uint8_t> current;
+};
+
 /// Class containing all the device information
-class RTLDeviceInfoTy {
-  HSALifetime HSA; // First field => constructed first and destructed last
+class RTLDeviceInfoTy : HSALifetime {
   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
 
   struct QueueDeleter {
@@ -390,8 +444,7 @@
 
   // GPU devices
   std::vector<hsa_agent_t> HSAAgents;
-  std::vector<std::unique_ptr<hsa_queue_t, QueueDeleter>>
-      HSAQueues; // one per gpu
+  std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
 
   // CPUs
   std::vector<hsa_agent_t> CPUAgents;
@@ -658,7 +711,7 @@
     // 1 => tracing dispatch only
    // >1 => verbosity increase
 
-    if (!HSA.success()) {
+    if (!HSAInitSuccess()) {
      DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
      return;
    }
@@ -697,7 +750,7 @@
    }
 
    // Init the device info
-    HSAQueues.resize(NumberOfDevices);
+    HSAQueueSchedulers.reserve(NumberOfDevices);
    FuncGblEntries.resize(NumberOfDevices);
    ThreadsPerGroup.resize(NumberOfDevices);
    ComputeUnits.resize(NumberOfDevices);
@@ -740,15 +793,10 @@
      }
 
      {
-        hsa_queue_t *Q = nullptr;
-        hsa_status_t rc =
-            hsa_queue_create(HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI,
-                             callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
-        if (rc != HSA_STATUS_SUCCESS) {
-          DP("Failed to create HSA queue %d\n", i);
+        HSAQueueScheduler QSched;
+        if (!QSched.CreateQueues(HSAAgents[i], queue_size))
          return;
-        }
-        HSAQueues[i].reset(Q);
+        HSAQueueSchedulers.emplace_back(std::move(QSched));
      }
 
      deviceStateStore[i] = {nullptr, 0};
@@ -776,7 +824,7 @@
 
  ~RTLDeviceInfoTy() {
    DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
-    if (!HSA.success()) {
+    if (!HSAInitSuccess()) {
      // Then none of these can have been set up and they can't be torn down
      return;
    }
@@ -1113,7 +1161,7 @@
 
  // Run on the device.
  {
-    hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();
+    hsa_queue_t *queue = DeviceInfo.HSAQueueSchedulers[device_id].Next();
    if (!queue) {
      return OFFLOAD_FAIL;
    }
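
For readers unfamiliar with the pattern, HSAQueueScheduler::Next() is simply an atomic counter reduced modulo the queue count, so consecutive dispatches rotate across a device's queues. Below is a minimal standalone sketch of the same round-robin selection, outside the patch; FakeQueue, RoundRobinScheduler and kNumQueues are illustrative names standing in for hsa_queue_t and NUM_QUEUES_PER_DEVICE, not identifiers from rtl.cpp.

// Standalone sketch (not part of rtl.cpp): round-robin queue selection,
// mirroring what HSAQueueScheduler::Next() does with real HSA queues.
#include <array>
#include <atomic>
#include <cstdint>
#include <cstdio>

struct FakeQueue { // stands in for hsa_queue_t
  int id;
};

class RoundRobinScheduler {
public:
  // Relaxed fetch_add is enough here: queue selection only needs a
  // distinct-ish index, not any ordering with other memory operations.
  FakeQueue *next() {
    return &queues[current.fetch_add(1, std::memory_order_relaxed) %
                   kNumQueues];
  }

private:
  static constexpr uint8_t kNumQueues = 4; // mirrors NUM_QUEUES_PER_DEVICE
  std::array<FakeQueue, kNumQueues> queues{{{0}, {1}, {2}, {3}}};
  // uint8_t wrap-around is benign because kNumQueues divides 256
  std::atomic<uint8_t> current{0};
};

int main() {
  RoundRobinScheduler sched;
  for (int i = 0; i < 8; i++) // cycles through queues 0 1 2 3 0 1 2 3
    std::printf("dispatch %d -> queue %d\n", i, sched.next()->id);
}

The count of four queues per device matches NUM_QUEUES_PER_DEVICE in the patch; the exact number is a tuning choice rather than anything required by the HSA API.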