diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -349,7 +349,7 @@
   // cleanup without risking running outside of the lifetime of HSA
   const hsa_status_t S;
 
-  bool success() { return S == HSA_STATUS_SUCCESS; }
+  bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
   HSALifetime() : S(hsa_init()) {}
 
   ~HSALifetime() {
@@ -363,9 +363,63 @@
   }
 };
 
+// Handle scheduling of multiple hsa_queue's per device to
+// multiple threads (one scheduler per device)
+class HSAQueueScheduler {
+public:
+  HSAQueueScheduler() : current(0) {}
+
+  HSAQueueScheduler(const HSAQueueScheduler &) = delete;
+
+  HSAQueueScheduler(HSAQueueScheduler &&q) {
+    current = q.current.load();
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      HSAQueues[i] = q.HSAQueues[i];
+      q.HSAQueues[i] = nullptr;
+    }
+  }
+
+  // \return false if any HSA queue creation fails
+  bool CreateQueues(hsa_agent_t HSAAgent, uint32_t queue_size) {
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      hsa_queue_t *Q = nullptr;
+      hsa_status_t rc =
+          hsa_queue_create(HSAAgent, queue_size, HSA_QUEUE_TYPE_MULTI,
+                           callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
+      if (rc != HSA_STATUS_SUCCESS) {
+        DP("Failed to create HSA queue %d\n", i);
+        return false;
+      }
+      HSAQueues[i] = Q;
+    }
+    return true;
+  }
+
+  ~HSAQueueScheduler() {
+    for (uint8_t i = 0; i < NUM_QUEUES_PER_DEVICE; i++) {
+      if (HSAQueues[i]) {
+        hsa_status_t err = hsa_queue_destroy(HSAQueues[i]);
+        if (err != HSA_STATUS_SUCCESS)
+          DP("Error destroying HSA queue");
+      }
+    }
+  }
+
+  // \return next queue to use for device
+  hsa_queue_t *Next() {
+    return HSAQueues[(current.fetch_add(1, std::memory_order_relaxed)) %
+                     NUM_QUEUES_PER_DEVICE];
+  }
+
+private:
+  // Number of queues per device
+  enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
+  hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
+  std::atomic<uint8_t> current;
+};
+
 /// Class containing all the device information
-class RTLDeviceInfoTy {
-  HSALifetime HSA; // First field => constructed first and destructed last
+class RTLDeviceInfoTy : HSALifetime {
   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
 
   struct QueueDeleter {
@@ -390,8 +444,7 @@
 
   // GPU devices
   std::vector<hsa_agent_t> HSAAgents;
-  std::vector<std::unique_ptr<hsa_queue_t, QueueDeleter>>
-      HSAQueues; // one per gpu
+  std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
 
   // CPUs
   std::vector<hsa_agent_t> CPUAgents;
@@ -658,7 +711,7 @@
     // 1 => tracing dispatch only
    // >1 => verbosity increase
 
-    if (!HSA.success()) {
+    if (!HSAInitSuccess()) {
      DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
      return;
    }
@@ -697,7 +750,7 @@
    }
 
    // Init the device info
-    HSAQueues.resize(NumberOfDevices);
+    HSAQueueSchedulers.reserve(NumberOfDevices);
    FuncGblEntries.resize(NumberOfDevices);
    ThreadsPerGroup.resize(NumberOfDevices);
    ComputeUnits.resize(NumberOfDevices);
@@ -740,15 +793,10 @@
      }
 
      {
-        hsa_queue_t *Q = nullptr;
-        hsa_status_t rc =
-            hsa_queue_create(HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI,
-                             callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
-        if (rc != HSA_STATUS_SUCCESS) {
-          DP("Failed to create HSA queue %d\n", i);
+        HSAQueueScheduler QSched;
+        if (!QSched.CreateQueues(HSAAgents[i], queue_size))
          return;
-        }
-        HSAQueues[i].reset(Q);
+        HSAQueueSchedulers.emplace_back(std::move(QSched));
      }
 
      deviceStateStore[i] = {nullptr, 0};
@@ -776,7 +824,7 @@
 
  ~RTLDeviceInfoTy() {
    DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
-    if (!HSA.success()) {
+    if (!HSAInitSuccess()) {
      // Then none of these can have been set up and they can't be torn down
      return;
    }
@@ -1113,7 +1161,7 @@
 
  // Run on the device.
  {
-    hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();
+    hsa_queue_t *queue = DeviceInfo.HSAQueueSchedulers[device_id].Next();
    if (!queue) {
      return OFFLOAD_FAIL;
    }
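
For readers unfamiliar with the pattern, HSAQueueScheduler::Next() is simply an atomic counter reduced modulo the queue count, so consecutive dispatches rotate across a device's queues. Below is a minimal standalone sketch of the same round-robin selection, outside the patch; FakeQueue, RoundRobinScheduler and kNumQueues are illustrative names standing in for hsa_queue_t and NUM_QUEUES_PER_DEVICE, not identifiers from rtl.cpp.

// Standalone sketch (not part of rtl.cpp): round-robin queue selection,
// mirroring what HSAQueueScheduler::Next() does with real HSA queues.
#include <array>
#include <atomic>
#include <cstdint>
#include <cstdio>

struct FakeQueue { // stands in for hsa_queue_t
  int id;
};

class RoundRobinScheduler {
public:
  // Relaxed fetch_add is enough here: queue selection only needs a
  // distinct-ish index, not any ordering with other memory operations.
  FakeQueue *next() {
    return &queues[current.fetch_add(1, std::memory_order_relaxed) %
                   kNumQueues];
  }

private:
  static constexpr uint8_t kNumQueues = 4; // mirrors NUM_QUEUES_PER_DEVICE
  std::array<FakeQueue, kNumQueues> queues{{{0}, {1}, {2}, {3}}};
  // uint8_t wrap-around is benign because kNumQueues divides 256
  std::atomic<uint8_t> current{0};
};

int main() {
  RoundRobinScheduler sched;
  for (int i = 0; i < 8; i++) // cycles through queues 0 1 2 3 0 1 2 3
    std::printf("dispatch %d -> queue %d\n", i, sched.next()->id);
}

The count of four queues per device matches NUM_QUEUES_PER_DEVICE in the patch; the exact number is a tuning choice rather than anything required by the HSA API.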