Index: openmp/docs/design/Runtimes.rst
===================================================================
--- openmp/docs/design/Runtimes.rst
+++ openmp/docs/design/Runtimes.rst
@@ -1188,7 +1188,7 @@
 operations (e.g., kernel launches and memory copies) that are executed
 sequentially. Parallelism is achieved by featuring multiple streams. The
 ``libomptarget`` leverages streams to exploit parallelism between plugin
-operations. The default value is ``32``.
+operations. The default value is ``1``; more streams are created as needed.
 
 LIBOMPTARGET_NUM_INITIAL_EVENTS
 """""""""""""""""""""""""""""""
@@ -1196,7 +1196,8 @@
 This environment variable sets the number of pre-created events in the
 plugin (if supported) at initialization. More events will be created
 dynamically throughout the execution if needed. An event is used to synchronize
-a stream with another efficiently. The default value is ``32``.
+a stream with another efficiently. The default value is ``1``; more events are
+created as needed.
 
 LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS
 """""""""""""""""""""""""""""""""""""
Index: openmp/libomptarget/include/Utilities.h
===================================================================
--- openmp/libomptarget/include/Utilities.h
+++ openmp/libomptarget/include/Utilities.h
@@ -83,6 +83,12 @@
     }
   }
 
+  Envar &operator=(const Ty &V) {
+    Data = V;
+    Initialized = true;
+    return *this;
+  }
+
   /// Get the definitive value.
   const Ty &get() const {
     // Throw a runtime error in case this envar is not initialized.
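The new Envar::operator= exists so a plugin can overwrite a parsed environment value in place, for example to clamp it against a device limit before use, which is exactly what the AMDGPU changes below do with the queue count. The following is a minimal, self-contained sketch of that pattern; SimpleEnvar, MY_NUM_QUEUES, and MaxQueues are made-up names for illustration, not the actual Utilities.h implementation.

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <string>

// Hypothetical stand-in for the Envar wrapper in Utilities.h.
template <typename Ty> class SimpleEnvar {
  Ty Data;
  bool Initialized = false;

public:
  SimpleEnvar(const char *Name, Ty Default) : Data(Default), Initialized(true) {
    if (const char *Val = std::getenv(Name))
      Data = static_cast<Ty>(std::stoll(Val));
  }

  // Same idea as the operator= added above: overwrite the value in place and
  // keep the envar marked as initialized.
  SimpleEnvar &operator=(const Ty &V) {
    Data = V;
    Initialized = true;
    return *this;
  }

  const Ty &get() const { return Data; }
};

int main() {
  // MY_NUM_QUEUES and MaxQueues are illustrative; the AMDGPU plugin clamps
  // its queue-count envar against the device's limit in the same way.
  constexpr uint32_t MaxQueues = 128;
  SimpleEnvar<uint32_t> NumQueues("MY_NUM_QUEUES", 4);
  NumQueues = std::max<uint32_t>(1, std::min(NumQueues.get(), MaxQueues));
  return NumQueues.get() >= 1 ? 0 : 1;
}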
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -581,8 +581,20 @@
     return Plugin::check(Status, "Error in hsa_queue_create: %s");
   }
 
+  /// If the queue is not initialized, do it now.
+  Error initLazy(hsa_agent_t Agent, int32_t QueueSize) {
+    // Lock the queue during the lazy initialization.
+    std::lock_guard<std::mutex> Lock(Mutex);
+    if (Queue)
+      return Plugin::success();
+    return init(Agent, QueueSize);
+  }
+
   /// Deinitialize the queue and destroy its resources.
   Error deinit() {
+    std::lock_guard<std::mutex> Lock(Mutex);
+    if (!Queue)
+      return Plugin::success();
     hsa_status_t Status = hsa_queue_destroy(Queue);
     return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
   }
@@ -599,6 +611,7 @@
     // the addition of other packets to the queue. The following piece of code
     // should be lightweight; do not block the thread, allocate memory, etc.
     std::lock_guard<std::mutex> Lock(Mutex);
+    assert(Queue && "Interacted with a non-initialized queue!");
 
     // Avoid defining the input dependency if already satisfied.
     if (InputSignal && !InputSignal->load())
@@ -647,6 +660,7 @@
                     const AMDGPUSignalTy *InputSignal2) {
     // Lock the queue during the packet publishing process.
     std::lock_guard<std::mutex> Lock(Mutex);
+    assert(Queue && "Interacted with a non-initialized queue!");
 
     // Push the barrier with the lock acquired.
     return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
@@ -1637,14 +1651,14 @@
       return Err;
 
     // Compute the number of queues and their size.
-    const uint32_t NumQueues = std::min(OMPX_NumQueues.get(), MaxQueues);
-    const uint32_t QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
+    OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
+    OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
 
     // Construct and initialize each device queue.
-    Queues = std::vector<AMDGPUQueueTy>(NumQueues);
-    for (AMDGPUQueueTy &Queue : Queues)
-      if (auto Err = Queue.init(Agent, QueueSize))
-        return Err;
+    Queues = std::vector<AMDGPUQueueTy>(OMPX_NumQueues);
+    // Initialize only the first queue eagerly; others are initialized lazily.
+    if (auto Err = Queues.front().init(Agent, OMPX_QueueSize))
+      return Err;
 
     // Initialize stream pool.
     if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams))
@@ -2354,12 +2368,22 @@
     });
   }
 
-  /// Get the next queue in a round-robin fashion.
+  /// Get the next queue in a round-robin fashion, including lazy initialization.
   AMDGPUQueueTy &getNextQueue() {
-    static std::atomic<uint32_t> NextQueue(0);
     uint32_t Current = NextQueue.fetch_add(1, std::memory_order_relaxed);
-    return Queues[Current % Queues.size()];
+    uint32_t Idx = Current % Queues.size();
+    auto &Queue = Queues[Idx];
+    // Only queue 0 has been initialized eagerly. Others might need lazy/late
+    // initialization.
+    if (Idx == 0)
+      return Queue;
+
+    if (auto Err = Queue.initLazy(Agent, OMPX_QueueSize)) {
+      // Gracefully handle late initialization errors, but report them anyway.
+      REPORT("%s\n", toString(std::move(Err)).data());
+      return Queues[0];
+    }
+    return Queue;
   }
 
 private:
@@ -2422,6 +2446,9 @@
 
   /// List of device packet queues.
   std::vector<AMDGPUQueueTy> Queues;
+
+  /// The next queue to be used for a new stream.
+  std::atomic<uint32_t> NextQueue = {0};
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -374,9 +374,9 @@
       // device initialization. These cannot be consulted until the device is
       // initialized correctly. We intialize them in GenericDeviceTy::init().
       OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
-      // By default, the initial number of streams and events are 32.
-      OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 32),
-      OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 32),
+      // By default, the initial number of streams and events is 1.
+      OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
+      OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
       DeviceId(DeviceId), GridValues(OMPGridValues),
       PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
       PinnedAllocs(*this) {}
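The queue handling above boils down to one pattern: allocate all queue slots up front, initialize only slot 0 eagerly, and let the round-robin selector lazily initialize any other slot under that slot's mutex, falling back to queue 0 if late initialization fails. The standalone sketch below demonstrates that pattern; LazyQueue and QueuePool are hypothetical stand-ins for the HSA-based AMDGPUQueueTy and the queue-owning part of AMDGPUDeviceTy, not the plugin implementation.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <vector>

// Hypothetical stand-in for AMDGPUQueueTy: a slot that may not be set up yet.
struct LazyQueue {
  std::mutex Mutex;
  bool Ready = false;

  // Stands in for AMDGPUQueueTy::init(); assume it can fail.
  bool init() {
    Ready = true;
    return true;
  }

  // Mirrors initLazy(): only the first thread to reach an uninitialized slot
  // pays the initialization cost; later callers return immediately.
  bool initLazy() {
    std::lock_guard<std::mutex> Lock(Mutex);
    if (Ready)
      return true;
    return init();
  }
};

// Hypothetical stand-in for the queue-owning part of AMDGPUDeviceTy.
class QueuePool {
  std::vector<LazyQueue> Queues;
  std::atomic<uint32_t> NextQueue{0};

public:
  // The real plugin clamps the count to at least one before reaching here.
  explicit QueuePool(size_t NumQueues) : Queues(NumQueues) {
    Queues.front().init(); // Only queue 0 is initialized eagerly.
  }

  LazyQueue &getNextQueue() {
    uint32_t Current = NextQueue.fetch_add(1, std::memory_order_relaxed);
    uint32_t Idx = Current % Queues.size();
    LazyQueue &Queue = Queues[Idx];
    if (Idx == 0)
      return Queue;
    if (!Queue.initLazy()) {
      // Report the failure but keep going with the eagerly created queue.
      std::fprintf(stderr, "lazy queue initialization failed, using queue 0\n");
      return Queues.front();
    }
    return Queue;
  }
};

int main() {
  QueuePool Pool(4);
  for (int I = 0; I < 8; ++I)
    (void)Pool.getNextQueue();
  return 0;
}

Keeping the selection counter atomic and the locking per slot leaves the common case, an already-initialized queue, cheap, while a failed late initialization degrades to sharing queue 0 instead of aborting the offload.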