Index: openmp/docs/design/Runtimes.rst
===================================================================
--- openmp/docs/design/Runtimes.rst
+++ openmp/docs/design/Runtimes.rst
@@ -1188,7 +1188,7 @@
 operations (e.g., kernel launches and memory copies) that are executed
 sequentially. Parallelism is achieved by featuring multiple streams. The
 ``libomptarget`` leverages streams to exploit parallelism between plugin
-operations. The default value is ``32``.
+operations. The default value is ``1``; more streams are created as needed.
 
 LIBOMPTARGET_NUM_INITIAL_EVENTS
 """""""""""""""""""""""""""""""
@@ -1196,7 +1196,8 @@
 This environment variable sets the number of pre-created events in the
 plugin (if supported) at initialization. More events will be created
 dynamically throughout the execution if needed. An event is used to synchronize
-a stream with another efficiently. The default value is ``32``.
+a stream with another efficiently. The default value is ``1``; more events are
+created as needed.
 
 LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS
 """""""""""""""""""""""""""""""""""""
Index: openmp/libomptarget/include/Utilities.h
===================================================================
--- openmp/libomptarget/include/Utilities.h
+++ openmp/libomptarget/include/Utilities.h
@@ -83,6 +83,12 @@
     }
   }
 
+  Envar &operator=(const Ty &V) {
+    Data = V;
+    Initialized = true;
+    return *this;
+  }
+
   /// Get the definitive value.
   const Ty &get() const {
     // Throw a runtime error in case this envar is not initialized.
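The new Envar::operator= exists so a plugin can overwrite a parsed environment value in place, for example to clamp it against a device limit before use, which is exactly what the AMDGPU changes below do with the queue count. The following is a minimal, self-contained sketch of that pattern; SimpleEnvar, MY_NUM_QUEUES, and MaxQueues are made-up names for illustration, not the actual Utilities.h implementation.

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <string>

// Hypothetical stand-in for the Envar wrapper in Utilities.h.
template <typename Ty> class SimpleEnvar {
  Ty Data;
  bool Initialized = false;

public:
  SimpleEnvar(const char *Name, Ty Default) : Data(Default), Initialized(true) {
    if (const char *Val = std::getenv(Name))
      Data = static_cast<Ty>(std::stoll(Val));
  }

  // Same idea as the operator= added above: overwrite the value in place and
  // keep the envar marked as initialized.
  SimpleEnvar &operator=(const Ty &V) {
    Data = V;
    Initialized = true;
    return *this;
  }

  const Ty &get() const { return Data; }
};

int main() {
  // MY_NUM_QUEUES and MaxQueues are illustrative; the AMDGPU plugin clamps
  // its queue-count envar against the device's limit in the same way.
  constexpr uint32_t MaxQueues = 128;
  SimpleEnvar<uint32_t> NumQueues("MY_NUM_QUEUES", 4);
  NumQueues = std::max<uint32_t>(1, std::min(NumQueues.get(), MaxQueues));
  return NumQueues.get() >= 1 ? 0 : 1;
}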
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -581,8 +581,20 @@
     return Plugin::check(Status, "Error in hsa_queue_create: %s");
   }
 
+  /// If the queue is not initialized, do it now.
+  Error initLazy(hsa_agent_t Agent, int32_t QueueSize) {
+    // Lock the queue during the lazy initialization.
+    std::lock_guard<std::mutex> Lock(Mutex);
+    if (Queue)
+      return Plugin::success();
+    return init(Agent, QueueSize);
+  }
+
   /// Deinitialize the queue and destroy its resources.
   Error deinit() {
+    std::lock_guard<std::mutex> Lock(Mutex);
+    if (!Queue)
+      return Plugin::success();
     hsa_status_t Status = hsa_queue_destroy(Queue);
     return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
   }
@@ -599,6 +611,7 @@
     // the addition of other packets to the queue. The following piece of code
     // should be lightweight; do not block the thread, allocate memory, etc.
     std::lock_guard<std::mutex> Lock(Mutex);
+    assert(Queue && "Interacted with a non-initialized queue!");
 
     // Avoid defining the input dependency if already satisfied.
     if (InputSignal && !InputSignal->load())
@@ -647,6 +660,7 @@
                     const AMDGPUSignalTy *InputSignal2) {
     // Lock the queue during the packet publishing process.
     std::lock_guard<std::mutex> Lock(Mutex);
+    assert(Queue && "Interacted with a non-initialized queue!");
 
     // Push the barrier with the lock acquired.
     return pushBarrierImpl(OutputSignal, InputSignal1, InputSignal2);
@@ -1637,14 +1651,14 @@
       return Err;
 
     // Compute the number of queues and their size.
-    const uint32_t NumQueues = std::min(OMPX_NumQueues.get(), MaxQueues);
-    const uint32_t QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
+    OMPX_NumQueues = std::max(1U, std::min(OMPX_NumQueues.get(), MaxQueues));
+    OMPX_QueueSize = std::min(OMPX_QueueSize.get(), MaxQueueSize);
 
     // Construct and initialize each device queue.
-    Queues = std::vector<AMDGPUQueueTy>(NumQueues);
-    for (AMDGPUQueueTy &Queue : Queues)
-      if (auto Err = Queue.init(Agent, QueueSize))
-        return Err;
+    Queues = std::vector<AMDGPUQueueTy>(OMPX_NumQueues);
+    // Initialize only the first queue eagerly; others are initialized lazily.
+    if (auto Err = Queues.front().init(Agent, OMPX_QueueSize))
+      return Err;
 
     // Initialize stream pool.
     if (auto Err = AMDGPUStreamManager.init(OMPX_InitialNumStreams))
@@ -2354,12 +2368,22 @@
     });
   }
 
-  /// Get the next queue in a round-robin fashion.
+  /// Get the next queue in a round-robin fashion, including lazy initialization.
   AMDGPUQueueTy &getNextQueue() {
-    static std::atomic<uint32_t> NextQueue(0);
     uint32_t Current = NextQueue.fetch_add(1, std::memory_order_relaxed);
-    return Queues[Current % Queues.size()];
+    uint32_t Idx = Current % Queues.size();
+    auto &Queue = Queues[Idx];
+    // Only queue 0 has been initialized eagerly. Others might need lazy/late
+    // initialization.
+    if (Idx == 0)
+      return Queue;
+
+    if (auto Err = Queue.initLazy(Agent, OMPX_QueueSize)) {
+      // Gracefully handle late initialization errors, but report them anyway.
+      REPORT("%s\n", toString(std::move(Err)).data());
+      return Queues[0];
+    }
+    return Queue;
   }
 
 private:
@@ -2422,6 +2446,9 @@
 
   /// List of device packet queues.
   std::vector<AMDGPUQueueTy> Queues;
+
+  /// The next queue to be used for a new stream.
+  std::atomic<uint32_t> NextQueue = {0};
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -374,9 +374,9 @@
       // device initialization. These cannot be consulted until the device is
       // initialized correctly. We intialize them in GenericDeviceTy::init().
       OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
-      // By default, the initial number of streams and events are 32.
-      OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 32),
-      OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 32),
+      // By default, the initial number of streams and events is 1.
+      OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
+      OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
       DeviceId(DeviceId), GridValues(OMPGridValues),
       PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
       PinnedAllocs(*this) {}
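The queue handling above boils down to one pattern: allocate all queue slots up front, initialize only slot 0 eagerly, and let the round-robin selector lazily initialize any other slot under that slot's mutex, falling back to queue 0 if late initialization fails. The standalone sketch below demonstrates that pattern; LazyQueue and QueuePool are hypothetical stand-ins for the HSA-based AMDGPUQueueTy and the queue-owning part of AMDGPUDeviceTy, not the plugin implementation.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <vector>

// Hypothetical stand-in for AMDGPUQueueTy: a slot that may not be set up yet.
struct LazyQueue {
  std::mutex Mutex;
  bool Ready = false;

  // Stands in for AMDGPUQueueTy::init(); assume it can fail.
  bool init() {
    Ready = true;
    return true;
  }

  // Mirrors initLazy(): only the first thread to reach an uninitialized slot
  // pays the initialization cost; later callers return immediately.
  bool initLazy() {
    std::lock_guard<std::mutex> Lock(Mutex);
    if (Ready)
      return true;
    return init();
  }
};

// Hypothetical stand-in for the queue-owning part of AMDGPUDeviceTy.
class QueuePool {
  std::vector<LazyQueue> Queues;
  std::atomic<uint32_t> NextQueue{0};

public:
  // The real plugin clamps the count to at least one before reaching here.
  explicit QueuePool(size_t NumQueues) : Queues(NumQueues) {
    Queues.front().init(); // Only queue 0 is initialized eagerly.
  }

  LazyQueue &getNextQueue() {
    uint32_t Current = NextQueue.fetch_add(1, std::memory_order_relaxed);
    uint32_t Idx = Current % Queues.size();
    LazyQueue &Queue = Queues[Idx];
    if (Idx == 0)
      return Queue;
    if (!Queue.initLazy()) {
      // Report the failure but keep going with the eagerly created queue.
      std::fprintf(stderr, "lazy queue initialization failed, using queue 0\n");
      return Queues.front();
    }
    return Queue;
  }
};

int main() {
  QueuePool Pool(4);
  for (int I = 0; I < 8; ++I)
    (void)Pool.getNextQueue();
  return 0;
}

Keeping the selection counter atomic and the locking per slot leaves the common case, an already-initialized queue, cheap, while a failed late initialization degrades to sharing queue 0 instead of aborting the offload.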