diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -181,8 +181,6 @@ }; }; -extern std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools; - namespace core { hsa_status_t atl_init_gpu_context(); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -142,8 +142,6 @@ ATLMachine g_atl_machine; -std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools; - /* atlc is all internal global values. The structure atl_context_t is defined in atl_internal.h @@ -198,10 +196,6 @@ if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) { ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED); proc->addMemory(new_mem); - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) { - DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle); - atl_gpu_kernarg_pools.push_back(memory_pool); - } } else { ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED); proc->addMemory(new_mem); diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -37,6 +37,7 @@ #include "Debug.h" #include "get_elf_mach_gfx_name.h" +#include "machine.h" #include "omptargetplugin.h" #include "print_tracing.h" @@ -71,6 +72,7 @@ } int print_kernel_trace; +extern ATLMachine g_atl_machine; #ifdef OMPTARGET_DEBUG #define check(msg, status) \ @@ -136,15 +138,15 @@ KernelArgPool(const KernelArgPool &) = delete; KernelArgPool(KernelArgPool &&) = delete; - KernelArgPool(uint32_t kernarg_segment_size) + KernelArgPool(uint32_t kernarg_segment_size, + hsa_amd_memory_pool_t &memory_pool) : 
kernarg_segment_size(kernarg_segment_size) { // atmi uses one pool per kernel for all gpus, with a fixed upper size // preserving that exact scheme here, including the queue hsa_status_t err = hsa_amd_memory_pool_allocate( - atl_gpu_kernarg_pools[0], - kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, + memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0, &kernarg_region); if (err != HSA_STATUS_SUCCESS) { @@ -224,7 +226,8 @@ KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id, void *_CallStackAddr, const char *_Name, - uint32_t _kernarg_segment_size) + uint32_t _kernarg_segment_size, + hsa_amd_memory_pool_t &KernArgMemoryPool) : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize), device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) { DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); @@ -232,8 +235,8 @@ std::string N(_Name); if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { KernelArgPoolMap.insert( - std::make_pair(N, std::unique_ptr<KernelArgPool>( - new KernelArgPool(_kernarg_segment_size)))); + std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool( + _kernarg_segment_size, KernArgMemoryPool)))); } } }; @@ -297,6 +300,61 @@ header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; return header; } + +hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { + std::vector<hsa_amd_memory_pool_t> *Result = + static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data); + bool AllocAllowed = false; + hsa_status_t err = hsa_amd_memory_pool_get_info( + MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, + &AllocAllowed); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n", + get_error_string(err)); + return err; + } + + if (!AllocAllowed) { + // nothing needs to be done here. 
+ return HSA_STATUS_SUCCESS; + } + + uint32_t GlobalFlags = 0; + err = hsa_amd_memory_pool_get_info( + MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err)); + return err; + } + + fprintf(stderr, "Flags : %d\n", GlobalFlags); + if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) && + (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) { + Result->push_back(MemoryPool); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t +CollectKernArgPools(const std::vector<hsa_agent_t> &HSAAgents, + std::vector<hsa_amd_memory_pool_t> &KernArgPools) { + + for (const auto &processor : g_atl_machine.processors<ATLGPUProcessor>()) { + hsa_agent_t Agent = processor.agent(); + hsa_status_t err = HSA_STATUS_SUCCESS; + err = hsa_amd_agent_iterate_memory_pools( + Agent, addKernArgPool, static_cast<void *>(&KernArgPools)); + if (err != HSA_STATUS_SUCCESS) { + printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, + "Iterate all memory pools", get_error_string(err)); + return err; + } + } + + return HSA_STATUS_SUCCESS; +} + } // namespace } // namespace core @@ -344,6 +402,8 @@ std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable; std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable; + std::vector<hsa_amd_memory_pool_t> KernArgPools; + struct atmiFreePtrDeletor { void operator()(void *p) { core::Runtime::Memfree(p); // ignore failure to free @@ -390,6 +450,30 @@ return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId); } + // selects the first non-zero sized kernarg pool. + // TODO: should this take care of the device topology? 
+ std::pair<hsa_status_t, hsa_amd_memory_pool_t> getKernArgPool() { + hsa_status_t err; + hsa_amd_memory_pool_t result; + for (int i = 0; i < KernArgPools.size(); i++) { + size_t size = 0; + hsa_amd_memory_pool_t &pool = KernArgPools[i]; + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, + &size); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Failed to get memory pool size: %s\n", get_error_string(err)); + return {err, pool}; + } + if (size > 0) { + DP("Using %d kernarg pool (size = %lu bytes) out of %d pools\n", i, + size, KernArgPools.size()); + return {HSA_STATUS_SUCCESS, pool}; + } + } + + return {HSA_STATUS_ERROR, result}; + } + // Record entry point associated with device void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { assert(device_id < (int32_t)FuncGblEntries.size() && @@ -477,6 +561,13 @@ DP("There are %d devices supporting HSA.\n", NumberOfDevices); } + // collect memory pools with kernarg_init flag set + err = core::CollectKernArgPools(HSAAgents, KernArgPools); + if (err != HSA_STATUS_SUCCESS) { + DP("Error when reading memory pools\n"); + return; + } + // Init the device info HSAQueues.resize(NumberOfDevices); FuncGblEntries.resize(NumberOfDevices); @@ -1261,6 +1352,14 @@ } DP("ATMI module successfully loaded!\n"); + hsa_amd_memory_pool_t KernArgPool; + hsa_status_t Err; + std::tie(Err, KernArgPool) = DeviceInfo.getKernArgPool(); + if (Err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "Unable to find any kernarg pool: %s\n", + get_error_string(Err)); + return NULL; + } { // the device_State array is either large value in bss or a void* that @@ -1543,8 +1642,8 @@ } KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, - CallStackAddr, e->name, - kernarg_segment_size)); + CallStackAddr, e->name, kernarg_segment_size, + KernArgPool)); __tgt_offload_entry entry = *e; entry.addr = (void *)&KernelsList.back(); DeviceInfo.addOffloadEntry(device_id, entry);