diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
@@ -25,9 +25,6 @@
 
   if (!symbol || !var_addr || !var_size)
     return HSA_STATUS_ERROR;
-  if (DeviceId < 0 ||
-      DeviceId >= g_atl_machine.processors<ATLGPUProcessor>().size())
-    return HSA_STATUS_ERROR;
 
   // get the symbol info
   std::string symbolStr = std::string(symbol);
@@ -58,9 +55,6 @@
 
   if (!kernel_name || !value)
     return HSA_STATUS_ERROR;
-  if (DeviceId < 0 ||
-      DeviceId >= g_atl_machine.processors<ATLGPUProcessor>().size())
-    return HSA_STATUS_ERROR;
 
   hsa_status_t status = HSA_STATUS_SUCCESS;
   // get the kernel info
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/machine.h b/openmp/libomptarget/plugins/amdgpu/impl/machine.h
--- a/openmp/libomptarget/plugins/amdgpu/impl/machine.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/machine.h
@@ -66,9 +66,6 @@
   }
   template <typename T> void addProcessor(const T &p);
   template <typename T> std::vector<T> &processors();
-  template <typename T> size_t processorCount() {
-    return processors<T>().size();
-  }
 
 private:
   std::vector<ATLCPUProcessor> cpu_processors_;
@@ -78,14 +75,4 @@
 hsa_amd_memory_pool_t get_memory_pool(const ATLProcessor &proc,
                                       const int mem_id);
 
-extern ATLMachine g_atl_machine;
-template <typename T> T &get_processor(int dev_id) {
-  if (dev_id == -1) {
-    // user is asking runtime to pick a device
-    // best device of this type? pick 0 for now
-    dev_id = 0;
-  }
-  return g_atl_machine.processors<T>()[dev_id];
-}
-
 #endif // SRC_RUNTIME_INCLUDE_MACHINE_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
--- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
@@ -51,7 +51,7 @@
 
   // modules
   static hsa_status_t RegisterModuleFromMemory(
-      void *, size_t, int DeviceId,
+      void *, size_t, hsa_agent_t agent,
       hsa_status_t (*on_deserialized_data)(void *data, size_t size,
                                            void *cb_state),
       void *cb_state, std::vector<hsa_executable_t> &HSAExecutables);
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -703,7 +703,7 @@
 } // namespace
 
 static hsa_status_t get_code_object_custom_metadata(
-    void *binary, size_t binSize, int gpu,
+    void *binary, size_t binSize,
     std::map<std::string, atl_kernel_info_t> &KernelInfoTable) {
   // parse code object with different keys from v2
   // also, the kernel name is not the same as the symbol name -- so a
@@ -878,7 +878,7 @@
 }
 
 static hsa_status_t
-populate_InfoTables(hsa_executable_symbol_t symbol, int gpu,
+populate_InfoTables(hsa_executable_symbol_t symbol,
                     std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
                     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable) {
   hsa_symbol_kind_t type;
@@ -1020,16 +1020,11 @@
 hsa_status_t RegisterModuleFromMemory(
     std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
-    void *module_bytes, size_t module_size, int gpu,
+    void *module_bytes, size_t module_size, hsa_agent_t agent,
     hsa_status_t (*on_deserialized_data)(void *data, size_t size,
                                          void *cb_state),
     void *cb_state, std::vector<hsa_executable_t> &HSAExecutables) {
   hsa_status_t err;
-  assert(gpu >= 0);
-
-  DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu);
-  ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(gpu);
-  hsa_agent_t agent = proc.agent();
   hsa_executable_t executable = {0};
   hsa_profile_t agent_profile;
 
@@ -1058,7 +1053,7 @@
       // Some metadata info is not available through ROCr API, so use custom
       // code object metadata parsing to collect such metadata info
 
-      err = get_code_object_custom_metadata(module_bytes, module_size, gpu,
+      err = get_code_object_custom_metadata(module_bytes, module_size,
                                             KernelInfoTable);
       if (err != HSA_STATUS_SUCCESS) {
         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
@@ -1116,8 +1111,7 @@
     err = hsa::executable_iterate_symbols(
         executable,
         [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t {
-          return populate_InfoTables(symbol, gpu, KernelInfoTable,
-                                     SymbolInfoTable);
+          return populate_InfoTables(symbol, KernelInfoTable, SymbolInfoTable);
         });
     if (err != HSA_STATUS_SUCCESS) {
       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -91,12 +91,23 @@
 hsa_status_t RegisterModuleFromMemory(
     std::map<std::string, atl_kernel_info_t> &KernelInfo,
     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, void *, size_t,
-    int DeviceId,
+    hsa_agent_t agent,
     hsa_status_t (*on_deserialized_data)(void *data, size_t size,
                                          void *cb_state),
     void *cb_state, std::vector<hsa_executable_t> &HSAExecutables);
 }
 
+namespace hsa { // iterate_agents: lets a C++ callable be used as the hsa_iterate_agents callback
+template <typename C> hsa_status_t iterate_agents(C cb) { // cb is called as cb(agent); its result is returned from the trampoline
+  auto L = [](hsa_agent_t agent, void *data) -> hsa_status_t { // capture-free trampoline; state travels via data
+    C *unwrapped = static_cast<C *>(data); // expected to be the &cb handed to hsa_iterate_agents below
+    return (*unwrapped)(agent);
+  };
+  return hsa_iterate_agents(L, static_cast<void *>(&cb)); // status returned unchanged; cb is this call's by-value copy
+}
+
+} // namespace hsa
+
 /// Keep entries table per device
 struct FuncOrGblEntryTy {
   __tgt_target_table Table;
@@ -244,14 +255,10 @@
 /// FIXME: we may need this to be per device and per library.
 std::list<KernelTy> KernelsList;
 
-static std::vector<hsa_agent_t> find_gpu_agents() {
-  std::vector<hsa_agent_t> res;
-
-  hsa_status_t err = hsa_iterate_agents(
-      [](hsa_agent_t agent, void *data) -> hsa_status_t {
-        std::vector<hsa_agent_t> *res =
-            static_cast<std::vector<hsa_agent_t> *>(data);
+template <typename Callback> static hsa_status_t FindAgents(Callback CB) {
 
+  hsa_status_t err =
+      hsa::iterate_agents([&](hsa_agent_t agent) -> hsa_status_t {
         hsa_device_type_t device_type;
         // get_info fails iff HSA runtime not yet initialized
         hsa_status_t err =
@@ -260,18 +267,16 @@
           printf("rtl.cpp: err %d\n", err);
         assert(err == HSA_STATUS_SUCCESS);
 
-        if (device_type == HSA_DEVICE_TYPE_GPU) {
-          res->push_back(agent);
-        }
+        CB(device_type, agent);
         return HSA_STATUS_SUCCESS;
-      },
-      &res);
+      });
 
   // iterate_agents fails iff HSA runtime not yet initialized
-  if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS)
+  if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) {
     printf("rtl.cpp: err %d\n", err);
-  assert(err == HSA_STATUS_SUCCESS);
-  return res;
+  }
+
+  return err;
 }
 
 static void callbackQueue(hsa_status_t status, hsa_queue_t *source,
@@ -346,8 +351,7 @@
 std::pair<hsa_status_t, hsa_amd_memory_pool_t>
 FindKernargPool(const std::vector<hsa_agent_t> &HSAAgents) {
   std::vector<hsa_amd_memory_pool_t> KernArgPools;
-  for (const auto &processor : g_atl_machine.processors<ATLCPUProcessor>()) {
-    hsa_agent_t Agent = processor.agent();
+  for (const auto &Agent : HSAAgents) {
     hsa_status_t err = HSA_STATUS_SUCCESS;
     err = hsa_amd_agent_iterate_memory_pools(
         Agent, addKernArgPool, static_cast<void *>(&KernArgPools));
@@ -384,6 +388,9 @@
   std::vector<hsa_agent_t> HSAAgents;
   std::vector<hsa_queue_t *> HSAQueues; // one per gpu
 
+  // CPUs
+  std::vector<hsa_agent_t> CPUAgents;
+
   // Device properties
   std::vector<int> ComputeUnits;
   std::vector<int> GroupsPerDevice;
@@ -538,7 +545,16 @@
     // Init hostcall soon after initializing ATMI
     hostrpc_init();
 
-    HSAAgents = find_gpu_agents();
+    err = FindAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
+      if (DeviceType == HSA_DEVICE_TYPE_CPU) {
+        CPUAgents.push_back(Agent);
+      } else {
+        HSAAgents.push_back(Agent);
+      }
+    });
+    if (err != HSA_STATUS_SUCCESS)
+      return;
+
     NumberOfDevices = (int)HSAAgents.size();
 
     if (NumberOfDevices == 0) {
@@ -547,8 +563,7 @@
     } else {
       DP("There are %d devices supporting HSA.\n", NumberOfDevices);
     }
-
-    std::tie(err, KernArgPool) = core::FindKernargPool(HSAAgents);
+    std::tie(err, KernArgPool) = core::FindKernargPool(CPUAgents);
     if (err != HSA_STATUS_SUCCESS) {
       DP("Error when reading memory pools\n");
       return;
@@ -1104,8 +1119,9 @@
     return (*unwrapped)(data, size);
   };
   return core::RegisterModuleFromMemory(
-      KernelInfoTable, SymbolInfoTable, module_bytes, module_size, DeviceId, L,
-      static_cast<void *>(&cb), HSAExecutables);
+      KernelInfoTable, SymbolInfoTable, module_bytes, module_size,
+      DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&cb),
+      HSAExecutables);
 }
 } // namespace