diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*===--------------------------------------------------------------------------
- *              ATMI (Asynchronous Task and Memory Interface)
- *
- * This file is distributed under the MIT License. See LICENSE.txt for details.
- *===------------------------------------------------------------------------*/
-#ifndef INCLUDE_ATMI_INTEROP_HSA_H_
-#define INCLUDE_ATMI_INTEROP_HSA_H_
-
-#include "atmi_runtime.h"
-#include "hsa.h"
-#include "hsa_ext_amd.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-/** \defgroup interop_hsa_functions ATMI-HSA Interop
- *  @{
- */
-
-/**
- * @brief Get the device address and size of an HSA global symbol
- *
- * @detail Use this function to query the device address and size of an HSA
- * global symbol.
- * The symbol can be set at by the compiler or by the application writer in a
- * language-specific manner. This function is meaningful only after calling one
- * of the @p atmi_module_register functions.
- *
- * @param[in] place The ATMI memory place
- *
- * @param[in] symbol Pointer to a non-NULL global symbol name
- *
- * @param[in] var_addr Pointer to a non-NULL @p void* variable that will
- * hold the device address of the global symbol object.
- *
- * @param[in] var_size Pointer to a non-NULL @p uint variable that will
- * hold the size of the global symbol object.
- *
- * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
- *
- * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are
- * invalid
- * location in the current node, or if ATMI is not initialized.
- *
- * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
- */
-atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place,
-                                               const char *symbol,
-                                               void **var_addr,
-                                               unsigned int *var_size);
-
-/**
- * @brief Get the HSA-specific kernel info from a kernel name
- *
- * @detail Use this function to query the HSA-specific kernel info from the
- * kernel name.
- * This function is meaningful only after calling one
- * of the @p atmi_module_register functions.
- *
- * @param[in] place The ATMI memory place
- *
- * @param[in] kernel_name Pointer to a char array with the kernel name
- *
- * @param[in] info The different possible kernel properties
- *
- * @param[in] value Pointer to a non-NULL @p uint variable that will
- * hold the return value of the kernel property.
- *
- * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
- *
- * @retval ::ATMI_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are
- * invalid
- * location in the current node, or if ATMI is not initialized.
- *
- * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
- */
-atmi_status_t atmi_interop_hsa_get_kernel_info(
-    atmi_mem_place_t place, const char *kernel_name,
-    hsa_executable_symbol_info_t info, uint32_t *value);
-/** @} */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // INCLUDE_ATMI_INTEROP_HSA_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_interop_hsa.cpp
@@ -3,15 +3,14 @@
  *
  * This file is distributed under the MIT License. See LICENSE.txt for details.
  *===------------------------------------------------------------------------*/
-#include "atmi_interop_hsa.h"
 #include "internal.h"
 
 using core::atl_is_atmi_initialized;
 
-atmi_status_t atmi_interop_hsa_get_symbol_info(atmi_mem_place_t place,
-                                               const char *symbol,
-                                               void **var_addr,
-                                               unsigned int *var_size) {
+atmi_status_t atmi_interop_hsa_get_symbol_info(
+    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
+    atmi_mem_place_t place, const char *symbol, void **var_addr,
+    unsigned int *var_size) {
   /*
      // Typical usage:
      void *var_addr;
@@ -32,9 +31,8 @@
 
   // get the symbol info
   std::string symbolStr = std::string(symbol);
-  if (SymbolInfoTable[place.dev_id].find(symbolStr) !=
-      SymbolInfoTable[place.dev_id].end()) {
-    atl_symbol_info_t info = SymbolInfoTable[place.dev_id][symbolStr];
+  if (SymbolInfoTable.find(symbolStr) != SymbolInfoTable.end()) {
+    atl_symbol_info_t info = SymbolInfoTable[symbolStr];
     *var_addr = reinterpret_cast<void *>(info.addr);
     *var_size = info.size;
     return ATMI_STATUS_SUCCESS;
@@ -46,6 +44,7 @@
 }
 
 atmi_status_t atmi_interop_hsa_get_kernel_info(
+    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
     atmi_mem_place_t place, const char *kernel_name,
     hsa_executable_symbol_info_t kernel_info, uint32_t *value) {
   /*
@@ -68,9 +67,8 @@
   atmi_status_t status = ATMI_STATUS_SUCCESS;
   // get the kernel info
   std::string kernelStr = std::string(kernel_name);
-  if (KernelInfoTable[place.dev_id].find(kernelStr) !=
-      KernelInfoTable[place.dev_id].end()) {
-    atl_kernel_info_t info = KernelInfoTable[place.dev_id][kernelStr];
+  if (KernelInfoTable.find(kernelStr) != KernelInfoTable.end()) {
+    atl_kernel_info_t info = KernelInfoTable[kernelStr];
     switch (kernel_info) {
     case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE:
       *value = info.group_segment_size;
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
@@ -106,9 +106,6 @@
   uint32_t size;
 } atl_symbol_info_t;
 
-extern std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
-extern std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
-
 // ---------------------- Kernel End -------------
 
 namespace core {
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -20,6 +20,20 @@
 
 #include "msgpack.h"
 
+namespace hsa {
+// Shim so hsa_executable_iterate_symbols can take any callable, e.g. a lambda
+template <typename C>
+hsa_status_t executable_iterate_symbols(hsa_executable_t executable, C cb) {
+  void *State = static_cast<void *>(&cb);
+  auto Thunk = [](hsa_executable_t Exec, hsa_executable_symbol_t Sym,
+                  void *Data) -> hsa_status_t {
+    // Data is the address of cb handed over below; forward the call to it
+    return (*static_cast<C *>(Data))(Exec, Sym);
+  };
+  return hsa_executable_iterate_symbols(executable, Thunk, State);
+}
+} // namespace hsa
+
 typedef unsigned char *address;
 /*
  * Note descriptors.
@@ -132,9 +146,6 @@
 
 std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
 
-std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
-std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
-
 bool g_atmi_initialized = false;
 bool g_atmi_hostcall_required = false;
 
@@ -194,16 +205,7 @@
 }
 
 atmi_status_t Runtime::Finalize() {
-  atmi_status_t rc = ATMI_STATUS_SUCCESS;
-  for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) {
-    SymbolInfoTable[i].clear();
-  }
-  SymbolInfoTable.clear();
-  for (uint32_t i = 0; i < KernelInfoTable.size(); i++) {
-    KernelInfoTable[i].clear();
-  }
-  KernelInfoTable.clear();
-
+  atmi_status_t rc;
   atl_reset_atmi_initialized();
   hsa_status_t err = hsa_shut_down();
   if (err != HSA_STATUS_SUCCESS) {
@@ -543,13 +545,6 @@
       return err;
     }
 
-    int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
-    KernelInfoTable.resize(gpu_count);
-    SymbolInfoTable.resize(gpu_count);
-    for (uint32_t i = 0; i < SymbolInfoTable.size(); i++)
-      SymbolInfoTable[i].clear();
-    for (uint32_t i = 0; i < KernelInfoTable.size(); i++)
-      KernelInfoTable[i].clear();
     atlc.g_hsa_initialized = true;
     DEBUG_PRINT("done\n");
   }
@@ -822,8 +817,9 @@
 }
 } // namespace
 
-static hsa_status_t get_code_object_custom_metadata(void *binary,
-                                                    size_t binSize, int gpu) {
+static hsa_status_t get_code_object_custom_metadata(
+    void *binary, size_t binSize, int gpu,
+    std::map<std::string, atl_kernel_info_t> &KernelInfoTable) {
   // parse code object with different keys from v2
   // also, the kernel name is not the same as the symbol name -- so a
   // symbol->name map is needed
@@ -990,16 +986,16 @@
                 kernel_segment_size, info.kernel_segment_size);
 
     // kernel received, now add it to the kernel info table
-    KernelInfoTable[gpu][kernelName] = info;
+    KernelInfoTable[kernelName] = info;
   }
 
   return HSA_STATUS_SUCCESS;
 }
 
-static hsa_status_t populate_InfoTables(hsa_executable_t executable,
-                                        hsa_executable_symbol_t symbol,
-                                        void *data) {
-  int gpu = *static_cast<int *>(data);
+static hsa_status_t
+populate_InfoTables(hsa_executable_symbol_t symbol, int gpu,
+                    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
+                    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable) {
   hsa_symbol_kind_t type;
 
   uint32_t name_length;
@@ -1036,11 +1032,16 @@
     // by now, the kernel info table should already have an entry
     // because the non-ROCr custom code object parsing is called before
     // iterating over the code object symbols using ROCr
-    if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) {
-      return HSA_STATUS_ERROR;
+    if (KernelInfoTable.find(kernelName) == KernelInfoTable.end()) {
+      if (HSA_STATUS_ERROR_INVALID_CODE_OBJECT != HSA_STATUS_SUCCESS) {
+        printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+               "Finding the entry kernel info table",
+               get_error_string(HSA_STATUS_ERROR_INVALID_CODE_OBJECT));
+        exit(1);
+      }
     }
     // found, so assign and update
-    info = KernelInfoTable[gpu][kernelName];
+    info = KernelInfoTable[kernelName];
 
     /* Extract dispatch information from the symbol */
     err = hsa_executable_symbol_get_info(
@@ -1078,7 +1079,7 @@
         info.private_segment_size, info.kernel_segment_size);
 
     // assign it back to the kernel info table
-    KernelInfoTable[gpu][kernelName] = info;
+    KernelInfoTable[kernelName] = info;
     free(name);
   } else if (type == HSA_SYMBOL_KIND_VARIABLE) {
     err = hsa_executable_symbol_get_info(
@@ -1119,12 +1120,9 @@
     atmi_mem_place_t place = ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, gpu, 0);
     DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr,
                 info.size);
-    err = register_allocation(reinterpret_cast<void *>(info.addr),
-                              (size_t)info.size, place);
-    if (err != HSA_STATUS_SUCCESS) {
-      return err;
-    }
-    SymbolInfoTable[gpu][std::string(name)] = info;
+    register_allocation(reinterpret_cast<void *>(info.addr), (size_t)info.size,
+                        place);
+    SymbolInfoTable[std::string(name)] = info;
     if (strcmp(name, "needs_hostcall_buffer") == 0)
       g_atmi_hostcall_required = true;
     free(name);
@@ -1134,7 +1132,9 @@
   return HSA_STATUS_SUCCESS;
 }
 
-atmi_status_t Runtime::RegisterModuleFromMemory(
+atmi_status_t RegisterModuleFromMemory(
+    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
+    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
     void *module_bytes, size_t module_size, atmi_place_t place,
     atmi_status_t (*on_deserialized_data)(void *data, size_t size,
                                           void *cb_state),
@@ -1174,7 +1174,8 @@
       // Some metadata info is not available through ROCr API, so use custom
       // code object metadata parsing to collect such metadata info
 
-      err = get_code_object_custom_metadata(module_bytes, module_size, gpu);
+      err = get_code_object_custom_metadata(module_bytes, module_size, gpu,
+                                            KernelInfoTable);
       if (err != HSA_STATUS_SUCCESS) {
         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
                     "Getting custom code object metadata",
@@ -1228,8 +1229,12 @@
       return ATMI_STATUS_ERROR;
     }
 
-    err = hsa_executable_iterate_symbols(executable, populate_InfoTables,
-                                         static_cast<void *>(&gpu));
+    err = hsa::executable_iterate_symbols(
+        executable,
+        [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t {
+          return populate_InfoTables(symbol, gpu, KernelInfoTable,
+                                     SymbolInfoTable);
+        });
     if (err != HSA_STATUS_SUCCESS) {
       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
              "Iterating over symbols for execuatable", get_error_string(err));
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -30,7 +30,6 @@
 #include <vector>
 
 // Header from ATMI interface
-#include "atmi_interop_hsa.h"
 #include "atmi_runtime.h"
 
 #include "internal.h"
@@ -86,6 +85,25 @@
 
 #include "elf_common.h"
 
+atmi_status_t atmi_interop_hsa_get_symbol_info(
+    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
+    atmi_mem_place_t place, const char *symbol, void **var_addr,
+    unsigned int *var_size);
+atmi_status_t atmi_interop_hsa_get_kernel_info(
+    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
+    atmi_mem_place_t place, const char *kernel_name,
+    hsa_executable_symbol_info_t info, uint32_t *value);
+
+namespace core { // prototype only; the definition is in impl/system.cpp
+atmi_status_t RegisterModuleFromMemory(
+    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
+    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
+    void *module_bytes, size_t module_size, atmi_place_t place,
+    atmi_status_t (*on_deserialized_data)(void *data, size_t size,
+                                          void *cb_state),
+    void *cb_state, std::vector<hsa_executable_t> &HSAExecutables);
+} // namespace core
+
 /// Keep entries table per device
 struct FuncOrGblEntryTy {
   __tgt_target_table Table;
@@ -337,6 +355,9 @@
 
   std::vector<hsa_executable_t> HSAExecutables;
 
+  std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
+  std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
+
   struct atmiFreePtrDeletor {
     void operator()(void *p) {
       atmi_free(p); // ignore failure to free
@@ -480,6 +501,13 @@
     NumTeams.resize(NumberOfDevices);
     NumThreads.resize(NumberOfDevices);
     deviceStateStore.resize(NumberOfDevices);
+    KernelInfoTable.resize(NumberOfDevices);
+    SymbolInfoTable.resize(NumberOfDevices);
+
+    for (uint32_t i = 0; i < SymbolInfoTable.size(); i++)
+      SymbolInfoTable[i].clear();
+    for (uint32_t i = 0; i < KernelInfoTable.size(); i++)
+      KernelInfoTable[i].clear();
 
     for (int i = 0; i < NumberOfDevices; i++) {
       HSAQueues[i] = nullptr;
@@ -554,6 +582,8 @@
     // atmi_finalize removes access to it
     deviceStateStore.clear();
     KernelArgPoolMap.clear();
+    KernelInfoTable.clear();
+    SymbolInfoTable.clear();
     // Terminate hostrpc before finalizing ATMI
     hostrpc_terminate();
 
@@ -991,15 +1021,17 @@
 
 template <typename C>
 atmi_status_t module_register_from_memory_to_place(
+    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
+    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
     void *module_bytes, size_t module_size, atmi_place_t place, C cb,
     std::vector<hsa_executable_t> &HSAExecutables) {
   auto L = [](void *data, size_t size, void *cb_state) -> atmi_status_t {
     C *unwrapped = static_cast<C *>(cb_state);
     return (*unwrapped)(data, size);
   };
-  return core::Runtime::RegisterModuleFromMemory(
-      module_bytes, module_size, place, L, static_cast<void *>(&cb),
-      HSAExecutables);
+  return core::RegisterModuleFromMemory(
+      KernelInfoTable, SymbolInfoTable, module_bytes, module_size, place, L,
+      static_cast<void *>(&cb), HSAExecutables);
 }
 } // namespace
 
@@ -1114,11 +1146,12 @@
         DP("Setting global device environment after load (%u bytes)\n",
            si.size);
         int device_id = host_device_env.device_num;
-
+        auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
         void *state_ptr;
         uint32_t state_ptr_size;
         atmi_status_t err = atmi_interop_hsa_get_symbol_info(
-            get_gpu_mem_place(device_id), sym(), &state_ptr, &state_ptr_size);
+            SymbolInfo, get_gpu_mem_place(device_id), sym(), &state_ptr,
+            &state_ptr_size);
         if (err != ATMI_STATUS_SUCCESS) {
           DP("failed to find %s in loaded image\n", sym());
           return err;
@@ -1197,8 +1230,11 @@
     auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image,
                                   img_size);
 
+    auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id];
+    auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
     atmi_status_t err = module_register_from_memory_to_place(
-        (void *)image->ImageStart, img_size, get_gpu_place(device_id),
+        KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size,
+        get_gpu_place(device_id),
         [&](void *data, size_t size) { return env.before_loading(data, size); },
         DeviceInfo.HSAExecutables);
 
@@ -1227,9 +1263,10 @@
 
     void *state_ptr;
     uint32_t state_ptr_size;
+    auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
     atmi_status_t err = atmi_interop_hsa_get_symbol_info(
-        get_gpu_mem_place(device_id), "omptarget_nvptx_device_State",
-        &state_ptr, &state_ptr_size);
+        SymbolInfoMap, get_gpu_mem_place(device_id),
+        "omptarget_nvptx_device_State", &state_ptr, &state_ptr_size);
 
     if (err != ATMI_STATUS_SUCCESS) {
       DP("No device_state symbol found, skipping initialization\n");
@@ -1311,8 +1348,10 @@
       void *varptr;
       uint32_t varsize;
 
+      auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
       atmi_status_t err = atmi_interop_hsa_get_symbol_info(
-          get_gpu_mem_place(device_id), e->name, &varptr, &varsize);
+          SymbolInfoMap, get_gpu_mem_place(device_id), e->name, &varptr,
+          &varsize);
 
       if (err != ATMI_STATUS_SUCCESS) {
         // Inform the user what symbol prevented offloading
@@ -1353,8 +1392,10 @@
 
     atmi_mem_place_t place = get_gpu_mem_place(device_id);
     uint32_t kernarg_segment_size;
+    auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id];
     atmi_status_t err = atmi_interop_hsa_get_kernel_info(
-        place, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+        KernelInfoMap, place, e->name,
+        HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
         &kernarg_segment_size);
 
     // each arg is a void * in this openmp implementation
@@ -1782,6 +1823,7 @@
   KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
 
   std::string kernel_name = std::string(KernelInfo->Name);
+  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
   if (KernelInfoTable[device_id].find(kernel_name) ==
       KernelInfoTable[device_id].end()) {
     DP("Kernel %s not found\n", kernel_name.c_str());