Index: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -883,1355 +883,1351 @@ AsyncInfo->Queue = 0; } -bool elf_machine_id_is_amdgcn(__tgt_device_image *image) { - const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h - int32_t r = elf_check_machine(image, amdgcnMachineID); - if (!r) { - DP("Supported machine ID not found\n"); - } - return r; -} +// Determine launch values for kernel. +struct launchVals { + int WorkgroupSize; + int GridSize; +}; +launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, + int ConstWGSize, + llvm::omp::OMPTgtExecModeFlags ExecutionMode, + int num_teams, int thread_limit, + uint64_t loop_tripcount, int DeviceNumTeams) { -uint32_t elf_e_flags(__tgt_device_image *image) { - char *img_begin = (char *)image->ImageStart; - size_t img_size = (char *)image->ImageEnd - img_begin; + int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size; + int num_groups = 0; - Elf *e = elf_memory(img_begin, img_size); - if (!e) { - DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); - return 0; + int Max_Teams = + Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; + if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit) + Max_Teams = RTLDeviceInfoTy::HardTeamLimit; + + if (print_kernel_trace & STARTUP_DETAILS) { + DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams); + DP("Max_Teams: %d\n", Max_Teams); + DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); + DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size); + DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", + RTLDeviceInfoTy::Default_WG_Size); + DP("thread_limit: %d\n", thread_limit); + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("ConstWGSize: %d\n", ConstWGSize); + } + // check for thread_limit() clause + if (thread_limit > 0) { + threadsPerGroup = thread_limit; + DP("Setting threads per block to requested %d\n", thread_limit); + // Add master warp for GENERIC + if (ExecutionMode == + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { + threadsPerGroup += WarpSize; + DP("Adding master wavefront: +%d threads\n", WarpSize); + } + if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max + threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size; + DP("Setting threads per block to maximum %d\n", threadsPerGroup); + } + } + // check flat_max_work_group_size attr here + if (threadsPerGroup > ConstWGSize) { + threadsPerGroup = ConstWGSize; + DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", + threadsPerGroup); } + if (print_kernel_trace & STARTUP_DETAILS) + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("Preparing %d threads\n", threadsPerGroup); - Elf64_Ehdr *eh64 = elf64_getehdr(e); + // Set default num_groups (teams) + if (Env.TeamLimit > 0) + num_groups = (Max_Teams < Env.TeamLimit) ? 
Max_Teams : Env.TeamLimit; + else + num_groups = Max_Teams; + DP("Set default num of groups %d\n", num_groups); - if (!eh64) { - DP("Unable to get machine ID from ELF file!\n"); - elf_end(e); - return 0; + if (print_kernel_trace & STARTUP_DETAILS) { + DP("num_groups: %d\n", num_groups); + DP("num_teams: %d\n", num_teams); } - uint32_t Flags = eh64->e_flags; - - elf_end(e); - DP("ELF Flags: 0x%x\n", Flags); - return Flags; -} -} // namespace + // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size + // This reduction is typical for default case (no thread_limit clause). + // or when user goes crazy with num_teams clause. + // FIXME: We cant distinguish between a constant or variable thread limit. + // So we only handle constant thread_limits. + if (threadsPerGroup > + RTLDeviceInfoTy::Default_WG_Size) // 256 < threadsPerGroup <= 1024 + // Should we round threadsPerGroup up to nearest WarpSize + // here? + num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup; -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { - return elf_machine_id_is_amdgcn(image); -} + // check for num_teams() clause + if (num_teams > 0) { + num_groups = (num_teams < num_groups) ? num_teams : num_groups; + } + if (print_kernel_trace & STARTUP_DETAILS) { + DP("num_groups: %d\n", num_groups); + DP("Env.NumTeams %d\n", Env.NumTeams); + DP("Env.TeamLimit %d\n", Env.TeamLimit); + } -int __tgt_rtl_number_of_devices() { - // If the construction failed, no methods are safe to call - if (DeviceInfo.ConstructionSucceeded) { - return DeviceInfo.NumberOfDevices; + if (Env.NumTeams > 0) { + num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups; + DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); + } else if (Env.TeamLimit > 0) { + num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups; + DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); } else { - DP("AMDGPU plugin construction failed. Zero devices available\n"); - return 0; + if (num_teams <= 0) { + if (loop_tripcount > 0) { + if (ExecutionMode == + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { + // round up to the nearest integer + num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1; + } else if (ExecutionMode == + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { + num_groups = loop_tripcount; + } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { + // This is a generic kernel that was transformed to use SPMD-mode + // execution but uses Generic-mode semantics for scheduling. + num_groups = loop_tripcount; + } + DP("Using %d teams due to loop trip count %" PRIu64 " and number of " + "threads per block %d\n", + num_groups, loop_tripcount, threadsPerGroup); + } + } else { + num_groups = num_teams; + } + if (num_groups > Max_Teams) { + num_groups = Max_Teams; + if (print_kernel_trace & STARTUP_DETAILS) + DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams); + } + if (num_groups > num_teams && num_teams > 0) { + num_groups = num_teams; + if (print_kernel_trace & STARTUP_DETAILS) + DP("Limiting num_groups %d to clause num_teams %d \n", num_groups, + num_teams); + } } -} -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %ld\n", RequiresFlags); - DeviceInfo.RequiresFlags = RequiresFlags; - return RequiresFlags; + // num_teams clause always honored, no matter what, unless DEFAULT is active. + if (num_teams > 0) { + num_groups = num_teams; + // Cap num_groups to EnvMaxTeamsDefault if set. 
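// A minimal standalone sketch of the clause/cap interaction implemented
// above: an explicit num_teams clause always wins, but the result is still
// capped by Env.MaxTeamsDefault when that default is set. selectNumGroups is
// illustrative only, not part of the plugin.
static inline int selectNumGroups(int CurrentGroups, int NumTeamsClause,
                                  int MaxTeamsDefault) {
  int NumGroups = CurrentGroups;
  if (NumTeamsClause > 0)
    NumGroups = NumTeamsClause;                  // clause honored
  if (MaxTeamsDefault > 0 && NumGroups > MaxTeamsDefault)
    NumGroups = MaxTeamsDefault;                 // capped by the default
  return NumGroups;                              // e.g. (960, 4096, 1024) -> 1024
}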
+ if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault) + num_groups = Env.MaxTeamsDefault; + } + if (print_kernel_trace & STARTUP_DETAILS) { + DP("threadsPerGroup: %d\n", threadsPerGroup); + DP("num_groups: %d\n", num_groups); + DP("loop_tripcount: %ld\n", loop_tripcount); + } + DP("Final %d num_groups and %d threadsPerGroup\n", num_groups, + threadsPerGroup); + + launchVals res; + res.WorkgroupSize = threadsPerGroup; + res.GridSize = threadsPerGroup * num_groups; + return res; } -namespace { -template bool enforce_upper_bound(T *value, T upper) { - bool changed = *value > upper; - if (changed) { - *value = upper; +static uint64_t acquire_available_packet_id(hsa_queue_t *queue) { + uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); + bool full = true; + while (full) { + full = + packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue)); } - return changed; + return packet_id; } -} // namespace - -int32_t __tgt_rtl_init_device(int device_id) { - hsa_status_t err; - // this is per device id init - DP("Initialize the device id: %d\n", device_id); +// Base kernel launch function used by synchronous and asynchronous versions +int32_t runRegionNowaitLocked(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t num_teams, + int32_t thread_limit, uint64_t loop_tripcount) { + // Set the context we are using + // update thread limit content in gpu memory if un-initialized or specified + // from host - hsa_agent_t agent = DeviceInfo.HSAAgents[device_id]; + DP("Run target team region thread_limit %d\n", thread_limit); - // Get number of Compute Unit - uint32_t compute_units = 0; - err = hsa_agent_get_info( - agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &compute_units); - if (err != HSA_STATUS_SUCCESS) { - DeviceInfo.ComputeUnits[device_id] = 1; - DP("Error getting compute units : settiing to 1\n"); - } else { - DeviceInfo.ComputeUnits[device_id] = compute_units; - DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]); - } + // All args are references. + std::vector args(arg_num); + std::vector ptrs(arg_num); - char GetInfoName[64]; // 64 max size returned by get info - err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, - (void *)GetInfoName); - if (err) - DeviceInfo.GPUName[device_id] = "--unknown gpu--"; - else { - DeviceInfo.GPUName[device_id] = GetInfoName; + DP("Arg_num: %d\n", arg_num); + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i])); } - if (print_kernel_trace & STARTUP_DETAILS) - DP("Device#%-2d CU's: %2d %s\n", device_id, - DeviceInfo.ComputeUnits[device_id], - DeviceInfo.GPUName[device_id].c_str()); + KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; - // Query attributes to determine number of threads/block and blocks/grid. 
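// The acquire_available_packet_id helper added above reserves an AQL queue
// slot by bumping the write index and spinning until the index fits within
// (read index + queue size). A simplified, HSA-free model of that invariant;
// RingModel is illustrative and not part of the plugin.
#include <atomic>
#include <cstdint>
struct RingModel {
  std::atomic<uint64_t> WriteIndex{0};
  std::atomic<uint64_t> ReadIndex{0};
  uint64_t Size; // number of packet slots, a power of two
  uint64_t reserveSlot() {
    uint64_t Id = WriteIndex.fetch_add(1, std::memory_order_relaxed);
    // Wait until the consumer has retired enough packets for slot Id to fit.
    while (Id >= Size + ReadIndex.load(std::memory_order_acquire)) {
    }
    return Id; // the caller indexes the packet array with (Id & (Size - 1))
  }
};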
- uint16_t workgroup_max_dim[3]; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, - &workgroup_max_dim); - if (err != HSA_STATUS_SUCCESS) { - DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams; - DP("Error getting grid dims: num groups : %d\n", - RTLDeviceInfoTy::DefaultNumTeams); - } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) { - DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0]; - DP("Using %d ROCm blocks per grid\n", - DeviceInfo.GroupsPerDevice[device_id]); - } else { - DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit; - DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " - "at the hard limit\n", - workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit); + std::string kernel_name = std::string(KernelInfo->Name); + auto &KernelInfoTable = DeviceInfo.KernelInfoTable; + if (KernelInfoTable[device_id].find(kernel_name) == + KernelInfoTable[device_id].end()) { + DP("Kernel %s not found\n", kernel_name.c_str()); + return OFFLOAD_FAIL; } - // Get thread limit - hsa_dim3_t grid_max_dim; - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim); - if (err == HSA_STATUS_SUCCESS) { - DeviceInfo.ThreadsPerGroup[device_id] = - reinterpret_cast(&grid_max_dim)[0] / - DeviceInfo.GroupsPerDevice[device_id]; - - if (DeviceInfo.ThreadsPerGroup[device_id] == 0) { - DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; - DP("Default thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); - } else if (enforce_upper_bound(&DeviceInfo.ThreadsPerGroup[device_id], - RTLDeviceInfoTy::Max_WG_Size)) { - DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); - } else { - DP("Using ROCm Queried thread limit: %d\n", - DeviceInfo.ThreadsPerGroup[device_id]); - } - } else { - DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; - DP("Error getting max block dimension, use default:%d \n", - RTLDeviceInfoTy::Max_WG_Size); - } + const atl_kernel_info_t KernelInfoEntry = + KernelInfoTable[device_id][kernel_name]; + const uint32_t group_segment_size = KernelInfoEntry.group_segment_size; + const uint32_t sgpr_count = KernelInfoEntry.sgpr_count; + const uint32_t vgpr_count = KernelInfoEntry.vgpr_count; + const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count; + const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count; - // Get wavefront size - uint32_t wavefront_size = 0; - err = - hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size); - if (err == HSA_STATUS_SUCCESS) { - DP("Queried wavefront size: %d\n", wavefront_size); - DeviceInfo.WarpSize[device_id] = wavefront_size; - } else { - // TODO: Burn the wavefront size into the code object - DP("Warning: Unknown wavefront size, assuming 64\n"); - DeviceInfo.WarpSize[device_id] = 64; - } + assert(arg_num == (int)KernelInfoEntry.explicit_argument_count); - // Adjust teams to the env variables + /* + * Set limit based on ThreadsPerGroup and GroupsPerDevice + */ + launchVals LV = + getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env, + KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, + num_teams, // From run_region arg + thread_limit, // From run_region arg + loop_tripcount, // From run_region arg + DeviceInfo.NumTeams[KernelInfo->device_id]); + const int GridSize = LV.GridSize; + const int WorkgroupSize = LV.WorkgroupSize; - if (DeviceInfo.Env.TeamLimit > 0 && - (enforce_upper_bound(&DeviceInfo.GroupsPerDevice[device_id], - DeviceInfo.Env.TeamLimit))) { - 
DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", - DeviceInfo.Env.TeamLimit); + if (print_kernel_trace >= LAUNCH) { + int num_groups = GridSize / WorkgroupSize; + // enum modes are SPMD, GENERIC, NONE 0,1,2 + // if doing rtl timing, print to stderr, unless stdout requested. + bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); + fprintf(traceToStdout ? stdout : stderr, + "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " + "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " + "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", + device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, + arg_num, num_groups, WorkgroupSize, num_teams, thread_limit, + group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count, + vgpr_spill_count, loop_tripcount, KernelInfo->Name); } - // Set default number of teams - if (DeviceInfo.Env.NumTeams > 0) { - DeviceInfo.NumTeams[device_id] = DeviceInfo.Env.NumTeams; - DP("Default number of teams set according to environment %d\n", - DeviceInfo.Env.NumTeams); - } else { - char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); - int TeamsPerCU = DefaultTeamsPerCU; - if (TeamsPerCUEnvStr) { - TeamsPerCU = std::stoi(TeamsPerCUEnvStr); + // Run on the device. + { + hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get(); + if (!queue) { + return OFFLOAD_FAIL; } + uint64_t packet_id = acquire_available_packet_id(queue); - DeviceInfo.NumTeams[device_id] = - TeamsPerCU * DeviceInfo.ComputeUnits[device_id]; - DP("Default number of teams = %d * number of compute units %d\n", - TeamsPerCU, DeviceInfo.ComputeUnits[device_id]); - } - - if (enforce_upper_bound(&DeviceInfo.NumTeams[device_id], - DeviceInfo.GroupsPerDevice[device_id])) { - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceInfo.GroupsPerDevice[device_id]); - } - - // Adjust threads to the env variables - if (DeviceInfo.Env.TeamThreadLimit > 0 && - (enforce_upper_bound(&DeviceInfo.NumThreads[device_id], - DeviceInfo.Env.TeamThreadLimit))) { - DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", - DeviceInfo.Env.TeamThreadLimit); - } + const uint32_t mask = queue->size - 1; // size is a power of 2 + hsa_kernel_dispatch_packet_t *packet = + (hsa_kernel_dispatch_packet_t *)queue->base_address + + (packet_id & mask); - // Set default number of threads - DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size; - DP("Default number of threads set according to library's default %d\n", - RTLDeviceInfoTy::Default_WG_Size); - if (enforce_upper_bound(&DeviceInfo.NumThreads[device_id], - DeviceInfo.ThreadsPerGroup[device_id])) { - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceInfo.ThreadsPerGroup[device_id]); - } + // packet->header is written last + packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + packet->workgroup_size_x = WorkgroupSize; + packet->workgroup_size_y = 1; + packet->workgroup_size_z = 1; + packet->reserved0 = 0; + packet->grid_size_x = GridSize; + packet->grid_size_y = 1; + packet->grid_size_z = 1; + packet->private_segment_size = KernelInfoEntry.private_segment_size; + packet->group_segment_size = KernelInfoEntry.group_segment_size; + packet->kernel_object = KernelInfoEntry.kernel_object; + packet->kernarg_address = 0; // use the block allocator + packet->reserved2 = 0; // impl writes id_ here + packet->completion_signal = {0}; // may want a pool of signals - DP("Device %d: default limit for groupsPerDevice %d & 
threadsPerGroup %d\n", - device_id, DeviceInfo.GroupsPerDevice[device_id], - DeviceInfo.ThreadsPerGroup[device_id]); + KernelArgPool *ArgPool = nullptr; + void *kernarg = nullptr; + { + auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name)); + if (it != KernelArgPoolMap.end()) { + ArgPool = (it->second).get(); + } + } + if (!ArgPool) { + DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, + device_id); + } + { + if (ArgPool) { + assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *))); + kernarg = ArgPool->allocate(arg_num); + } + if (!kernarg) { + DP("Allocate kernarg failed\n"); + return OFFLOAD_FAIL; + } - DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id, - DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id], - DeviceInfo.GroupsPerDevice[device_id], - DeviceInfo.GroupsPerDevice[device_id] * - DeviceInfo.ThreadsPerGroup[device_id]); + // Copy explicit arguments + for (int i = 0; i < arg_num; i++) { + memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *)); + } - return OFFLOAD_SUCCESS; -} + // Initialize implicit arguments. TODO: Which of these can be dropped + impl_implicit_args_t *impl_args = + reinterpret_cast( + static_cast(kernarg) + ArgPool->kernarg_segment_size); + memset(impl_args, 0, + sizeof(impl_implicit_args_t)); // may not be necessary + impl_args->offset_x = 0; + impl_args->offset_y = 0; + impl_args->offset_z = 0; -namespace { -Elf64_Shdr *find_only_SHT_HASH(Elf *elf) { - size_t N; - int rc = elf_getshdrnum(elf, &N); - if (rc != 0) { - return nullptr; - } + // assign a hostcall buffer for the selected Q + if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) { + // hostrpc_assign_buffer is not thread safe, and this function is + // under a multiple reader lock, not a writer lock. + static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER; + pthread_mutex_lock(&hostcall_init_lock); + unsigned long buffer = hostrpc_assign_buffer( + DeviceInfo.HSAAgents[device_id], queue, device_id); + pthread_mutex_unlock(&hostcall_init_lock); + if (!buffer) { + DP("hostrpc_assign_buffer failed, gpu would dereference null and " + "error\n"); + return OFFLOAD_FAIL; + } - Elf64_Shdr *result = nullptr; - for (size_t i = 0; i < N; i++) { - Elf_Scn *scn = elf_getscn(elf, i); - if (scn) { - Elf64_Shdr *shdr = elf64_getshdr(scn); - if (shdr) { - if (shdr->sh_type == SHT_HASH) { - if (result == nullptr) { - result = shdr; + if (KernelInfoEntry.implicit_argument_count >= 4) { + // Initialise pointer for implicit_argument_count != 0 ABI + // Guess that the right implicit argument is at offset 24 after + // the explicit arguments. In the future, should be able to read + // the offset from msgpack. Clang is not annotating it at present. 
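// The "offset 24" guess below is plain kernarg layout arithmetic: the
// hostcall slot is assumed to be the fourth pointer after the explicit
// arguments, i.e. three pointer slots (24 bytes) past them. Standalone
// illustration assuming 64-bit pointers; hostcallSlotOffset is not plugin
// code. The bounds check against kernarg_segment_size below guards the guess.
#include <cstdint>
constexpr uint64_t hostcallSlotOffset(uint32_t ExplicitArgCount) {
  return sizeof(void *) * (ExplicitArgCount + 3); // 24 bytes past explicit args
}
static_assert(hostcallSlotOffset(5) == 64, "5 explicit pointer args -> byte 64");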
+ uint64_t Offset = + sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); + if ((Offset + 8) > (ArgPool->kernarg_segment_size)) { + DP("Bad offset of hostcall, exceeds kernarg segment size\n"); } else { - // multiple SHT_HASH sections not handled - return nullptr; + memcpy(static_cast(kernarg) + Offset, &buffer, 8); } } + + // initialise pointer for implicit_argument_count == 0 ABI + impl_args->hostcall_ptr = buffer; } - } - } - return result; -} -const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash, - const char *symname) { + packet->kernarg_address = kernarg; + } - assert(section_hash); - size_t section_symtab_index = section_hash->sh_link; - Elf64_Shdr *section_symtab = - elf64_getshdr(elf_getscn(elf, section_symtab_index)); - size_t section_strtab_index = section_symtab->sh_link; + hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); + if (s.handle == 0) { + DP("Failed to get signal instance\n"); + return OFFLOAD_FAIL; + } + packet->completion_signal = s; + hsa_signal_store_relaxed(packet->completion_signal, 1); - const Elf64_Sym *symtab = - reinterpret_cast(base + section_symtab->sh_offset); + // Publish the packet indicating it is ready to be processed + core::packet_store_release(reinterpret_cast(packet), + core::create_header(), packet->setup); - const uint32_t *hashtab = - reinterpret_cast(base + section_hash->sh_offset); + // Since the packet is already published, its contents must not be + // accessed any more + hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); - // Layout: - // nbucket - // nchain - // bucket[nbucket] - // chain[nchain] - uint32_t nbucket = hashtab[0]; - const uint32_t *bucket = &hashtab[2]; - const uint32_t *chain = &hashtab[nbucket + 2]; + while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) + ; - const size_t max = strlen(symname) + 1; - const uint32_t hash = elf_hash(symname); - for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) { - char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name); - if (strncmp(symname, n, max) == 0) { - return &symtab[i]; - } + assert(ArgPool); + ArgPool->deallocate(kernarg); + DeviceInfo.FreeSignalPool.push(s); } - return nullptr; + DP("Kernel completed\n"); + return OFFLOAD_SUCCESS; } -struct symbol_info { - void *addr = nullptr; - uint32_t size = UINT32_MAX; - uint32_t sh_type = SHT_NULL; -}; - -int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname, - symbol_info *res) { - if (elf_kind(elf) != ELF_K_ELF) { - return 1; - } - - Elf64_Shdr *section_hash = find_only_SHT_HASH(elf); - if (!section_hash) { - return 1; +bool elf_machine_id_is_amdgcn(__tgt_device_image *image) { + const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h + int32_t r = elf_check_machine(image, amdgcnMachineID); + if (!r) { + DP("Supported machine ID not found\n"); } + return r; +} - const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname); - if (!sym) { - return 1; - } +uint32_t elf_e_flags(__tgt_device_image *image) { + char *img_begin = (char *)image->ImageStart; + size_t img_size = (char *)image->ImageEnd - img_begin; - if (sym->st_size > UINT32_MAX) { - return 1; + Elf *e = elf_memory(img_begin, img_size); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return 0; } - if (sym->st_shndx == SHN_UNDEF) { - return 1; - } + Elf64_Ehdr *eh64 = elf64_getehdr(e); - Elf_Scn *section = elf_getscn(elf, sym->st_shndx); - if (!section) { - return 1; + if (!eh64) { + 
DP("Unable to get machine ID from ELF file!\n"); + elf_end(e); + return 0; } - Elf64_Shdr *header = elf64_getshdr(section); - if (!header) { - return 1; - } + uint32_t Flags = eh64->e_flags; - res->addr = sym->st_value + base; - res->size = static_cast(sym->st_size); - res->sh_type = header->sh_type; - return 0; + elf_end(e); + DP("ELF Flags: 0x%x\n", Flags); + return Flags; } +} // namespace -int get_symbol_info_without_loading(char *base, size_t img_size, - const char *symname, symbol_info *res) { - Elf *elf = elf_memory(base, img_size); - if (elf) { - int rc = get_symbol_info_without_loading(elf, base, symname, res); - elf_end(elf); - return rc; - } - return 1; +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + return elf_machine_id_is_amdgcn(image); } -hsa_status_t interop_get_symbol_info(char *base, size_t img_size, - const char *symname, void **var_addr, - uint32_t *var_size) { - symbol_info si; - int rc = get_symbol_info_without_loading(base, img_size, symname, &si); - if (rc == 0) { - *var_addr = si.addr; - *var_size = si.size; - return HSA_STATUS_SUCCESS; +int __tgt_rtl_number_of_devices() { + // If the construction failed, no methods are safe to call + if (DeviceInfo.ConstructionSucceeded) { + return DeviceInfo.NumberOfDevices; } else { - return HSA_STATUS_ERROR; + DP("AMDGPU plugin construction failed. Zero devices available\n"); + return 0; } } -template -hsa_status_t module_register_from_memory_to_place( - std::map &KernelInfoTable, - std::map &SymbolInfoTable, - void *module_bytes, size_t module_size, int DeviceId, C cb, - std::vector &HSAExecutables) { - auto L = [](void *data, size_t size, void *cb_state) -> hsa_status_t { - C *unwrapped = static_cast(cb_state); - return (*unwrapped)(data, size); - }; - return core::RegisterModuleFromMemory( - KernelInfoTable, SymbolInfoTable, module_bytes, module_size, - DeviceInfo.HSAAgents[DeviceId], L, static_cast(&cb), - HSAExecutables); +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + DP("Init requires flags to %ld\n", RequiresFlags); + DeviceInfo.RequiresFlags = RequiresFlags; + return RequiresFlags; } -} // namespace - -static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) { - uint64_t device_State_bytes = 0; - { - // If this is the deviceRTL, get the state variable size - symbol_info size_si; - int rc = get_symbol_info_without_loading( - ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si); - - if (rc == 0) { - if (size_si.size != sizeof(uint64_t)) { - DP("Found device_State_size variable with wrong size\n"); - return 0; - } - // Read number of bytes directly from the elf - memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t)); - } +namespace { +template bool enforce_upper_bound(T *value, T upper) { + bool changed = *value > upper; + if (changed) { + *value = upper; } - return device_State_bytes; + return changed; } +} // namespace -static __tgt_target_table * -__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); - -static __tgt_target_table * -__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); +int32_t __tgt_rtl_init_device(int device_id) { + hsa_status_t err; -__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, - __tgt_device_image *image) { - DeviceInfo.load_run_lock.lock(); - __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image); - DeviceInfo.load_run_lock.unlock(); - return res; -} + // this is per device id init + DP("Initialize the device id: %d\n", device_id); -struct 
device_environment { - // initialise an DeviceEnvironmentTy in the deviceRTL - // patches around differences in the deviceRTL between trunk, aomp, - // rocmcc. Over time these differences will tend to zero and this class - // simplified. - // Symbol may be in .data or .bss, and may be missing fields, todo: - // review aomp/trunk/rocm and simplify the following + hsa_agent_t agent = DeviceInfo.HSAAgents[device_id]; - // The symbol may also have been deadstripped because the device side - // accessors were unused. + // Get number of Compute Unit + uint32_t compute_units = 0; + err = hsa_agent_get_info( + agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, + &compute_units); + if (err != HSA_STATUS_SUCCESS) { + DeviceInfo.ComputeUnits[device_id] = 1; + DP("Error getting compute units : settiing to 1\n"); + } else { + DeviceInfo.ComputeUnits[device_id] = compute_units; + DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[device_id]); + } - // If the symbol is in .data (aomp, rocm) it can be written directly. - // If it is in .bss, we must wait for it to be allocated space on the - // gpu (trunk) and initialize after loading. - const char *sym() { return "omptarget_device_environment"; } + char GetInfoName[64]; // 64 max size returned by get info + err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, + (void *)GetInfoName); + if (err) + DeviceInfo.GPUName[device_id] = "--unknown gpu--"; + else { + DeviceInfo.GPUName[device_id] = GetInfoName; + } - DeviceEnvironmentTy host_device_env; - symbol_info si; - bool valid = false; + if (print_kernel_trace & STARTUP_DETAILS) + DP("Device#%-2d CU's: %2d %s\n", device_id, + DeviceInfo.ComputeUnits[device_id], + DeviceInfo.GPUName[device_id].c_str()); - __tgt_device_image *image; - const size_t img_size; + // Query attributes to determine number of threads/block and blocks/grid. 
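// Further below in this function the compute-unit count queried here feeds
// the default team count: OMP_TARGET_TEAMS_PER_PROC (or DefaultTeamsPerCU)
// multiplied by the number of CUs, then capped to GroupsPerDevice. A
// standalone model of that arithmetic; defaultTeams is illustrative, not the
// plugin's code path.
#include <cstdlib>
#include <string>
static int defaultTeams(int ComputeUnits, int TeamsPerCUFallback,
                        int GroupsPerDeviceCap) {
  int TeamsPerCU = TeamsPerCUFallback;
  if (const char *Env = std::getenv("OMP_TARGET_TEAMS_PER_PROC"))
    TeamsPerCU = std::stoi(Env); // same environment variable the plugin reads
  int Teams = TeamsPerCU * ComputeUnits;
  return Teams > GroupsPerDeviceCap ? GroupsPerDeviceCap : Teams;
}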
+ uint16_t workgroup_max_dim[3]; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, + &workgroup_max_dim); + if (err != HSA_STATUS_SUCCESS) { + DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams; + DP("Error getting grid dims: num groups : %d\n", + RTLDeviceInfoTy::DefaultNumTeams); + } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) { + DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0]; + DP("Using %d ROCm blocks per grid\n", + DeviceInfo.GroupsPerDevice[device_id]); + } else { + DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit; + DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " + "at the hard limit\n", + workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit); + } - device_environment(int device_id, int number_devices, - __tgt_device_image *image, const size_t img_size) - : image(image), img_size(img_size) { + // Get thread limit + hsa_dim3_t grid_max_dim; + err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim); + if (err == HSA_STATUS_SUCCESS) { + DeviceInfo.ThreadsPerGroup[device_id] = + reinterpret_cast(&grid_max_dim)[0] / + DeviceInfo.GroupsPerDevice[device_id]; - host_device_env.NumDevices = number_devices; - host_device_env.DeviceNum = device_id; - host_device_env.DebugKind = 0; - host_device_env.DynamicMemSize = 0; - if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { - host_device_env.DebugKind = std::stoi(envStr); + if (DeviceInfo.ThreadsPerGroup[device_id] == 0) { + DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; + DP("Default thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); + } else if (enforce_upper_bound(&DeviceInfo.ThreadsPerGroup[device_id], + RTLDeviceInfoTy::Max_WG_Size)) { + DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); + } else { + DP("Using ROCm Queried thread limit: %d\n", + DeviceInfo.ThreadsPerGroup[device_id]); } - - int rc = get_symbol_info_without_loading((char *)image->ImageStart, - img_size, sym(), &si); - if (rc != 0) { - DP("Finding global device environment '%s' - symbol missing.\n", sym()); - return; - } - - if (si.size > sizeof(host_device_env)) { - DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size, - sizeof(host_device_env)); - return; - } - - valid = true; + } else { + DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; + DP("Error getting max block dimension, use default:%d \n", + RTLDeviceInfoTy::Max_WG_Size); } - bool in_image() { return si.sh_type != SHT_NOBITS; } - - hsa_status_t before_loading(void *data, size_t size) { - if (valid) { - if (in_image()) { - DP("Setting global device environment before load (%u bytes)\n", - si.size); - uint64_t offset = (char *)si.addr - (char *)image->ImageStart; - void *pos = (char *)data + offset; - memcpy(pos, &host_device_env, si.size); - } - } - return HSA_STATUS_SUCCESS; + // Get wavefront size + uint32_t wavefront_size = 0; + err = + hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size); + if (err == HSA_STATUS_SUCCESS) { + DP("Queried wavefront size: %d\n", wavefront_size); + DeviceInfo.WarpSize[device_id] = wavefront_size; + } else { + // TODO: Burn the wavefront size into the code object + DP("Warning: Unknown wavefront size, assuming 64\n"); + DeviceInfo.WarpSize[device_id] = 64; } - hsa_status_t after_loading() { - if (valid) { - if (!in_image()) { - DP("Setting global device environment after load (%u bytes)\n", - si.size); - int device_id = 
host_device_env.DeviceNum; - auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; - void *state_ptr; - uint32_t state_ptr_size; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size); - if (err != HSA_STATUS_SUCCESS) { - DP("failed to find %s in loaded image\n", sym()); - return err; - } + // Adjust teams to the env variables - if (state_ptr_size != si.size) { - DP("Symbol had size %u before loading, %u after\n", state_ptr_size, - si.size); - return HSA_STATUS_ERROR; - } + if (DeviceInfo.Env.TeamLimit > 0 && + (enforce_upper_bound(&DeviceInfo.GroupsPerDevice[device_id], + DeviceInfo.Env.TeamLimit))) { + DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", + DeviceInfo.Env.TeamLimit); + } - return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env, - state_ptr_size, device_id); - } + // Set default number of teams + if (DeviceInfo.Env.NumTeams > 0) { + DeviceInfo.NumTeams[device_id] = DeviceInfo.Env.NumTeams; + DP("Default number of teams set according to environment %d\n", + DeviceInfo.Env.NumTeams); + } else { + char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); + int TeamsPerCU = DefaultTeamsPerCU; + if (TeamsPerCUEnvStr) { + TeamsPerCU = std::stoi(TeamsPerCUEnvStr); } - return HSA_STATUS_SUCCESS; - } -}; -static hsa_status_t impl_calloc(void **ret_ptr, size_t size, int DeviceId) { - uint64_t rounded = 4 * ((size + 3) / 4); - void *ptr; - hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); - hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, rounded, 0, &ptr); - if (err != HSA_STATUS_SUCCESS) { - return err; + DeviceInfo.NumTeams[device_id] = + TeamsPerCU * DeviceInfo.ComputeUnits[device_id]; + DP("Default number of teams = %d * number of compute units %d\n", + TeamsPerCU, DeviceInfo.ComputeUnits[device_id]); } - hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4); - if (rc != HSA_STATUS_SUCCESS) { - DP("zero fill device_state failed with %u\n", rc); - core::Runtime::Memfree(ptr); - return HSA_STATUS_ERROR; + if (enforce_upper_bound(&DeviceInfo.NumTeams[device_id], + DeviceInfo.GroupsPerDevice[device_id])) { + DP("Default number of teams exceeds device limit, capping at %d\n", + DeviceInfo.GroupsPerDevice[device_id]); } - *ret_ptr = ptr; - return HSA_STATUS_SUCCESS; -} + // Adjust threads to the env variables + if (DeviceInfo.Env.TeamThreadLimit > 0 && + (enforce_upper_bound(&DeviceInfo.NumThreads[device_id], + DeviceInfo.Env.TeamThreadLimit))) { + DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", + DeviceInfo.Env.TeamThreadLimit); + } -static bool image_contains_symbol(void *data, size_t size, const char *sym) { - symbol_info si; - int rc = get_symbol_info_without_loading((char *)data, size, sym, &si); - return (rc == 0) && (si.addr != nullptr); -} + // Set default number of threads + DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size; + DP("Default number of threads set according to library's default %d\n", + RTLDeviceInfoTy::Default_WG_Size); + if (enforce_upper_bound(&DeviceInfo.NumThreads[device_id], + DeviceInfo.ThreadsPerGroup[device_id])) { + DP("Default number of threads exceeds device limit, capping at %d\n", + DeviceInfo.ThreadsPerGroup[device_id]); + } -__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, - __tgt_device_image *image) { - // This function loads the device image onto gpu[device_id] and does other - // per-image initialization work. 
Specifically: - // - // - Initialize an DeviceEnvironmentTy instance embedded in the - // image at the symbol "omptarget_device_environment" - // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. - // - // - Allocate a large array per-gpu (could be moved to init_device) - // - Read a uint64_t at symbol omptarget_nvptx_device_State_size - // - Allocate at least that many bytes of gpu memory - // - Zero initialize it - // - Write the pointer to the symbol omptarget_nvptx_device_State - // - // - Pulls some per-kernel information together from various sources and - // records it in the KernelsList for quicker access later - // - // The initialization can be done before or after loading the image onto the - // gpu. This function presently does a mixture. Using the hsa api to get/set - // the information is simpler to implement, in exchange for more complicated - // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes - // back from the gpu vs a hashtable lookup on the host. + DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", + device_id, DeviceInfo.GroupsPerDevice[device_id], + DeviceInfo.ThreadsPerGroup[device_id]); - const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart; + DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id, + DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id], + DeviceInfo.GroupsPerDevice[device_id], + DeviceInfo.GroupsPerDevice[device_id] * + DeviceInfo.ThreadsPerGroup[device_id]); - DeviceInfo.clearOffloadEntriesTable(device_id); + return OFFLOAD_SUCCESS; +} - // We do not need to set the ELF version because the caller of this function - // had to do that to decide the right runtime to use +namespace { +Elf64_Shdr *find_only_SHT_HASH(Elf *elf) { + size_t N; + int rc = elf_getshdrnum(elf, &N); + if (rc != 0) { + return nullptr; + } - if (!elf_machine_id_is_amdgcn(image)) { - return NULL; + Elf64_Shdr *result = nullptr; + for (size_t i = 0; i < N; i++) { + Elf_Scn *scn = elf_getscn(elf, i); + if (scn) { + Elf64_Shdr *shdr = elf64_getshdr(scn); + if (shdr) { + if (shdr->sh_type == SHT_HASH) { + if (result == nullptr) { + result = shdr; + } else { + // multiple SHT_HASH sections not handled + return nullptr; + } + } + } + } } + return result; +} - { - auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image, - img_size); +const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash, + const char *symname) { - auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id]; - auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = module_register_from_memory_to_place( - KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, device_id, - [&](void *data, size_t size) { - if (image_contains_symbol(data, size, "needs_hostcall_buffer")) { - __atomic_store_n(&DeviceInfo.hostcall_required, true, - __ATOMIC_RELEASE); - } - return env.before_loading(data, size); - }, - DeviceInfo.HSAExecutables); + assert(section_hash); + size_t section_symtab_index = section_hash->sh_link; + Elf64_Shdr *section_symtab = + elf64_getshdr(elf_getscn(elf, section_symtab_index)); + size_t section_strtab_index = section_symtab->sh_link; - check("Module registering", err); - if (err != HSA_STATUS_SUCCESS) { - const char *DeviceName = DeviceInfo.GPUName[device_id].c_str(); - const char *ElfName = get_elf_mach_gfx_name(elf_e_flags(image)); + const Elf64_Sym *symtab = + reinterpret_cast(base + section_symtab->sh_offset); - if 
(strcmp(DeviceName, ElfName) != 0) { - DP("Possible gpu arch mismatch: device:%s, image:%s please check" - " compiler flag: -march=\n", - DeviceName, ElfName); - } else { - DP("Error loading image onto GPU: %s\n", get_error_string(err)); - } + const uint32_t *hashtab = + reinterpret_cast(base + section_hash->sh_offset); - return NULL; - } + // Layout: + // nbucket + // nchain + // bucket[nbucket] + // chain[nchain] + uint32_t nbucket = hashtab[0]; + const uint32_t *bucket = &hashtab[2]; + const uint32_t *chain = &hashtab[nbucket + 2]; - err = env.after_loading(); - if (err != HSA_STATUS_SUCCESS) { - return NULL; + const size_t max = strlen(symname) + 1; + const uint32_t hash = elf_hash(symname); + for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) { + char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name); + if (strncmp(symname, n, max) == 0) { + return &symtab[i]; } } - DP("AMDGPU module successfully loaded!\n"); + return nullptr; +} - { - // the device_State array is either large value in bss or a void* that - // needs to be assigned to a pointer to an array of size device_state_bytes - // If absent, it has been deadstripped and needs no setup. - - void *state_ptr; - uint32_t state_ptr_size; - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr, - &state_ptr_size); - - if (err != HSA_STATUS_SUCCESS) { - DP("No device_state symbol found, skipping initialization\n"); - } else { - if (state_ptr_size < sizeof(void *)) { - DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size, - sizeof(void *)); - return NULL; - } +struct symbol_info { + void *addr = nullptr; + uint32_t size = UINT32_MAX; + uint32_t sh_type = SHT_NULL; +}; - // if it's larger than a void*, assume it's a bss array and no further - // initialization is required. Only try to set up a pointer for - // sizeof(void*) - if (state_ptr_size == sizeof(void *)) { - uint64_t device_State_bytes = - get_device_State_bytes((char *)image->ImageStart, img_size); - if (device_State_bytes == 0) { - DP("Can't initialize device_State, missing size information\n"); - return NULL; - } +int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname, + symbol_info *res) { + if (elf_kind(elf) != ELF_K_ELF) { + return 1; + } - auto &dss = DeviceInfo.deviceStateStore[device_id]; - if (dss.first.get() == nullptr) { - assert(dss.second == 0); - void *ptr = NULL; - hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id); - if (err != HSA_STATUS_SUCCESS) { - DP("Failed to allocate device_state array\n"); - return NULL; - } - dss = { - std::unique_ptr{ptr}, - device_State_bytes, - }; - } + Elf64_Shdr *section_hash = find_only_SHT_HASH(elf); + if (!section_hash) { + return 1; + } - void *ptr = dss.first.get(); - if (device_State_bytes != dss.second) { - DP("Inconsistent sizes of device_State unsupported\n"); - return NULL; - } + const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname); + if (!sym) { + return 1; + } - // write ptr to device memory so it can be used by later kernels - err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, - sizeof(void *), device_id); - if (err != HSA_STATUS_SUCCESS) { - DP("memcpy install of state_ptr failed\n"); - return NULL; - } - } - } + if (sym->st_size > UINT32_MAX) { + return 1; } - // Here, we take advantage of the data that is appended after img_end to get - // the symbols' name we need to load. 
This data consist of the host entries - // begin and end as well as the target name (see the offloading linker script - // creation in clang compiler). + if (sym->st_shndx == SHN_UNDEF) { + return 1; + } - // Find the symbols in the module by name. The name can be obtain by - // concatenating the host entry name with the target name + Elf_Scn *section = elf_getscn(elf, sym->st_shndx); + if (!section) { + return 1; + } - __tgt_offload_entry *HostBegin = image->EntriesBegin; - __tgt_offload_entry *HostEnd = image->EntriesEnd; + Elf64_Shdr *header = elf64_getshdr(section); + if (!header) { + return 1; + } - for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { + res->addr = sym->st_value + base; + res->size = static_cast(sym->st_size); + res->sh_type = header->sh_type; + return 0; +} - if (!e->addr) { - // The host should have always something in the address to - // uniquely identify the target region. - DP("Analyzing host entry '' (size = %lld)...\n", - (unsigned long long)e->size); - return NULL; - } +int get_symbol_info_without_loading(char *base, size_t img_size, + const char *symname, symbol_info *res) { + Elf *elf = elf_memory(base, img_size); + if (elf) { + int rc = get_symbol_info_without_loading(elf, base, symname, res); + elf_end(elf); + return rc; + } + return 1; +} - if (e->size) { - __tgt_offload_entry entry = *e; +hsa_status_t interop_get_symbol_info(char *base, size_t img_size, + const char *symname, void **var_addr, + uint32_t *var_size) { + symbol_info si; + int rc = get_symbol_info_without_loading(base, img_size, symname, &si); + if (rc == 0) { + *var_addr = si.addr; + *var_size = si.size; + return HSA_STATUS_SUCCESS; + } else { + return HSA_STATUS_ERROR; + } +} - void *varptr; - uint32_t varsize; +template +hsa_status_t module_register_from_memory_to_place( + std::map &KernelInfoTable, + std::map &SymbolInfoTable, + void *module_bytes, size_t module_size, int DeviceId, C cb, + std::vector &HSAExecutables) { + auto L = [](void *data, size_t size, void *cb_state) -> hsa_status_t { + C *unwrapped = static_cast(cb_state); + return (*unwrapped)(data, size); + }; + return core::RegisterModuleFromMemory( + KernelInfoTable, SymbolInfoTable, module_bytes, module_size, + DeviceInfo.HSAAgents[DeviceId], L, static_cast(&cb), + HSAExecutables); +} +} // namespace - auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = interop_hsa_get_symbol_info( - SymbolInfoMap, device_id, e->name, &varptr, &varsize); +static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) { + uint64_t device_State_bytes = 0; + { + // If this is the deviceRTL, get the state variable size + symbol_info size_si; + int rc = get_symbol_info_without_loading( + ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si); - if (err != HSA_STATUS_SUCCESS) { - // Inform the user what symbol prevented offloading - DP("Loading global '%s' (Failed)\n", e->name); - return NULL; + if (rc == 0) { + if (size_si.size != sizeof(uint64_t)) { + DP("Found device_State_size variable with wrong size\n"); + return 0; } - if (varsize != e->size) { - DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name, - varsize, e->size); - return NULL; - } + // Read number of bytes directly from the elf + memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t)); + } + } + return device_State_bytes; +} - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(e - HostBegin), e->name, DPxPTR(varptr)); - entry.addr = (void *)varptr; +static __tgt_target_table * 
+__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); - DeviceInfo.addOffloadEntry(device_id, entry); +static __tgt_target_table * +__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); - if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - e->flags & OMP_DECLARE_TARGET_LINK) { - // If unified memory is present any target link variables - // can access host addresses directly. There is no longer a - // need for device copies. - err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, - sizeof(void *), device_id); - if (err != HSA_STATUS_SUCCESS) - DP("Error when copying USM\n"); - DP("Copy linked variable host address (" DPxMOD ")" - "to device address (" DPxMOD ")\n", - DPxPTR(*((void **)e->addr)), DPxPTR(varptr)); - } +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + DeviceInfo.load_run_lock.lock(); + __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image); + DeviceInfo.load_run_lock.unlock(); + return res; +} - continue; - } +struct device_environment { + // initialise an DeviceEnvironmentTy in the deviceRTL + // patches around differences in the deviceRTL between trunk, aomp, + // rocmcc. Over time these differences will tend to zero and this class + // simplified. + // Symbol may be in .data or .bss, and may be missing fields, todo: + // review aomp/trunk/rocm and simplify the following - DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); + // The symbol may also have been deadstripped because the device side + // accessors were unused. - // errors in kernarg_segment_size previously treated as = 0 (or as undef) - uint32_t kernarg_segment_size = 0; - auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id]; - hsa_status_t err = HSA_STATUS_SUCCESS; - if (!e->name) { - err = HSA_STATUS_ERROR; - } else { - std::string kernelStr = std::string(e->name); - auto It = KernelInfoMap.find(kernelStr); - if (It != KernelInfoMap.end()) { - atl_kernel_info_t info = It->second; - kernarg_segment_size = info.kernel_segment_size; - } else { - err = HSA_STATUS_ERROR; - } - } + // If the symbol is in .data (aomp, rocm) it can be written directly. + // If it is in .bss, we must wait for it to be allocated space on the + // gpu (trunk) and initialize after loading. + const char *sym() { return "omptarget_device_environment"; } - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal = - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; + DeviceEnvironmentTy host_device_env; + symbol_info si; + bool valid = false; - // get flat group size if present, else Default_WG_Size - int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + __tgt_device_image *image; + const size_t img_size; - // get Kernel Descriptor if present. 
- // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp - struct KernDescValType { - uint16_t Version; - uint16_t TSize; - uint16_t WG_Size; - }; - struct KernDescValType KernDescVal; - std::string KernDescNameStr(e->name); - KernDescNameStr += "_kern_desc"; - const char *KernDescName = KernDescNameStr.c_str(); + device_environment(int device_id, int number_devices, + __tgt_device_image *image, const size_t img_size) + : image(image), img_size(img_size) { - void *KernDescPtr; - uint32_t KernDescSize; - void *CallStackAddr = nullptr; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - KernDescName, &KernDescPtr, &KernDescSize); + host_device_env.NumDevices = number_devices; + host_device_env.DeviceNum = device_id; + host_device_env.DebugKind = 0; + host_device_env.DynamicMemSize = 0; + if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { + host_device_env.DebugKind = std::stoi(envStr); + } - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)KernDescSize != sizeof(KernDescVal)) - DP("Loading global computation properties '%s' - size mismatch (%u != " - "%lu)\n", - KernDescName, KernDescSize, sizeof(KernDescVal)); + int rc = get_symbol_info_without_loading((char *)image->ImageStart, + img_size, sym(), &si); + if (rc != 0) { + DP("Finding global device environment '%s' - symbol missing.\n", sym()); + return; + } - memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); + if (si.size > sizeof(host_device_env)) { + DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size, + sizeof(host_device_env)); + return; + } - // Check structure size against recorded size. - if ((size_t)KernDescSize != KernDescVal.TSize) - DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", - sizeof(KernDescVal), KernDescVal.TSize, KernDescName); + valid = true; + } - DP("After loading global for %s KernDesc \n", KernDescName); - DP("KernDesc: Version: %d\n", KernDescVal.Version); - DP("KernDesc: TSize: %d\n", KernDescVal.TSize); - DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); + bool in_image() { return si.sh_type != SHT_NOBITS; } - if (KernDescVal.WG_Size == 0) { - KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size; - DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size); + hsa_status_t before_loading(void *data, size_t size) { + if (valid) { + if (in_image()) { + DP("Setting global device environment before load (%u bytes)\n", + si.size); + uint64_t offset = (char *)si.addr - (char *)image->ImageStart; + void *pos = (char *)data + offset; + memcpy(pos, &host_device_env, si.size); } - WGSizeVal = KernDescVal.WG_Size; - DP("WGSizeVal %d\n", WGSizeVal); - check("Loading KernDesc computation property", err); - } else { - DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); - - // Flat group size - std::string WGSizeNameStr(e->name); - WGSizeNameStr += "_wg_size"; - const char *WGSizeName = WGSizeNameStr.c_str(); - - void *WGSizePtr; - uint32_t WGSize; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - WGSizeName, &WGSizePtr, &WGSize); + } + return HSA_STATUS_SUCCESS; + } - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)WGSize != sizeof(int16_t)) { - DP("Loading global computation properties '%s' - size mismatch (%u " - "!= " - "%lu)\n", - WGSizeName, WGSize, sizeof(int16_t)); - return NULL; + hsa_status_t after_loading() { + if (valid) { + if (!in_image()) { + DP("Setting global device environment after load (%u bytes)\n", + si.size); + int device_id = host_device_env.DeviceNum; + 
auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; + void *state_ptr; + uint32_t state_ptr_size; + hsa_status_t err = interop_hsa_get_symbol_info( + SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size); + if (err != HSA_STATUS_SUCCESS) { + DP("failed to find %s in loaded image\n", sym()); + return err; } - memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); - - DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); - - if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size || - WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) { - DP("Error wrong WGSize value specified in HSA code object file: " - "%d\n", - WGSizeVal); - WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + if (state_ptr_size != si.size) { + DP("Symbol had size %u before loading, %u after\n", state_ptr_size, + si.size); + return HSA_STATUS_ERROR; } - } else { - DP("Warning: Loading WGSize '%s' - symbol not found, " - "using default value %d\n", - WGSizeName, WGSizeVal); - } - - check("Loading WGSize computation property", err); - } - // Read execution mode from global in binary - std::string ExecModeNameStr(e->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - void *ExecModePtr; - uint32_t varsize; - err = interop_get_symbol_info((char *)image->ImageStart, img_size, - ExecModeName, &ExecModePtr, &varsize); - - if (err == HSA_STATUS_SUCCESS) { - if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global computation properties '%s' - size mismatch(%u != " - "%lu)\n", - ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return NULL; - } - - memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize); - - DP("After loading global for %s ExecMode = %d\n", ExecModeName, - ExecModeVal); - - if (ExecModeVal < 0 || - ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { - DP("Error wrong exec_mode value specified in HSA code object file: " - "%d\n", - ExecModeVal); - return NULL; + return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env, + state_ptr_size, device_id); } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value " - "GENERIC (1)\n", - ExecModeName); } - check("Loading computation property", err); - - KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, - CallStackAddr, e->name, kernarg_segment_size, - DeviceInfo.KernArgPool)); - __tgt_offload_entry entry = *e; - entry.addr = (void *)&KernelsList.back(); - DeviceInfo.addOffloadEntry(device_id, entry); - DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); + return HSA_STATUS_SUCCESS; } +}; - return DeviceInfo.getOffloadEntriesTable(device_id); -} - -void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) { - void *ptr = NULL; - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); +static hsa_status_t impl_calloc(void **ret_ptr, size_t size, int DeviceId) { + uint64_t rounded = 4 * ((size + 3) / 4); + void *ptr; + hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); + hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, rounded, 0, &ptr); + if (err != HSA_STATUS_SUCCESS) { + return err; + } - if (kind != TARGET_ALLOC_DEFAULT) { - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - return NULL; + hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4); + if (rc != HSA_STATUS_SUCCESS) { + DP("zero fill device_state failed with %u\n", rc); + core::Runtime::Memfree(ptr); + return 
HSA_STATUS_ERROR; } - hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(device_id); - hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, &ptr); - DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size, - (long long unsigned)(Elf64_Addr)ptr); - ptr = (err == HSA_STATUS_SUCCESS) ? ptr : NULL; - return ptr; + *ret_ptr = ptr; + return HSA_STATUS_SUCCESS; } -int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(device_id, &AsyncInfo); +static bool image_contains_symbol(void *data, size_t size, const char *sym) { + symbol_info si; + int rc = get_symbol_info_without_loading((char *)data, size, sym, &si); + return (rc == 0) && (si.addr != nullptr); } -int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, - int64_t size, __tgt_async_info *AsyncInfo) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - if (AsyncInfo) { - initAsyncInfo(AsyncInfo); - return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo); - } else { - return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size); - } -} +__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, + __tgt_device_image *image) { + // This function loads the device image onto gpu[device_id] and does other + // per-image initialization work. Specifically: + // + // - Initialize an DeviceEnvironmentTy instance embedded in the + // image at the symbol "omptarget_device_environment" + // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. + // + // - Allocate a large array per-gpu (could be moved to init_device) + // - Read a uint64_t at symbol omptarget_nvptx_device_State_size + // - Allocate at least that many bytes of gpu memory + // - Zero initialize it + // - Write the pointer to the symbol omptarget_nvptx_device_State + // + // - Pulls some per-kernel information together from various sources and + // records it in the KernelsList for quicker access later + // + // The initialization can be done before or after loading the image onto the + // gpu. This function presently does a mixture. Using the hsa api to get/set + // the information is simpler to implement, in exchange for more complicated + // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes + // back from the gpu vs a hashtable lookup on the host. 
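// The device_environment helper above encodes the pre/post-load choice this
// comment block describes: when the "omptarget_device_environment" symbol
// has image bytes (.data, sh_type != SHT_NOBITS) it is patched in the host
// copy before loading; when it is .bss it is written to GPU memory after
// loading via freesignalpool_memcpy_h2d. Minimal sketch of the predicate;
// canPatchBeforeLoading is illustrative only.
#include <cstdint>
#include <elf.h>
static bool canPatchBeforeLoading(uint32_t ShType) {
  // SHT_NOBITS (.bss) sections contribute no bytes to the image to patch.
  return ShType != SHT_NOBITS;
}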
-int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; + const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart; - return __tgt_rtl_synchronize(device_id, &AsyncInfo); -} + DeviceInfo.clearOffloadEntriesTable(device_id); -int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr, - void *tgt_ptr, int64_t size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - initAsyncInfo(AsyncInfo); - return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo); -} + // We do not need to set the ELF version because the caller of this function + // had to do that to decide the right runtime to use -int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { - assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - hsa_status_t err; - DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr); - err = core::Runtime::Memfree(tgt_ptr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error when freeing CUDA memory\n"); - return OFFLOAD_FAIL; + if (!elf_machine_id_is_amdgcn(image)) { + return NULL; } - return OFFLOAD_SUCCESS; -} -// Determine launch values for kernel. -struct launchVals { - int WorkgroupSize; - int GridSize; -}; -launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, - int ConstWGSize, - llvm::omp::OMPTgtExecModeFlags ExecutionMode, - int num_teams, int thread_limit, - uint64_t loop_tripcount, int DeviceNumTeams) { + { + auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image, + img_size); - int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size; - int num_groups = 0; + auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id]; + auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; + hsa_status_t err = module_register_from_memory_to_place( + KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, device_id, + [&](void *data, size_t size) { + if (image_contains_symbol(data, size, "needs_hostcall_buffer")) { + __atomic_store_n(&DeviceInfo.hostcall_required, true, + __ATOMIC_RELEASE); + } + return env.before_loading(data, size); + }, + DeviceInfo.HSAExecutables); - int Max_Teams = - Env.MaxTeamsDefault > 0 ? 
Env.MaxTeamsDefault : DeviceNumTeams;
-  if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit)
-    Max_Teams = RTLDeviceInfoTy::HardTeamLimit;
+    check("Module registering", err);
+    if (err != HSA_STATUS_SUCCESS) {
+      const char *DeviceName = DeviceInfo.GPUName[device_id].c_str();
+      const char *ElfName = get_elf_mach_gfx_name(elf_e_flags(image));
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams);
-    DP("Max_Teams: %d\n", Max_Teams);
-    DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
-    DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size);
-    DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
-       RTLDeviceInfoTy::Default_WG_Size);
-    DP("thread_limit: %d\n", thread_limit);
-    DP("threadsPerGroup: %d\n", threadsPerGroup);
-    DP("ConstWGSize: %d\n", ConstWGSize);
-  }
-  // check for thread_limit() clause
-  if (thread_limit > 0) {
-    threadsPerGroup = thread_limit;
-    DP("Setting threads per block to requested %d\n", thread_limit);
-    // Add master warp for GENERIC
-    if (ExecutionMode ==
-        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
-      threadsPerGroup += WarpSize;
-      DP("Adding master wavefront: +%d threads\n", WarpSize);
+      if (strcmp(DeviceName, ElfName) != 0) {
+        DP("Possible gpu arch mismatch: device:%s, image:%s please check"
+           " compiler flag: -march=<gpu>\n",
+           DeviceName, ElfName);
+      } else {
+        DP("Error loading image onto GPU: %s\n", get_error_string(err));
+      }
+
+      return NULL;
     }
-    if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
-      threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
-      DP("Setting threads per block to maximum %d\n", threadsPerGroup);
+
+    err = env.after_loading();
+    if (err != HSA_STATUS_SUCCESS) {
+      return NULL;
     }
   }
-  // check flat_max_work_group_size attr here
-  if (threadsPerGroup > ConstWGSize) {
-    threadsPerGroup = ConstWGSize;
-    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
-       threadsPerGroup);
-  }
-  if (print_kernel_trace & STARTUP_DETAILS)
-    DP("threadsPerGroup: %d\n", threadsPerGroup);
-  DP("Preparing %d threads\n", threadsPerGroup);
-  // Set default num_groups (teams)
-  if (Env.TeamLimit > 0)
-    num_groups = (Max_Teams < Env.TeamLimit) ? Max_Teams : Env.TeamLimit;
-  else
-    num_groups = Max_Teams;
-  DP("Set default num of groups %d\n", num_groups);
+  DP("AMDGPU module successfully loaded!\n");
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("num_groups: %d\n", num_groups);
-    DP("num_teams: %d\n", num_teams);
-  }
+  {
+    // the device_State array is either a large value in bss or a void* that
+    // needs to be assigned to a pointer to an array of size device_state_bytes
+    // If absent, it has been deadstripped and needs no setup.
-  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
-  // This reduction is typical for default case (no thread_limit clause).
-  // or when user goes crazy with num_teams clause.
-  // FIXME: We cant distinguish between a constant or variable thread limit.
-  // So we only handle constant thread_limits.
-  if (threadsPerGroup >
-      RTLDeviceInfoTy::Default_WG_Size) //  256 < threadsPerGroup <= 1024
-    // Should we round threadsPerGroup up to nearest WarpSize
-    // here?
- num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup; + void *state_ptr; + uint32_t state_ptr_size; + auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; + hsa_status_t err = interop_hsa_get_symbol_info( + SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr, + &state_ptr_size); - // check for num_teams() clause - if (num_teams > 0) { - num_groups = (num_teams < num_groups) ? num_teams : num_groups; - } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("num_groups: %d\n", num_groups); - DP("Env.NumTeams %d\n", Env.NumTeams); - DP("Env.TeamLimit %d\n", Env.TeamLimit); - } + if (err != HSA_STATUS_SUCCESS) { + DP("No device_state symbol found, skipping initialization\n"); + } else { + if (state_ptr_size < sizeof(void *)) { + DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size, + sizeof(void *)); + return NULL; + } - if (Env.NumTeams > 0) { - num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups; - DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); - } else if (Env.TeamLimit > 0) { - num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups; - DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); - } else { - if (num_teams <= 0) { - if (loop_tripcount > 0) { - if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { - // round up to the nearest integer - num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1; - } else if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { - num_groups = loop_tripcount; - } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { - // This is a generic kernel that was transformed to use SPMD-mode - // execution but uses Generic-mode semantics for scheduling. - num_groups = loop_tripcount; + // if it's larger than a void*, assume it's a bss array and no further + // initialization is required. 
Only try to set up a pointer for
+      // sizeof(void*)
+      if (state_ptr_size == sizeof(void *)) {
+        uint64_t device_State_bytes =
+            get_device_State_bytes((char *)image->ImageStart, img_size);
+        if (device_State_bytes == 0) {
+          DP("Can't initialize device_State, missing size information\n");
+          return NULL;
+        }
+
+        auto &dss = DeviceInfo.deviceStateStore[device_id];
+        if (dss.first.get() == nullptr) {
+          assert(dss.second == 0);
+          void *ptr = NULL;
+          hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id);
+          if (err != HSA_STATUS_SUCCESS) {
+            DP("Failed to allocate device_state array\n");
+            return NULL;
+          }
+          dss = {
+              std::unique_ptr{ptr},
+              device_State_bytes,
+          };
+        }
+
+        void *ptr = dss.first.get();
+        if (device_State_bytes != dss.second) {
+          DP("Inconsistent sizes of device_State unsupported\n");
+          return NULL;
+        }
+
+        // write ptr to device memory so it can be used by later kernels
+        err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
+                                                   sizeof(void *), device_id);
+        if (err != HSA_STATUS_SUCCESS) {
+          DP("memcpy install of state_ptr failed\n");
+          return NULL;
+        }
-        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
-           "threads per block %d\n",
-           num_groups, loop_tripcount, threadsPerGroup);
      }
-    } else {
-      num_groups = num_teams;
-    }
-    if (num_groups > Max_Teams) {
-      num_groups = Max_Teams;
-      if (print_kernel_trace & STARTUP_DETAILS)
-        DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams);
-    }
-    if (num_groups > num_teams && num_teams > 0) {
-      num_groups = num_teams;
-      if (print_kernel_trace & STARTUP_DETAILS)
-        DP("Limiting num_groups %d to clause num_teams %d \n", num_groups,
-           num_teams);
-    }
  }
-  // num_teams clause always honored, no matter what, unless DEFAULT is active.
-  if (num_teams > 0) {
-    num_groups = num_teams;
-    // Cap num_groups to EnvMaxTeamsDefault if set.
-    if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault)
-      num_groups = Env.MaxTeamsDefault;
-  }
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("threadsPerGroup: %d\n", threadsPerGroup);
-    DP("num_groups: %d\n", num_groups);
-    DP("loop_tripcount: %ld\n", loop_tripcount);
-  }
-  DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
-     threadsPerGroup);
+  // Here, we take advantage of the data that is appended after img_end to get
+  // the symbol names we need to load. This data consists of the host entries
+  // begin and end as well as the target name (see the offloading linker script
+  // creation in clang compiler).
-  launchVals res;
-  res.WorkgroupSize = threadsPerGroup;
-  res.GridSize = threadsPerGroup * num_groups;
-  return res;
-}
+  // Find the symbols in the module by name. The name can be obtained by
+  // concatenating the host entry name with the target name
-static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
-  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
-  bool full = true;
-  while (full) {
-    full =
-        packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
-  }
-  return packet_id;
-}
+  __tgt_offload_entry *HostBegin = image->EntriesBegin;
+  __tgt_offload_entry *HostEnd = image->EntriesEnd;
-static int32_t __tgt_rtl_run_target_team_region_locked(
-    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
-    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
-    int32_t thread_limit, uint64_t loop_tripcount);
+  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
-int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
-                                         void **tgt_args,
-                                         ptrdiff_t *tgt_offsets,
-                                         int32_t arg_num, int32_t num_teams,
-                                         int32_t thread_limit,
-                                         uint64_t loop_tripcount) {
+    if (!e->addr) {
+      // The host should always have something in the address to
+      // uniquely identify the target region.
+      DP("Analyzing host entry '<null>' (size = %lld)...\n",
+         (unsigned long long)e->size);
+      return NULL;
+    }
-  DeviceInfo.load_run_lock.lock_shared();
-  int32_t res = __tgt_rtl_run_target_team_region_locked(
-      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
-      thread_limit, loop_tripcount);
+    if (e->size) {
+      __tgt_offload_entry entry = *e;
-  DeviceInfo.load_run_lock.unlock_shared();
-  return res;
-}
+      void *varptr;
+      uint32_t varsize;
-int32_t __tgt_rtl_run_target_team_region_locked(
-    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
-    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
-    int32_t thread_limit, uint64_t loop_tripcount) {
-  // Set the context we are using
-  // update thread limit content in gpu memory if un-initialized or specified
-  // from host
+      auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
+      hsa_status_t err = interop_hsa_get_symbol_info(
+          SymbolInfoMap, device_id, e->name, &varptr, &varsize);
-  DP("Run target team region thread_limit %d\n", thread_limit);
+      if (err != HSA_STATUS_SUCCESS) {
+        // Inform the user what symbol prevented offloading
+        DP("Loading global '%s' (Failed)\n", e->name);
+        return NULL;
+      }
-  // All args are references.
-  std::vector<void *> args(arg_num);
-  std::vector<void *> ptrs(arg_num);
+      if (varsize != e->size) {
+        DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
+           varsize, e->size);
+        return NULL;
+      }
-  DP("Arg_num: %d\n", arg_num);
-  for (int32_t i = 0; i < arg_num; ++i) {
-    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
-    args[i] = &ptrs[i];
-    DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
-  }
+      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+         DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
+      entry.addr = (void *)varptr;
-  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+      DeviceInfo.addOffloadEntry(device_id, entry);
-  std::string kernel_name = std::string(KernelInfo->Name);
-  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
-  if (KernelInfoTable[device_id].find(kernel_name) ==
-      KernelInfoTable[device_id].end()) {
-    DP("Kernel %s not found\n", kernel_name.c_str());
-    return OFFLOAD_FAIL;
-  }
+      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+          e->flags & OMP_DECLARE_TARGET_LINK) {
+        // If unified memory is present any target link variables
+        // can access host addresses directly. There is no longer a
+        // need for device copies.
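+        // In effect (illustrative only): the pointer value stored at the
+        // host entry, *(void **)e->addr, is written over the device copy at
+        // varptr via the sizeof(void *) host-to-device copy below, so the
+        // device-side global ends up holding the host address.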
+ err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, + sizeof(void *), device_id); + if (err != HSA_STATUS_SUCCESS) + DP("Error when copying USM\n"); + DP("Copy linked variable host address (" DPxMOD ")" + "to device address (" DPxMOD ")\n", + DPxPTR(*((void **)e->addr)), DPxPTR(varptr)); + } - const atl_kernel_info_t KernelInfoEntry = - KernelInfoTable[device_id][kernel_name]; - const uint32_t group_segment_size = KernelInfoEntry.group_segment_size; - const uint32_t sgpr_count = KernelInfoEntry.sgpr_count; - const uint32_t vgpr_count = KernelInfoEntry.vgpr_count; - const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count; - const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count; + continue; + } - assert(arg_num == (int)KernelInfoEntry.explicit_argument_count); + DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); - /* - * Set limit based on ThreadsPerGroup and GroupsPerDevice - */ - launchVals LV = - getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env, - KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, - num_teams, // From run_region arg - thread_limit, // From run_region arg - loop_tripcount, // From run_region arg - DeviceInfo.NumTeams[KernelInfo->device_id]); - const int GridSize = LV.GridSize; - const int WorkgroupSize = LV.WorkgroupSize; + // errors in kernarg_segment_size previously treated as = 0 (or as undef) + uint32_t kernarg_segment_size = 0; + auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id]; + hsa_status_t err = HSA_STATUS_SUCCESS; + if (!e->name) { + err = HSA_STATUS_ERROR; + } else { + std::string kernelStr = std::string(e->name); + auto It = KernelInfoMap.find(kernelStr); + if (It != KernelInfoMap.end()) { + atl_kernel_info_t info = It->second; + kernarg_segment_size = info.kernel_segment_size; + } else { + err = HSA_STATUS_ERROR; + } + } - if (print_kernel_trace >= LAUNCH) { - int num_groups = GridSize / WorkgroupSize; - // enum modes are SPMD, GENERIC, NONE 0,1,2 - // if doing rtl timing, print to stderr, unless stdout requested. - bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); - fprintf(traceToStdout ? stdout : stderr, - "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " - "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " - "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", - device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, - arg_num, num_groups, WorkgroupSize, num_teams, thread_limit, - group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count, - vgpr_spill_count, loop_tripcount, KernelInfo->Name); - } + // default value GENERIC (in case symbol is missing from cubin file) + llvm::omp::OMPTgtExecModeFlags ExecModeVal = + llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; - // Run on the device. - { - hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get(); - if (!queue) { - return OFFLOAD_FAIL; - } - uint64_t packet_id = acquire_available_packet_id(queue); + // get flat group size if present, else Default_WG_Size + int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + + // get Kernel Descriptor if present. 
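+      // Naming convention for the per-kernel globals probed below: the entry
+      // name plus a fixed suffix. For a (hypothetical) kernel entry named
+      // "__omp_offloading_fd00_1234_foo_l10", the image would be searched for
+      // "__omp_offloading_fd00_1234_foo_l10_kern_desc", "..._wg_size" and
+      // "..._exec_mode".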
+ // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp + struct KernDescValType { + uint16_t Version; + uint16_t TSize; + uint16_t WG_Size; + }; + struct KernDescValType KernDescVal; + std::string KernDescNameStr(e->name); + KernDescNameStr += "_kern_desc"; + const char *KernDescName = KernDescNameStr.c_str(); + + void *KernDescPtr; + uint32_t KernDescSize; + void *CallStackAddr = nullptr; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + KernDescName, &KernDescPtr, &KernDescSize); + + if (err == HSA_STATUS_SUCCESS) { + if ((size_t)KernDescSize != sizeof(KernDescVal)) + DP("Loading global computation properties '%s' - size mismatch (%u != " + "%lu)\n", + KernDescName, KernDescSize, sizeof(KernDescVal)); + + memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); + + // Check structure size against recorded size. + if ((size_t)KernDescSize != KernDescVal.TSize) + DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", + sizeof(KernDescVal), KernDescVal.TSize, KernDescName); + + DP("After loading global for %s KernDesc \n", KernDescName); + DP("KernDesc: Version: %d\n", KernDescVal.Version); + DP("KernDesc: TSize: %d\n", KernDescVal.TSize); + DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); + + if (KernDescVal.WG_Size == 0) { + KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size; + DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size); + } + WGSizeVal = KernDescVal.WG_Size; + DP("WGSizeVal %d\n", WGSizeVal); + check("Loading KernDesc computation property", err); + } else { + DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); + + // Flat group size + std::string WGSizeNameStr(e->name); + WGSizeNameStr += "_wg_size"; + const char *WGSizeName = WGSizeNameStr.c_str(); + + void *WGSizePtr; + uint32_t WGSize; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + WGSizeName, &WGSizePtr, &WGSize); + + if (err == HSA_STATUS_SUCCESS) { + if ((size_t)WGSize != sizeof(int16_t)) { + DP("Loading global computation properties '%s' - size mismatch (%u " + "!= " + "%lu)\n", + WGSizeName, WGSize, sizeof(int16_t)); + return NULL; + } - const uint32_t mask = queue->size - 1; // size is a power of 2 - hsa_kernel_dispatch_packet_t *packet = - (hsa_kernel_dispatch_packet_t *)queue->base_address + - (packet_id & mask); + memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); - // packet->header is written last - packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet->workgroup_size_x = WorkgroupSize; - packet->workgroup_size_y = 1; - packet->workgroup_size_z = 1; - packet->reserved0 = 0; - packet->grid_size_x = GridSize; - packet->grid_size_y = 1; - packet->grid_size_z = 1; - packet->private_segment_size = KernelInfoEntry.private_segment_size; - packet->group_segment_size = KernelInfoEntry.group_segment_size; - packet->kernel_object = KernelInfoEntry.kernel_object; - packet->kernarg_address = 0; // use the block allocator - packet->reserved2 = 0; // impl writes id_ here - packet->completion_signal = {0}; // may want a pool of signals + DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); - KernelArgPool *ArgPool = nullptr; - void *kernarg = nullptr; - { - auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name)); - if (it != KernelArgPoolMap.end()) { - ArgPool = (it->second).get(); + if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size || + WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) { + DP("Error wrong WGSize value specified in HSA code object file: " + 
"%d\n", + WGSizeVal); + WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; + } + } else { + DP("Warning: Loading WGSize '%s' - symbol not found, " + "using default value %d\n", + WGSizeName, WGSizeVal); } + + check("Loading WGSize computation property", err); } - if (!ArgPool) { - DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, - device_id); - } - { - if (ArgPool) { - assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *))); - kernarg = ArgPool->allocate(arg_num); - } - if (!kernarg) { - DP("Allocate kernarg failed\n"); - return OFFLOAD_FAIL; - } - // Copy explicit arguments - for (int i = 0; i < arg_num; i++) { - memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *)); - } + // Read execution mode from global in binary + std::string ExecModeNameStr(e->name); + ExecModeNameStr += "_exec_mode"; + const char *ExecModeName = ExecModeNameStr.c_str(); - // Initialize implicit arguments. TODO: Which of these can be dropped - impl_implicit_args_t *impl_args = - reinterpret_cast( - static_cast(kernarg) + ArgPool->kernarg_segment_size); - memset(impl_args, 0, - sizeof(impl_implicit_args_t)); // may not be necessary - impl_args->offset_x = 0; - impl_args->offset_y = 0; - impl_args->offset_z = 0; + void *ExecModePtr; + uint32_t varsize; + err = interop_get_symbol_info((char *)image->ImageStart, img_size, + ExecModeName, &ExecModePtr, &varsize); - // assign a hostcall buffer for the selected Q - if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) { - // hostrpc_assign_buffer is not thread safe, and this function is - // under a multiple reader lock, not a writer lock. - static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&hostcall_init_lock); - unsigned long buffer = hostrpc_assign_buffer( - DeviceInfo.HSAAgents[device_id], queue, device_id); - pthread_mutex_unlock(&hostcall_init_lock); - if (!buffer) { - DP("hostrpc_assign_buffer failed, gpu would dereference null and " - "error\n"); - return OFFLOAD_FAIL; - } + if (err == HSA_STATUS_SUCCESS) { + if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { + DP("Loading global computation properties '%s' - size mismatch(%u != " + "%lu)\n", + ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags)); + return NULL; + } - if (KernelInfoEntry.implicit_argument_count >= 4) { - // Initialise pointer for implicit_argument_count != 0 ABI - // Guess that the right implicit argument is at offset 24 after - // the explicit arguments. In the future, should be able to read - // the offset from msgpack. Clang is not annotating it at present. 
- uint64_t Offset = - sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); - if ((Offset + 8) > (ArgPool->kernarg_segment_size)) { - DP("Bad offset of hostcall, exceeds kernarg segment size\n"); - } else { - memcpy(static_cast(kernarg) + Offset, &buffer, 8); - } - } + memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize); - // initialise pointer for implicit_argument_count == 0 ABI - impl_args->hostcall_ptr = buffer; - } + DP("After loading global for %s ExecMode = %d\n", ExecModeName, + ExecModeVal); - packet->kernarg_address = kernarg; + if (ExecModeVal < 0 || + ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { + DP("Error wrong exec_mode value specified in HSA code object file: " + "%d\n", + ExecModeVal); + return NULL; + } + } else { + DP("Loading global exec_mode '%s' - symbol missing, using default " + "value " + "GENERIC (1)\n", + ExecModeName); } + check("Loading computation property", err); - hsa_signal_t s = DeviceInfo.FreeSignalPool.pop(); - if (s.handle == 0) { - DP("Failed to get signal instance\n"); - return OFFLOAD_FAIL; - } - packet->completion_signal = s; - hsa_signal_store_relaxed(packet->completion_signal, 1); + KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, + CallStackAddr, e->name, kernarg_segment_size, + DeviceInfo.KernArgPool)); + __tgt_offload_entry entry = *e; + entry.addr = (void *)&KernelsList.back(); + DeviceInfo.addOffloadEntry(device_id, entry); + DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); + } - // Publish the packet indicating it is ready to be processed - core::packet_store_release(reinterpret_cast(packet), - core::create_header(), packet->setup); + return DeviceInfo.getOffloadEntriesTable(device_id); +} - // Since the packet is already published, its contents must not be - // accessed any more - hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); +void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) { + void *ptr = NULL; + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); - while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) - ; + if (kind != TARGET_ALLOC_DEFAULT) { + REPORT("Invalid target data allocation kind or requested allocator not " + "implemented yet\n"); + return NULL; + } - assert(ArgPool); - ArgPool->deallocate(kernarg); - DeviceInfo.FreeSignalPool.push(s); + hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(device_id); + hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, &ptr); + DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size, + (long long unsigned)(Elf64_Addr)ptr); + ptr = (err == HSA_STATUS_SUCCESS) ? 
ptr : NULL; + return ptr; +} + +int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + __tgt_async_info AsyncInfo; + int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &AsyncInfo); +} + +int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, + int64_t size, __tgt_async_info *AsyncInfo) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + if (AsyncInfo) { + initAsyncInfo(AsyncInfo); + return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo); + } else { + return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size); } +} - DP("Kernel completed\n"); +int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + __tgt_async_info AsyncInfo; + int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &AsyncInfo); +} + +int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr, + void *tgt_ptr, int64_t size, + __tgt_async_info *AsyncInfo) { + assert(AsyncInfo && "AsyncInfo is nullptr"); + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + initAsyncInfo(AsyncInfo); + return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo); +} + +int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { + assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); + hsa_status_t err; + DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr); + err = core::Runtime::Memfree(tgt_ptr); + if (err != HSA_STATUS_SUCCESS) { + DP("Error when freeing CUDA memory\n"); + return OFFLOAD_FAIL; + } return OFFLOAD_SUCCESS; } +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t num_teams, + int32_t thread_limit, + uint64_t loop_tripcount) { + + DeviceInfo.load_run_lock.lock_shared(); + int32_t res = + runRegionNowaitLocked(device_id, tgt_entry_ptr, tgt_args, tgt_offsets, + arg_num, num_teams, thread_limit, loop_tripcount); + + DeviceInfo.load_run_lock.unlock_shared(); + return res; +} + int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { @@ -2244,6 +2240,20 @@ thread_limit, 0); } +int32_t __tgt_rtl_run_target_team_region_async( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams, + int32_t thread_limit, uint64_t loop_tripcount) { + + DeviceInfo.load_run_lock.lock_shared(); + int32_t res = + runRegionNowaitLocked(device_id, tgt_entry_ptr, tgt_args, tgt_offsets, + arg_num, num_teams, thread_limit, loop_tripcount); + + DeviceInfo.load_run_lock.unlock_shared(); + return res; +} + int32_t __tgt_rtl_run_target_region_async(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, @@ -2256,9 +2266,9 @@ // fix thread num int32_t team_num = 1; int32_t thread_limit = 0; // use default - return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, team_num, - thread_limit, 0); + return __tgt_rtl_run_target_team_region_async(device_id, 
tgt_entry_ptr, + tgt_args, tgt_offsets, arg_num, + team_num, thread_limit, 0); } int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) {